├── PCA.ipynb └── README.md /PCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Principal Component Analysis(PCA)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import matplotlib.pyplot as plt\n", 17 | "import numpy as np\n", 18 | "import pandas as pd\n", 19 | "%matplotlib inline" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from sklearn.datasets import load_breast_cancer" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "cancer=load_breast_cancer()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])" 49 | ] 50 | }, 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "cancer.keys()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Breast Cancer Wisconsin (Diagnostic) Database\n", 70 | "=============================================\n", 71 | "\n", 72 | "Notes\n", 73 | "-----\n", 74 | "Data Set Characteristics:\n", 75 | " :Number of Instances: 569\n", 76 | "\n", 77 | " :Number of Attributes: 30 numeric, predictive attributes and the class\n", 78 | "\n", 79 | " :Attribute Information:\n", 80 | " - radius (mean of distances from center to points on the perimeter)\n", 81 | " - texture (standard deviation of gray-scale values)\n", 82 | " - perimeter\n", 83 | " - area\n", 84 
| " - smoothness (local variation in radius lengths)\n", 85 | " - compactness (perimeter^2 / area - 1.0)\n", 86 | " - concavity (severity of concave portions of the contour)\n", 87 | " - concave points (number of concave portions of the contour)\n", 88 | " - symmetry \n", 89 | " - fractal dimension (\"coastline approximation\" - 1)\n", 90 | "\n", 91 | " The mean, standard error, and \"worst\" or largest (mean of the three\n", 92 | " largest values) of these features were computed for each image,\n", 93 | " resulting in 30 features. For instance, field 3 is Mean Radius, field\n", 94 | " 13 is Radius SE, field 23 is Worst Radius.\n", 95 | "\n", 96 | " - class:\n", 97 | " - WDBC-Malignant\n", 98 | " - WDBC-Benign\n", 99 | "\n", 100 | " :Summary Statistics:\n", 101 | "\n", 102 | " ===================================== ====== ======\n", 103 | " Min Max\n", 104 | " ===================================== ====== ======\n", 105 | " radius (mean): 6.981 28.11\n", 106 | " texture (mean): 9.71 39.28\n", 107 | " perimeter (mean): 43.79 188.5\n", 108 | " area (mean): 143.5 2501.0\n", 109 | " smoothness (mean): 0.053 0.163\n", 110 | " compactness (mean): 0.019 0.345\n", 111 | " concavity (mean): 0.0 0.427\n", 112 | " concave points (mean): 0.0 0.201\n", 113 | " symmetry (mean): 0.106 0.304\n", 114 | " fractal dimension (mean): 0.05 0.097\n", 115 | " radius (standard error): 0.112 2.873\n", 116 | " texture (standard error): 0.36 4.885\n", 117 | " perimeter (standard error): 0.757 21.98\n", 118 | " area (standard error): 6.802 542.2\n", 119 | " smoothness (standard error): 0.002 0.031\n", 120 | " compactness (standard error): 0.002 0.135\n", 121 | " concavity (standard error): 0.0 0.396\n", 122 | " concave points (standard error): 0.0 0.053\n", 123 | " symmetry (standard error): 0.008 0.079\n", 124 | " fractal dimension (standard error): 0.001 0.03\n", 125 | " radius (worst): 7.93 36.04\n", 126 | " texture (worst): 12.02 49.54\n", 127 | " perimeter (worst): 50.41 251.2\n", 128 | " 
area (worst): 185.2 4254.0\n", 129 | " smoothness (worst): 0.071 0.223\n", 130 | " compactness (worst): 0.027 1.058\n", 131 | " concavity (worst): 0.0 1.252\n", 132 | " concave points (worst): 0.0 0.291\n", 133 | " symmetry (worst): 0.156 0.664\n", 134 | " fractal dimension (worst): 0.055 0.208\n", 135 | " ===================================== ====== ======\n", 136 | "\n", 137 | " :Missing Attribute Values: None\n", 138 | "\n", 139 | " :Class Distribution: 212 - Malignant, 357 - Benign\n", 140 | "\n", 141 | " :Creator: Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian\n", 142 | "\n", 143 | " :Donor: Nick Street\n", 144 | "\n", 145 | " :Date: November, 1995\n", 146 | "\n", 147 | "This is a copy of UCI ML Breast Cancer Wisconsin (Diagnostic) datasets.\n", 148 | "https://goo.gl/U2Uwz2\n", 149 | "\n", 150 | "Features are computed from a digitized image of a fine needle\n", 151 | "aspirate (FNA) of a breast mass. They describe\n", 152 | "characteristics of the cell nuclei present in the image.\n", 153 | "\n", 154 | "Separating plane described above was obtained using\n", 155 | "Multisurface Method-Tree (MSM-T) [K. P. Bennett, \"Decision Tree\n", 156 | "Construction Via Linear Programming.\" Proceedings of the 4th\n", 157 | "Midwest Artificial Intelligence and Cognitive Science Society,\n", 158 | "pp. 97-101, 1992], a classification method which uses linear\n", 159 | "programming to construct a decision tree. Relevant features\n", 160 | "were selected using an exhaustive search in the space of 1-4\n", 161 | "features and 1-3 separating planes.\n", 162 | "\n", 163 | "The actual linear program used to obtain the separating plane\n", 164 | "in the 3-dimensional space is that described in:\n", 165 | "[K. P. Bennett and O. L. 
Mangasarian: \"Robust Linear\n", 166 | "Programming Discrimination of Two Linearly Inseparable Sets\",\n", 167 | "Optimization Methods and Software 1, 1992, 23-34].\n", 168 | "\n", 169 | "This database is also available through the UW CS ftp server:\n", 170 | "\n", 171 | "ftp ftp.cs.wisc.edu\n", 172 | "cd math-prog/cpo-dataset/machine-learn/WDBC/\n", 173 | "\n", 174 | "References\n", 175 | "----------\n", 176 | " - W.N. Street, W.H. Wolberg and O.L. Mangasarian. Nuclear feature extraction \n", 177 | " for breast tumor diagnosis. IS&T/SPIE 1993 International Symposium on \n", 178 | " Electronic Imaging: Science and Technology, volume 1905, pages 861-870,\n", 179 | " San Jose, CA, 1993.\n", 180 | " - O.L. Mangasarian, W.N. Street and W.H. Wolberg. Breast cancer diagnosis and \n", 181 | " prognosis via linear programming. Operations Research, 43(4), pages 570-577, \n", 182 | " July-August 1995.\n", 183 | " - W.H. Wolberg, W.N. Street, and O.L. Mangasarian. Machine learning techniques\n", 184 | " to diagnose breast cancer from fine-needle aspirates. Cancer Letters 77 (1994) \n", 185 | " 163-171.\n", 186 | "\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "print(cancer['DESCR'])" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 6, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "df=pd.DataFrame(cancer['data'],columns=cancer['feature_names'])" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 7, 206 | "metadata": {}, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/html": [ 211 | "
\n", 212 | "\n", 225 | "\n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst radiusworst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimension
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...25.3817.33184.602019.00.16220.66560.71190.26540.46010.11890
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...24.9923.41158.801956.00.12380.18660.24160.18600.27500.08902
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...23.5725.53152.501709.00.14440.42450.45040.24300.36130.08758
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...14.9126.5098.87567.70.20980.86630.68690.25750.66380.17300
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...22.5416.67152.201575.00.13740.20500.40000.16250.23640.07678
\n", 375 | "

5 rows × 30 columns

\n", 376 | "
" 377 | ], 378 | "text/plain": [ 379 | " mean radius mean texture mean perimeter mean area mean smoothness \\\n", 380 | "0 17.99 10.38 122.80 1001.0 0.11840 \n", 381 | "1 20.57 17.77 132.90 1326.0 0.08474 \n", 382 | "2 19.69 21.25 130.00 1203.0 0.10960 \n", 383 | "3 11.42 20.38 77.58 386.1 0.14250 \n", 384 | "4 20.29 14.34 135.10 1297.0 0.10030 \n", 385 | "\n", 386 | " mean compactness mean concavity mean concave points mean symmetry \\\n", 387 | "0 0.27760 0.3001 0.14710 0.2419 \n", 388 | "1 0.07864 0.0869 0.07017 0.1812 \n", 389 | "2 0.15990 0.1974 0.12790 0.2069 \n", 390 | "3 0.28390 0.2414 0.10520 0.2597 \n", 391 | "4 0.13280 0.1980 0.10430 0.1809 \n", 392 | "\n", 393 | " mean fractal dimension ... worst radius \\\n", 394 | "0 0.07871 ... 25.38 \n", 395 | "1 0.05667 ... 24.99 \n", 396 | "2 0.05999 ... 23.57 \n", 397 | "3 0.09744 ... 14.91 \n", 398 | "4 0.05883 ... 22.54 \n", 399 | "\n", 400 | " worst texture worst perimeter worst area worst smoothness \\\n", 401 | "0 17.33 184.60 2019.0 0.1622 \n", 402 | "1 23.41 158.80 1956.0 0.1238 \n", 403 | "2 25.53 152.50 1709.0 0.1444 \n", 404 | "3 26.50 98.87 567.7 0.2098 \n", 405 | "4 16.67 152.20 1575.0 0.1374 \n", 406 | "\n", 407 | " worst compactness worst concavity worst concave points worst symmetry \\\n", 408 | "0 0.6656 0.7119 0.2654 0.4601 \n", 409 | "1 0.1866 0.2416 0.1860 0.2750 \n", 410 | "2 0.4245 0.4504 0.2430 0.3613 \n", 411 | "3 0.8663 0.6869 0.2575 0.6638 \n", 412 | "4 0.2050 0.4000 0.1625 0.2364 \n", 413 | "\n", 414 | " worst fractal dimension \n", 415 | "0 0.11890 \n", 416 | "1 0.08902 \n", 417 | "2 0.08758 \n", 418 | "3 0.17300 \n", 419 | "4 0.07678 \n", 420 | "\n", 421 | "[5 rows x 30 columns]" 422 | ] 423 | }, 424 | "execution_count": 7, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "df.head(5)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 8, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "from 
sklearn.preprocessing import MinMaxScaler" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 9, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "from sklearn.preprocessing import StandardScaler" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 10, 454 | "metadata": {}, 455 | "outputs": [ 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "StandardScaler(copy=True, with_mean=True, with_std=True)" 460 | ] 461 | }, 462 | "execution_count": 10, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "scaler=StandardScaler()\n", 469 | "scaler.fit(df)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 11, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "scaled_data=scaler.transform(df)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 12, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "data": { 488 | "text/plain": [ 489 | "array([[ 1.09706398, -2.07333501, 1.26993369, ..., 2.29607613,\n", 490 | " 2.75062224, 1.93701461],\n", 491 | " [ 1.82982061, -0.35363241, 1.68595471, ..., 1.0870843 ,\n", 492 | " -0.24388967, 0.28118999],\n", 493 | " [ 1.57988811, 0.45618695, 1.56650313, ..., 1.95500035,\n", 494 | " 1.152255 , 0.20139121],\n", 495 | " ...,\n", 496 | " [ 0.70228425, 2.0455738 , 0.67267578, ..., 0.41406869,\n", 497 | " -1.10454895, -0.31840916],\n", 498 | " [ 1.83834103, 2.33645719, 1.98252415, ..., 2.28998549,\n", 499 | " 1.91908301, 2.21963528],\n", 500 | " [-1.80840125, 1.22179204, -1.81438851, ..., -1.74506282,\n", 501 | " -0.04813821, -0.75120669]])" 502 | ] 503 | }, 504 | "execution_count": 12, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "scaled_data" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 13, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "from 
sklearn.decomposition import PCA" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 14, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "pca=PCA(n_components=2)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 15, 534 | "metadata": {}, 535 | "outputs": [ 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,\n", 540 | " svd_solver='auto', tol=0.0, whiten=False)" 541 | ] 542 | }, 543 | "execution_count": 15, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "pca.fit(scaled_data)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 16, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "x_pca=pca.transform(scaled_data)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 17, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "data": { 568 | "text/plain": [ 569 | "(569, 30)" 570 | ] 571 | }, 572 | "execution_count": 17, 573 | "metadata": {}, 574 | "output_type": "execute_result" 575 | } 576 | ], 577 | "source": [ 578 | "scaled_data.shape" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 18, 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "data": { 588 | "text/plain": [ 589 | "(569, 2)" 590 | ] 591 | }, 592 | "execution_count": 18, 593 | "metadata": {}, 594 | "output_type": "execute_result" 595 | } 596 | ], 597 | "source": [ 598 | "x_pca.shape" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 19, 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "data": { 608 | "text/plain": [ 609 | "array([[ 1.09706398, -2.07333501, 1.26993369, ..., 2.29607613,\n", 610 | " 2.75062224, 1.93701461],\n", 611 | " [ 1.82982061, -0.35363241, 1.68595471, ..., 1.0870843 ,\n", 612 | " -0.24388967, 0.28118999],\n", 613 | " [ 1.57988811, 0.45618695, 1.56650313, ..., 
1.95500035,\n", 614 | " 1.152255 , 0.20139121],\n", 615 | " ...,\n", 616 | " [ 0.70228425, 2.0455738 , 0.67267578, ..., 0.41406869,\n", 617 | " -1.10454895, -0.31840916],\n", 618 | " [ 1.83834103, 2.33645719, 1.98252415, ..., 2.28998549,\n", 619 | " 1.91908301, 2.21963528],\n", 620 | " [-1.80840125, 1.22179204, -1.81438851, ..., -1.74506282,\n", 621 | " -0.04813821, -0.75120669]])" 622 | ] 623 | }, 624 | "execution_count": 19, 625 | "metadata": {}, 626 | "output_type": "execute_result" 627 | } 628 | ], 629 | "source": [ 630 | "scaled_data" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 20, 636 | "metadata": {}, 637 | "outputs": [ 638 | { 639 | "data": { 640 | "text/plain": [ 641 | "array([[ 9.19283683, 1.94858307],\n", 642 | " [ 2.3878018 , -3.76817174],\n", 643 | " [ 5.73389628, -1.0751738 ],\n", 644 | " ...,\n", 645 | " [ 1.25617928, -1.90229671],\n", 646 | " [10.37479406, 1.67201011],\n", 647 | " [-5.4752433 , -0.67063679]])" 648 | ] 649 | }, 650 | "execution_count": 20, 651 | "metadata": {}, 652 | "output_type": "execute_result" 653 | } 654 | ], 655 | "source": [ 656 | "x_pca\n" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 21, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/plain": [ 667 | "Text(0,0.5,'Second principal component')" 668 | ] 669 | }, 670 | "execution_count": 21, 671 | "metadata": {}, 672 | "output_type": "execute_result" 673 | }, 674 | { 675 | "data": { 676 | "image/png": "\n", 677 | "text/plain": [ 678 | "" 679 | ] 680 | }, 681 | "metadata": {}, 682 | "output_type": "display_data" 683 | } 684 | ], 685 | "source": [ 686 | "plt.figure(figsize=(8,6))\n", 687 | "plt.scatter(x_pca[:,0],x_pca[:,1],c=cancer['target'])\n", 688 | "plt.xlabel('First principal component')\n", 689 | "plt.ylabel('Second principal component')\n", 690 | "\n", 691 | "\n", 692 | "\n" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 
699 | "outputs": [], 700 | "source": [] 701 | } 702 | ], 703 | "metadata": { 704 | "kernelspec": { 705 | "display_name": "Python 3", 706 | "language": "python", 707 | "name": "python3" 708 | }, 709 | "language_info": { 710 | "codemirror_mode": { 711 | "name": "ipython", 712 | "version": 3 713 | }, 714 | "file_extension": ".py", 715 | "mimetype": "text/x-python", 716 | "name": "python", 717 | "nbconvert_exporter": "python", 718 | "pygments_lexer": "ipython3", 719 | "version": "3.6.4" 720 | } 721 | }, 722 | "nbformat": 4, 723 | "nbformat_minor": 2 724 | } 725 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Principal-Component-Analysis --------------------------------------------------------------------------------