├── Chapter_10_sec_10.9.ipynb ├── Chapter_11_sec_11.8.ipynb ├── Chapter_12_sec_12.5.ipynb ├── Chapter_13_sec_13.6.ipynb ├── Chapter_2_sec_3.1_3.5.ipynb ├── Chapter_3_sec_6.1_6.7.ipynb ├── Chapter_4_sec_7.1_7.7.ipynb ├── Chapter_5_sec_3.1_3.4.ipynb ├── Chapter_6_sec_6.5.ipynb ├── Chapter_7_sec_7.8.ipynb ├── Chapter_8_sec_8.3.ipynb ├── Chapter_9_sec_9.6.ipynb ├── ISLR_v1.pdf ├── ISLR_v2_2021_Nov.pdf ├── README.md └── data ├── Auto.csv ├── Bikeshare.csv ├── Boston.csv ├── BrainCancer.csv ├── Caravan.csv ├── Carseats.csv ├── College.csv ├── Default.csv ├── Fund.csv ├── Hitters.csv ├── Khan.json ├── Khan.rda ├── NCI60.json ├── NCI60.rda ├── NCI60_data.csv ├── NCI60_labs.csv ├── OJ.csv ├── Portfolio.csv ├── Publication.csv ├── Readme_datalist ├── Smarket.csv ├── USArrests.csv ├── Wage.csv ├── Weekly.csv └── dog_test.jpg /Chapter_11_sec_11.8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 11.8 Lab: Survival Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "\n", 20 | "from lifelines import KaplanMeierFitter\n", 21 | "from lifelines.statistics import logrank_test\n", 22 | "from lifelines import CoxPHFitter\n", 23 | "\n", 24 | "%matplotlib inline" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "\"\"\" \n", 34 | "I am also new to this topic, so let us learn those concept together. Feedbacks are welcome.\n", 35 | "Survival analysis is a statistical method used to estimate the survival function of a population.\n", 36 | "These arise in the analysis of a unique kind of outcome variable: the analysis time until an event occurs.\n", 37 | "\"\"\"" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## 11.8.1 Brain Cancer Data" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "BrainCancer = pd.read_csv('data/BrainCancer.csv', header=0)\n", 54 | "# use some options in .describe() to get a quick overview of the data\n", 55 | "BrainCancer.describe(include = 'object')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "BrainCancer.head()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# I did a bit of google search and found the package lifelines.\n", 74 | "# % pip install lifelines" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# create a kmf object\n", 84 | "kmf = KaplanMeierFitter() " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# fit the data into the model\n", 94 | "kmf.fit(BrainCancer.time, BrainCancer.status,label='Kaplan Meier Estimate')\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# create an estimate\n", 104 | "kmf.plot(ci_show=True) ## ci_show is meant for Confidence 
interval, which is the shaded area in the plot." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "kmf1 = KaplanMeierFitter() ## instantiate the class to create an object\n", 114 | "\n", 115 | "## two cohorts are compared: Cohort 1 = Female, Cohort 2 = Male \n", 116 | "groups = BrainCancer['sex'] \n", 117 | "T = BrainCancer.time\n", 118 | "E = BrainCancer.status \n", 119 | "i1 = (groups == 'Female') \n", 120 | "i2 = (groups == 'Male') \n", 121 | "\n", 122 | "\n", 123 | "## fit the model for the 1st cohort\n", 124 | "kmf1.fit(T[i1], E[i1], label='Female')\n", 125 | "a1 = kmf1.plot()\n", 126 | "\n", 127 | "## fit the model for the 2nd cohort\n", 128 | "kmf1.fit(T[i2], E[i2], label='Male')\n", 129 | "kmf1.plot(ax=a1)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# we can perform a log-rank test to compare the survival of males to females.\n", 139 | "results = logrank_test(T[i1], T[i2], event_observed_A=E[i1], event_observed_B=E[i2])\n", 140 | "results.print_summary()\n", 141 | "\"\"\" \n", 142 | "The resulting p-value is 0.23, indicating no evidence of a difference in survival between the two sexes.\n", 143 | "This can also be seen from the overlapping confidence intervals. \n", 144 | "\"\"\"" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "df_dummy = pd.get_dummies(BrainCancer, drop_first=True)\n", 154 | "df_dummy.head()\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# use the Cox Proportional Hazards model\n", 164 | "cph1 = CoxPHFitter() \n", 165 | "cph1.fit(df_dummy[['status', 'sex_Male', 'time']], 'time', event_col='status') \n", 166 | "cph1.print_summary()\n", 167 | "\"\"\" \n", 168 | "I was not able to use the optional 'formula' argument of fit() due to compatibility issues with the newer version; instead, I sliced the dataframe to contain only the 3 columns ('status', 'sex_Male', 'time').\n", 169 | "Here the p-value is 0.233. Regardless of which test we use, we see that there is no clear evidence for a difference in survival between males and females.\n", 170 | "\"\"\"" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# use the Cox Proportional Hazards model with more features \n", 180 | "cph2 = CoxPHFitter() \n", 181 | "cph2.fit(df_dummy, 'time', event_col='status') \n", 182 | "cph2.print_summary()\n", 183 | "\"\"\" \n", 184 | "After adjusting for the other predictors, larger values of the Karnofsky index, ki, are associated with lower risk, \n", 185 | "i.e. 
longer survival.\n", 186 | "\"\"\"" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## 11.8.2 Publication Data" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "Publication = pd.read_csv('data/Publication.csv', header=0)\n", 203 | "Publication.head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "kmf1 = KaplanMeierFitter() ## instantiate the class to create an object\n", 213 | "\n", 214 | "## two cohorts are compared: Cohort 1 = posres 0 (negative result), Cohort 2 = posres 1 (positive result)\n", 215 | "groups = Publication['posres'] \n", 216 | "T = Publication.time\n", 217 | "E = Publication.status \n", 218 | "i1 = (groups == 0) \n", 219 | "i2 = (groups == 1) \n", 220 | "\n", 221 | "## fit the model for the 1st cohort\n", 222 | "kmf1.fit(T[i1], E[i1], label='Negative Results')\n", 223 | "a1 = kmf1.plot()\n", 224 | "\n", 225 | "## fit the model for the 2nd cohort\n", 226 | "kmf1.fit(T[i2], E[i2], label='Positive Results')\n", 227 | "kmf1.plot(ax=a1)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# we can perform a log-rank test to compare time until publication for negative- vs. positive-result studies.\n", 237 | "results = logrank_test(T[i1], T[i2], event_observed_A=E[i1], event_observed_B=E[i2])\n", 238 | "results.print_summary()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "df_dummy = pd.get_dummies(Publication, drop_first=True)\n", 248 | "df_dummy.head()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# use the Cox Proportional Hazards model\n", 258 | "cph3 = CoxPHFitter() \n", 259 | "cph3.fit(df_dummy[['status', 'posres', 'time']], 'time', event_col='status') \n", 260 | "cph3.print_summary()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "cph4 = CoxPHFitter() \n", 270 | "cph4.fit(df_dummy[['status', 'posres', 'time', 'multi', 'clinend', 'sampsize', 'budget', 'impact']], 'time', event_col='status') \n", 271 | "cph4.print_summary()\n", 272 | "\"\"\" \n", 273 | "After we control for other features, posres becomes an important factor (well, at least statistically significant).\n", 274 | "We see that there are a number of statistically significant variables, \n", 275 | "including whether the trial focused on a clinical endpoint (clinend), the impact of the study (impact),\n", 276 | "and whether the study had positive or negative results (posres).\n", 277 | "\"\"\"" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "## 11.8.3 Call Center Data" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "np.random.seed(1)\n", 294 | "N = 2000\n", 295 | "Operators = np.random.choice(range(5, 16), N)\n", 296 | "Center = np.random.choice([\"A\", \"B\", \"C\"], N)\n", 297 | "Time = np.random.choice([\"Morn.\", \"After.\", \"Even.\"], N)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | 
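"# aside (my addition, not part of the book's lab): a quick sanity check on the simulated covariates;\n", "# each call center should receive roughly N/3 of the calls\n", "print(pd.Series(Center).value_counts())\n", "\n", 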
"# we generate a similar random data set\n", 307 | "X_pre = pd.DataFrame({\"Operators\": Operators, \"Center\": Center, \"Time\": Time})\n", 308 | "X = pd.get_dummies(X_pre, drop_first=True)\n", 309 | "X.head()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "true_coeff = np.array([0.04, -0.3, 0, 0.2, -0.2])\n", 319 | "# well, I was not able to fully following the simulation in the book. \n", 320 | "# I think the highlevel idea is to use those coefficients to generate a dataset and show the model fit could \n", 321 | "# sucessfully recover the coefficients." 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# this simulation code is wrong. And I will come back and fix it. \n", 331 | "X['y'] = 350*np.exp(np.sum(-X*true_coeff,axis=1).tolist() + np.random.normal(0, 0.005, N))\n", 332 | "X['answered'] = np.where( X['y'] < 300 , 1, 0)\n", 333 | "X.head()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "# use Cox Proportional Hazards model\n", 343 | "cph5 = CoxPHFitter() \n", 344 | "cph5.fit(X, 'y', event_col='answered') \n", 345 | "cph5.print_summary()\n", 346 | "\"\"\" \n", 347 | "Since the simulation is wrong, so the summary is not correct. \n", 348 | "But at least we can see the p-values for Operatator, Center = B, Time = Even. and Time = Morn are very small, \n", 349 | "and they are directly related to the ground truth coefficients.\n", 350 | "\"\"\"" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "# End of Chapter 11" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | "interpreter": { 365 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 366 | }, 367 | "kernelspec": { 368 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.6.2" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 2 386 | } 387 | -------------------------------------------------------------------------------- /Chapter_12_sec_12.5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.5 Lab: Unsupervised Learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from numpy.linalg import svd\n", 19 | "import matplotlib as mpl\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "\n", 22 | "from sklearn.preprocessing import StandardScaler\n", 23 | "from sklearn.decomposition import PCA\n", 24 | "from sklearn.cluster import KMeans\n", 25 | "from scipy.cluster import hierarchy\n", 26 | "\n", 27 | "%matplotlib inline\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## 12.5.1 Principal Components Analysis" 35 | ] 36 | }, 37 | { 38 | "cell_type": 
"code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "USArrests = pd.read_csv('./data/USArrests.csv', index_col=0)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "USArrests.head()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# pandas has a built-in function to get the mean and variance of each column\n", 62 | "print(USArrests.mean())\n", 63 | "print(USArrests.var())" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "sc = StandardScaler()\n", 73 | "X = pd.DataFrame(sc.fit_transform(USArrests), index=USArrests.index, columns=USArrests.columns)\n", 74 | "# The loading vectors (i.e. these are the projection of the data onto the principal components)\n", 75 | "pca_loadings = pd.DataFrame(PCA().fit(X).components_.T, index=USArrests.columns, columns=['V1', 'V2', 'V3', 'V4'])\n", 76 | "pca_loadings\n", 77 | "\n", 78 | "\"\"\" \n", 79 | "Depends on the version of python/module, you may see a flipped loading vector in signs. \n", 80 | "This is normal because the orientation of the principal components is not deterministic. \n", 81 | "\"\"\"\n", 82 | "# X1=pd.DataFrame(sc.inverse_transform(X), index=USArrests.index, columns=USArrests.columns)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# fit the PCA model and transform X to get the principal components\n", 92 | "pca = PCA()\n", 93 | "df_plot = pd.DataFrame(pca.fit_transform(X), columns=['PC1', 'PC2', 'PC3', 'PC4'], index=X.index)\n", 94 | "df_plot" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "fig , ax1 = plt.subplots(figsize=(9,7))\n", 104 | "\n", 105 | "ax1.set_xlim(-3.5,3.5)\n", 106 | "ax1.set_ylim(-3.5,3.5)\n", 107 | "\n", 108 | "# plot Principal Components 1 and 2\n", 109 | "for i in df_plot.index:\n", 110 | " ax1.annotate(i, (df_plot.PC1.loc[i], -df_plot.PC2.loc[i]), ha='center')\n", 111 | "\n", 112 | "# plot reference lines\n", 113 | "ax1.hlines(0,-3.5,3.5, linestyles='dotted', colors='grey')\n", 114 | "ax1.vlines(0,-3.5,3.5, linestyles='dotted', colors='grey')\n", 115 | "\n", 116 | "ax1.set_xlabel('First Principal Component')\n", 117 | "ax1.set_ylabel('Second Principal Component')\n", 118 | " \n", 119 | "# plot Principal Component loading vectors, using a second y-axis.\n", 120 | "ax2 = ax1.twinx().twiny() \n", 121 | "\n", 122 | "ax2.set_ylim(-1,1)\n", 123 | "ax2.set_xlim(-1,1)\n", 124 | "ax2.tick_params(axis='y', colors='orange')\n", 125 | "ax2.set_xlabel('Principal Component loading vectors', color='orange')\n", 126 | "\n", 127 | "# plot labels for vectors. 
Variable 'a' is a small offset parameter to separate arrow tip and text.\n", 128 | "a = 1.07 \n", 129 | "for i in pca_loadings[['V1', 'V2']].index:\n", 130 | " ax2.annotate(i, (pca_loadings.V1.loc[i]*a, -pca_loadings.V2.loc[i]*a), color='orange')\n", 131 | "\n", 132 | "# plot vectors\n", 133 | "ax2.arrow(0,0,pca_loadings.V1[0], -pca_loadings.V2[0])\n", 134 | "ax2.arrow(0,0,pca_loadings.V1[1], -pca_loadings.V2[1])\n", 135 | "ax2.arrow(0,0,pca_loadings.V1[2], -pca_loadings.V2[2])\n", 136 | "ax2.arrow(0,0,pca_loadings.V1[3], -pca_loadings.V2[3])\n", 137 | "plt.show()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# in previous chapter, we talked about PCR. In that case, we could use the downstream task's (i.e. regression RMSE) \n", 147 | "# performance to select the hyperparameters (i.e. # number of PCs).\n", 148 | "# here let us use the portion of explained variance to select the number of PCs. Those info is available in the pca object.\n", 149 | "print(pca.explained_variance_)\n", 150 | "print(pca.explained_variance_ratio_)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "plt.figure(figsize=(7,5))\n", 160 | "\n", 161 | "plt.plot([1,2,3,4], pca.explained_variance_ratio_, '-o', label='Individual component')\n", 162 | "plt.plot([1,2,3,4], np.cumsum(pca.explained_variance_ratio_), '-s', label='Cumulative')\n", 163 | "\n", 164 | "plt.ylabel('Proportion of Variance Explained')\n", 165 | "plt.xlabel('Principal Component')\n", 166 | "plt.xlim(0.75,4.25)\n", 167 | "plt.ylim(0,1.05)\n", 168 | "plt.xticks([1,2,3,4])\n", 169 | "plt.legend(loc=2)\n", 170 | "plt.show()\n", 171 | "\n", 172 | "\"\"\"\n", 173 | "In this case, if we want to preserve 80% of variance of the data, we need to select 2 PCs.\n", 174 | "\"\"\"" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## 12.5.2 Matrix Completion" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "\"\"\"\n", 191 | "I am happy seeing this lab added. 
SVD seems pretty heavy in theory/math, but this has lots of application in real problems, \n", 192 | "such as recommendation systems, clustering, outlier smoothing, and so on.\n", 193 | "\"\"\"" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "# run sigular value decomposition on the data (SVD)\n", 203 | "u, s, vh = svd(X, full_matrices=False)\n", 204 | "u.shape, s.shape, vh.shape" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# this vh will be the principal components similar to pca.components_ (up to an unimportant sign flip)\n", 214 | "# The matrix u is equivalent to the matrix of standardized scores, and the standard deviations are in the vector s.\n", 215 | "print(vh)\n", 216 | "print ('-------')\n", 217 | "print(pca.components_)\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# reconstruction based on full SVD\n", 227 | "np.allclose(X, np.dot(u * s, vh))" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# reconstruction based on reduced SVD\n", 237 | "num_components = 3\n", 238 | "recovered = pd.DataFrame(np.dot(u[:, :num_components] * s[:num_components,], vh[:num_components,:]))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "print(recovered.head(n=2))\n", 248 | "print(X.head(n=2))\n", 249 | "\n", 250 | "\"\"\"\n", 251 | "Change the num_components from 1 to 4 and see how the reconstruction error changes.\n", 252 | "\"\"\"" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## 12.5.3 Clustering" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "K-means clustering " 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "# generate data\n", 276 | "np.random.seed(21)\n", 277 | "X = np.random.standard_normal((50,2))\n", 278 | "X[:25,0] = X[:25,0]+3\n", 279 | "X[:25,1] = X[:25,1]-4" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "n_clusters = 2\n", 289 | "km1 = KMeans(n_clusters=n_clusters, n_init=20)\n", 290 | "km1.fit(X)\n", 291 | "\n", 292 | "n_clusters = 3\n", 293 | "km2 = KMeans(n_clusters=n_clusters, n_init=20)\n", 294 | "km2.fit(X)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "print(km1.labels_)\n", 304 | "print(dir(km1)) # we can use dir to see other saved attributes" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,5))\n", 314 | "\n", 315 | "ax1.scatter(X[:,0], X[:,1], s=40, c=km1.labels_, cmap=plt.cm.prism) \n", 316 | "ax1.set_title('K-Means Clustering Results with K=2')\n", 317 | "ax1.scatter(km1.cluster_centers_[:,0], km1.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)\n", 318 | "\n", 319 | 
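"# aside (my addition): km1.inertia_ stores the total within-cluster sum of squares that\n", "# K-means minimizes; comparing it across several values of K gives the usual elbow plot\n", 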
"ax2.scatter(X[:,0], X[:,1], s=40, c=km2.labels_, cmap=plt.cm.prism) \n", 320 | "ax2.set_title('K-Means Clustering Results with K=3')\n", 321 | "ax2.scatter(km2.cluster_centers_[:,0], km2.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)\n", 322 | "plt.show()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "Hierarchical Clustering" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))\n", 339 | "\n", 340 | "for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'],\n", 341 | " [ax1,ax2,ax3]):\n", 342 | " cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)\n", 343 | "\n", 344 | "ax1.set_title('Complete Linkage')\n", 345 | "ax2.set_title('Average Linkage')\n", 346 | "ax3.set_title('Single Linkage')\n", 347 | "plt.show()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## 12.5.4 NCI60 Data Example" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "PCA on the NCI60 Data" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "# I was not able to make the json work, so I went back to R and saved the data and label separately.\n", 371 | "X = pd.read_csv('./data/NCI60_data.csv')\n", 372 | "y = pd.read_csv('./data/NCI60_labs.csv')" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "pca2 = PCA()\n", 382 | "X_standardized = StandardScaler().fit_transform(X)\n", 383 | "df2_plot = pd.DataFrame(pca2.fit_transform(X_standardized))" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,6))\n", 393 | "\n", 394 | "color_idx = pd.factorize(y.iloc[:, 0])[0]\n", 395 | "cmap = plt.cm.hsv\n", 396 | "\n", 397 | "# left plot\n", 398 | "ax1.scatter(df2_plot.iloc[:,0], -df2_plot.iloc[:,1], c=color_idx, cmap=cmap, alpha=0.5, s=50)\n", 399 | "ax1.set_ylabel('Principal Component 2')\n", 400 | "\n", 401 | "# right plot\n", 402 | "ax2.scatter(df2_plot.iloc[:,0], df2_plot.iloc[:,2], c=color_idx, cmap=cmap, alpha=0.5, s=50)\n", 403 | "ax2.set_ylabel('Principal Component 3')\n", 404 | "\n", 405 | "# custom legend for the classes (y) since we do not create scatter plots per class (which could have their own labels).\n", 406 | "handles = []\n", 407 | "labels = pd.factorize(y.iloc[:, 0].unique())\n", 408 | "norm = mpl.colors.Normalize(vmin=0.0, vmax=14.0)\n", 409 | "\n", 410 | "for i, v in zip(labels[0], labels[1]):\n", 411 | " handles.append(mpl.patches.Patch(color=cmap(norm(i)), label=v, alpha=0.5))\n", 412 | "\n", 413 | "ax2.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 414 | "\n", 415 | "# xlabel for both plots\n", 416 | "for ax in fig.axes:\n", 417 | " ax.set_xlabel('Principal Component 1') " 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "pd.DataFrame([df2_plot.iloc[:,:5].std(axis=0, ddof=0).array,\n", 427 | " pca2.explained_variance_ratio_[:5],\n", 428 | " 
np.cumsum(pca2.explained_variance_ratio_[:5])],\n", 429 | " index=['Standard Deviation', 'Proportion of Variance', 'Cumulative Proportion'],\n", 430 | " columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "df2_plot.iloc[:,:10].var(axis=0, ddof=0).plot(kind='bar', rot=0)\n", 440 | "plt.ylabel('Variances')\n", 441 | "plt.show()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "fig , (ax1,ax2) = plt.subplots(1,2, figsize=(15,5))\n", 451 | "\n", 452 | "# left plot\n", 453 | "ax1.plot(pca2.explained_variance_ratio_, '-o')\n", 454 | "ax1.set_ylabel('Proportion of Variance Explained')\n", 455 | "ax1.set_ylim(ymin=-0.01)\n", 456 | "\n", 457 | "# right plot\n", 458 | "ax2.plot(np.cumsum(pca2.explained_variance_ratio_), '-ro')\n", 459 | "ax2.set_ylabel('Cumulative Proportion of Variance Explained')\n", 460 | "ax2.set_ylim(ymax=1.05)\n", 461 | "\n", 462 | "for ax in fig.axes:\n", 463 | " ax.set_xlabel('Principal Component')\n", 464 | " ax.set_xlim(-1,65) " 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "Clustering the Observations of the NCI60 Data" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "sc = StandardScaler()\n", 481 | "X_standardized = pd.DataFrame(sc.fit_transform(X), index=y.iloc[:, 0], columns=X.columns)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(20,20))\n", 491 | "\n", 492 | "for linkage, cluster, ax in zip([hierarchy.complete(X_standardized), hierarchy.average(X), hierarchy.single(X_standardized)],\n", 493 | " ['c1','c2','c3'],\n", 494 | " [ax1,ax2,ax3]):\n", 495 | " cluster = hierarchy.dendrogram(linkage, labels=X_standardized.index, orientation='right', color_threshold=0, leaf_font_size=10, ax=ax)\n", 496 | "\n", 497 | "ax1.set_title('Complete Linkage')\n", 498 | "ax2.set_title('Average Linkage')\n", 499 | "ax3.set_title('Single Linkage')\n", 500 | "plt.show()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "plt.figure(figsize=(10,20))\n", 510 | "cut4 = hierarchy.dendrogram(hierarchy.complete(X_standardized),\n", 511 | " labels=X_standardized.index, orientation='right', color_threshold=140, leaf_font_size=10)\n", 512 | "plt.vlines(140,0,plt.gca().yaxis.get_data_interval()[1], colors='r', linestyles='dashed')\n", 513 | "plt.show()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "Kmeans" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "np.random.seed(21)\n", 530 | "km3 = KMeans(n_clusters=4, n_init=50)\n", 531 | "km3.fit(X_standardized)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "km3.labels_" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "Combine with PCA" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | 
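"execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a sketch of my own (not in the original notebook): rather than clustering on the full\n", "# standardized data, run K-means on the first five principal component score vectors\n", "km4 = KMeans(n_clusters=4, n_init=50)\n", "km4.fit(df2_plot.iloc[:, :5])\n", "print(km4.labels_)  # compare with km3.labels_ obtained on the full standardized data" ] }, { "cell_type": "code", 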
"execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "plt.figure(figsize=(10,20))\n", 557 | "pca_cluster = hierarchy.dendrogram(hierarchy.complete(X_standardized), labels=X_standardized.index,\n", 558 | "orientation='right', color_threshold=100, leaf_font_size=10)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "# Hierarchy based on Principal Components 1 to 5\n", 568 | "plt.figure(figsize=(10,20))\n", 569 | "pca_cluster = hierarchy.dendrogram(hierarchy.complete(df2_plot.iloc[:,:5]), labels=X_standardized.index,\n", 570 | "orientation='right', color_threshold=100, leaf_font_size=10)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "# End of Chapter 12" 580 | ] 581 | } 582 | ], 583 | "metadata": { 584 | "interpreter": { 585 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 586 | }, 587 | "kernelspec": { 588 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 589 | "name": "python3" 590 | }, 591 | "language_info": { 592 | "codemirror_mode": { 593 | "name": "ipython", 594 | "version": 3 595 | }, 596 | "file_extension": ".py", 597 | "mimetype": "text/x-python", 598 | "name": "python", 599 | "nbconvert_exporter": "python", 600 | "pygments_lexer": "ipython3", 601 | "version": "3.6.2" 602 | } 603 | }, 604 | "nbformat": 4, 605 | "nbformat_minor": 2 606 | } 607 | -------------------------------------------------------------------------------- /Chapter_13_sec_13.6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 13.6 Lab: Multiple Testing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "from scipy import stats as st\n", 20 | "from sklearn.metrics import confusion_matrix\n", 21 | "from statsmodels.sandbox.stats.multicomp import multipletests\n", 22 | "from statsmodels.stats.multicomp import pairwise_tukeyhsd\n", 23 | "from statsmodels.sandbox.stats.multicomp import TukeyHSDResults\n", 24 | "\n", 25 | "import json\n", 26 | "\n", 27 | "%matplotlib inline\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## 13.6.1 Review of Hypothesis Tests" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "np.random.seed(21)\n", 44 | "X = np.random.normal(loc=0.0, scale=1.0, size=(10, 100))\n", 45 | "offset = 0.5\n", 46 | "X[:,:50] = X[:,:50] + offset" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# here I used scipy. During google search, I came across bioinfokit module, could explore more. 
\n", 56 | "result=st.ttest_1samp(a = X[:, 0], popmean = 0)\n", 57 | "print(result.pvalue)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# let us run the same t-test for all 100 columns\n", 67 | "p_values = []\n", 68 | "decision = []\n", 69 | "for i in range(100):\n", 70 | " result=st.ttest_1samp(a = X[:, i], popmean = 0)\n", 71 | " p_values.append(result.pvalue)\n", 72 | " if result.pvalue < 0.05:\n", 73 | " decision.append('Reject H0')\n", 74 | " else:\n", 75 | " decision.append('Do not reject H0')\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# after computing the p-values, we can use the ground truth to evaluate the performance\n", 85 | "ground_truth = np.repeat(['Reject H0', 'Do not reject H0'], [50, 50], axis=0)\n", 86 | "labels = ['Reject H0', 'Do not reject H0']\n", 87 | "cm = confusion_matrix (ground_truth, decision, labels=labels)\n", 88 | "print(cm)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "fig = plt.figure()\n", 98 | "ax = fig.add_subplot(111)\n", 99 | "cax = ax.matshow(cm)\n", 100 | "fig.colorbar(cax)\n", 101 | "ax.set_xticklabels([''] + labels)\n", 102 | "ax.set_yticklabels([''] + labels)\n", 103 | "plt.xlabel('One sample t-test')\n", 104 | "plt.ylabel('Ground truth')\n", 105 | "plt.show()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# we could make the offset larger (from 0.5 to 1) and see the change to the confusion matrix\n", 115 | "offset = 1\n", 116 | "X[:,:50] = X[:,:50] + offset\n", 117 | "\n", 118 | "p_values = []\n", 119 | "decision = []\n", 120 | "for i in range(100):\n", 121 | " result=st.ttest_1samp(a = X[:, i], popmean = 0)\n", 122 | " p_values.append(result.pvalue)\n", 123 | " if result.pvalue < 0.05:\n", 124 | " decision.append('Reject H0')\n", 125 | " else:\n", 126 | " decision.append('Do not reject H0')\n", 127 | "\n", 128 | "\n", 129 | "ground_truth = np.repeat(['Reject H0', 'Do not reject H0'], [50, 50], axis=0)\n", 130 | "labels = ['Reject H0', 'Do not reject H0']\n", 131 | "cm = confusion_matrix (ground_truth, decision, labels=labels)\n", 132 | "print(cm)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## 13.6.2 The Family-Wise Error Rate" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "m = range(500)\n", 149 | "fwe1 = list(map(lambda x:1 - pow(1 - 0.05,x),m))\n", 150 | "fwe2 = list(map(lambda x:1 - pow(1 - 0.01,x),m))\n", 151 | "fwe3 = list(map(lambda x:1 - pow(1 - 0.001,x),m))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "plt.plot(m, fwe1, label = \"0.05\")\n", 161 | "plt.plot(m, fwe2, label = \"0.01\")\n", 162 | "plt.plot(m, fwe3, label = \"0.001\")\n", 163 | "plt.xlabel('Number of tests in log scale')\n", 164 | "plt.ylabel('FWE')\n", 165 | "plt.xscale(\"log\")\n", 166 | "plt.legend()\n", 167 | "plt.show()\n", 168 | "\n", 169 | "\"\"\" \n", 170 | "We see that setting α = 0.05 results in a high FWER even for moderate m. 
\n", 171 | "With α = 0.01, we can test no more than five null hypotheses before the FWER exceeds 0.05. \n", 172 | "Only for very small values, such as α = 0.001, do we manage to ensure a small FWER, \n", 173 | "at least for moderately-sized m.\n", 174 | "\n", 175 | "Of course, the problem with setting α to such a low value is that we are likely to \n", 176 | "make a number of Type II errors: in other words, our power is very low.\n", 177 | "\"\"\"" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "Fund = pd.read_csv('data/Fund.csv')" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "Fund.head()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# we will do the one sample t test for the first manager\n", 205 | "result=st.ttest_1samp(a = Fund['Manager1'], popmean = 0)\n", 206 | "print(result.pvalue)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "p_values = []\n", 216 | "manager_number = 5 \n", 217 | "\n", 218 | "for i in range(manager_number):\n", 219 | " result=st.ttest_1samp(a = Fund.iloc[:,i], popmean = 0)\n", 220 | " p_values.append(result.pvalue)\n", 221 | "\n", 222 | "print(p_values)\n", 223 | "\n", 224 | "\"\"\" \n", 225 | "The p-values are low for Managers One and Three, and high for the other three managers. \n", 226 | "However, we cannot simply reject H01 and H03, since this would fail to account for \n", 227 | "the multiple testing that we have performed. \n", 228 | "Instead, we will conduct Bonferroni’s method and Holm’s method to control the FWER.\n", 229 | "\"\"\"" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# we could bonferroni to adjust the raw p-values and take care of family wise error rate\n", 239 | "reject, p_values_corrected, alphacSidak, alphacBonf = multipletests(p_values, method = 'bonferroni')\n", 240 | "print(p_values_corrected)\n", 241 | "\"\"\" \n", 242 | "Therefore, using Bonferroni’s method, \n", 243 | "we are able to reject the null hypothesis only for Manager One while controlling the FWER at 0.05.\n", 244 | "This information is also available in the variable reject.\n", 245 | "\"\"\"\n", 246 | "print(reject)\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Bonferroni's method is more conservative. 
We could apply holm's method to control the FWER\n", 256 | "reject, p_values_corrected, alphacSidak, alphacBonf = multipletests(p_values, method = 'holm')\n", 257 | "print(p_values_corrected)\n", 258 | "print(reject)\n", 259 | "\"\"\" \n", 260 | "By contrast, using Holm’s method, the adjusted p-values indicate that we can reject the null hypotheses \n", 261 | "for Both Managers One and Three at a FWER of 0.05.\n", 262 | "\"\"\"" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# we can see the average for each manager \n", 272 | "Fund.mean(axis=0)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "# next, we could test whether 2 managers are significantly different. For example Manager 1 and Manager 2\n", 282 | "result=st.ttest_rel(a = Fund['Manager1'], b = Fund['Manager2'])\n", 283 | "print(result.pvalue)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "\"\"\" \n", 293 | "However, we decided to perform this test only after examining the data and \n", 294 | "noting that Managers One and Two had the highest and lowest mean performances. \n", 295 | "In a sense, this means that we have implicitly performed a manual selection \n", 296 | "from the 5(5 − 1)/2 = 10 hypothesis tests, rather than just one. \n", 297 | "Hence, we use Tukey’s method in order to adjust for multiple testing. \n", 298 | "\"\"\"\n", 299 | "returns = Fund.iloc[:, :5].to_numpy().flatten(order='F') # we flatten by col (i.e. order='F')\n", 300 | "manager = np.repeat(['1', '2', '3', '4', '5'], repeats=Fund.shape[0])\n", 301 | "\n", 302 | "# perform Tukey's test\n", 303 | "tukey = pairwise_tukeyhsd(endog=returns, groups=manager, alpha=0.05)\n", 304 | "\n", 305 | "print(tukey)\n", 306 | "\n", 307 | "\"\"\" \n", 308 | "Notice that the p-value for the difference between Managers One and Two has increased from 0.038 to 0.186, \n", 309 | "so there is no longer clear evidence of a difference between the managers’ performances.\n", 310 | "\n", 311 | "\"\"\"" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "## 13.6.3 The False Discovery Rate" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "p_values = []\n", 328 | "manager_number = Fund.shape[1]\n", 329 | "\n", 330 | "for i in range(manager_number):\n", 331 | " result=st.ttest_1samp(a = Fund.iloc[:,i], popmean = 0)\n", 332 | " p_values.append(result.pvalue)\n", 333 | "\n", 334 | "print(p_values[0:10])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "\"\"\" \n", 344 | "There are far too many managers to consider trying to control the FWER. \n", 345 | "Instead, we focus on controlling the FDR: that is, the expected fraction of rejected null \n", 346 | "hypotheses that are actually false positives. 
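The Benjamini-Hochberg procedure controls the FDR at level q by sorting the m p-values and rejecting every null hypothesis up to the largest j with p_(j) <= (j/m)q. 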
\n", 347 | "\"\"\"\n", 348 | "\n", 349 | "reject, p_values_corrected, alphacSidak, alphacBonf = multipletests(p_values, method = 'fdr_bh')\n", 350 | "print(p_values_corrected[0:10])\n", 351 | "\n", 352 | "\"\"\" \n", 353 | "The q-values output by the Benjamini-Hochberg procedure can be interpreted as the smallest \n", 354 | "FDR threshold at which we would reject a particular null hypothesis.\n", 355 | "\n", 356 | "For instance, a q-value of 0.1 indicates that we can reject the corresponding null hypothesis\n", 357 | "at an FDR of 10% or greater, but that we cannot reject the null hypothesis at an FDR below 10%.\n", 358 | "\"\"\"" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# we would find that 146 of the 2,000 fund managers have a p_values_corrected below 0.1\n", 368 | "sum(p_values_corrected <= .1)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "# if we use bonferroni method, we will find None\n", 378 | "sum(np.array(p_values) <= .1/Fund.shape[1])" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "## 13.6.4 A Re-Sampling Approach" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "# I saved the gene expression data as a json file, in python we could load the json file using the json library\n", 395 | "# after reading in the data, we can use the data is same as a dictionary, we can use the keys to access the data\n", 396 | "\n", 397 | "f = open('./data/Khan.json')\n", 398 | "Khan = json.load(f)\n", 399 | "\n", 400 | "X_train = np.array(Khan['xtrain'])\n", 401 | "y_train = np.array(Khan['ytrain'])\n", 402 | "X_test = np.array(Khan['xtest'])\n", 403 | "y_test = np.array(Khan['ytest'])" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "x = np.concatenate((X_train, X_test), axis=0)\n", 413 | "y = np.concatenate((y_train, y_test), axis=0)\n", 414 | "unique, counts = np.unique(y, return_counts=True)\n", 415 | "print(counts)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "# x1: take the x for cancer type == 2\n", 425 | "# x2: take the x for cancer type == 4\n", 426 | "x1 = x[y==2, :]\n", 427 | "x2 = x[y==4, :]\n", 428 | "n1 = x1.shape[0]\n", 429 | "n2 = x2.shape[0]\n", 430 | "print(n1)\n", 431 | "print(n2)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "# performing a standard two-sample t-test on the 11th (gene_index = 10 in python) gene produces a test-statistic \n", 441 | "gene_index = 10\n", 442 | "original_result=st.ttest_ind(a=x1[:,gene_index], b=x2[:,gene_index], equal_var=True)\n", 443 | "print(original_result.statistic)\n", 444 | "print(original_result.pvalue)\n", 445 | "\n", 446 | "\"\"\" \n", 447 | "The 2 sample t-test produces a test-statistic of −2.09 and an associated p-value of 0.0412, \n", 448 | "suggesting modest evidence of a difference in mean expression levels between the two cancer types.\n", 449 | "\"\"\"" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 
| "outputs": [], 457 | "source": [ 458 | "\"\"\" \n", 459 | "Instead of doing a parameterized 2 sample t-test, we could do a non-parameterized test(i.e. permutation test).\n", 460 | "we can randomly split the 54 patients (in cancer group 2 and 4) into two groups of 29 and 25 \n", 461 | "(same as the original split),and compute a new test statistic. \n", 462 | "Under the null hypothesis of no difference between the groups, this new test statistic should have \n", 463 | "the same distribution as our original one. \n", 464 | "Repeating this process many (i.e.10,000) times allows us to approximate the null distribution of the test statistic. \n", 465 | "We compute the fraction of the time that our observed test statistic exceeds the test statistics obtained \n", 466 | "via re-sampling.\n", 467 | "\"\"\"\n", 468 | "\n", 469 | "np.random.seed(21)\n", 470 | "iteration = 10000\n", 471 | "test_stats = []\n", 472 | "x_temp = np.concatenate((x1[:,gene_index], x2[:,gene_index]), axis=0)\n", 473 | "\n", 474 | "for i in range(iteration):\n", 475 | " np.random.shuffle(x_temp)\n", 476 | " result_temp = st.ttest_ind(a=x_temp[:n1], b=x_temp[-n2:], equal_var=True)\n", 477 | " test_stats.append(result_temp.statistic)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "print(np.mean((np.abs(test_stats) >= np.abs(original_result.statistic))))\n", 487 | "\n", 488 | "\"\"\" \n", 489 | "This fraction is our re-sampling-based p-value. It is almost identical to the p-value of 0.0412 \n", 490 | "obtained using the theoretical null distribution.\n", 491 | "\n", 492 | "The reason for this is that the parametrized distribution is a pretty good assumption in this case\n", 493 | "To see this, we can plot the histogram of the re-sampled statistics vs. parametrized distribution. \n", 494 | "\n", 495 | "We could try other genes (i.e. 
gene_index = 876) to see its theoretical and re-sampling null distributions are \n", 496 | "quite different\n", 497 | "\"\"\"" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# construct the t distribution \n", 507 | "df = n1 + n2 - 2\n", 508 | "rv = st.t(df)\n", 509 | "x = np.linspace(-4.2, 4.2, 1000)\n", 510 | "\n", 511 | "\n", 512 | "plt.hist(test_stats, 100, density=True, facecolor='g', alpha=0.75)\n", 513 | "plt.plot(x, rv.pdf(x), 'k-', lw=2)\n", 514 | "plt.xlabel('Null Distribution of Test Statistic')\n", 515 | "plt.ylabel('Probability')\n", 516 | "plt.title('Histogram of re-sample stats')\n", 517 | "plt.xlim(-4.2, 4.2)\n", 518 | "plt.grid(True)\n", 519 | "plt.show()" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "# we could do this for 100 and see how FDR works under re-sample \n", 529 | "# it would be good to use small iterations to make sure the code runs okay \n", 530 | "num_gene = 100\n", 531 | "iteration = 500\n", 532 | "test_stats_matrix = []\n", 533 | "test_stats_origin = []\n", 534 | "\n", 535 | "for j in range(num_gene):\n", 536 | " gene_index = j \n", 537 | " x_temp = np.concatenate((x1[:,gene_index], x2[:,gene_index]), axis=0)\n", 538 | " result_origin = st.ttest_ind(a=x1[:,gene_index], b=x2[:,gene_index], equal_var=True)\n", 539 | " test_stats_origin.append(result_origin.statistic)\n", 540 | " test_stats = []\n", 541 | " for i in range(iteration):\n", 542 | " np.random.shuffle(x_temp)\n", 543 | " result_temp = st.ttest_ind(a=x_temp[:n1], b=x_temp[-n2:], equal_var=True)\n", 544 | " test_stats.append(result_temp.statistic)\n", 545 | " \n", 546 | " test_stats_matrix.append(test_stats)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "test_stats_origin_sorted = np.sort(np.abs(test_stats_origin))" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "Rs = []\n", 565 | "Vs = []\n", 566 | "FDRs = []\n", 567 | "for j in range(num_gene):\n", 568 | " R = np.sum(np.abs(test_stats_origin) >= test_stats_origin_sorted[j])\n", 569 | " V = np.sum(np.abs(test_stats_matrix) >= test_stats_origin_sorted[j]) / iteration\n", 570 | " Rs.append(R)\n", 571 | " Vs.append(V)\n", 572 | " FDRs.append(V*1.0/R)\n", 573 | "\n", 574 | "Rs = np.array(Rs)\n", 575 | "Vs = np.array(Vs)\n", 576 | "FDRs = np.array(FDRs) " 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "print(np.max(Rs[FDRs <= .1]))\n", 586 | "print(np.max(Rs[FDRs <= .2]))" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "plt.plot(Rs, FDRs, 'k-', lw=2)\n", 596 | "plt.xlabel('Number of Rejections')\n", 597 | "plt.ylabel('False Discovery Rate')\n", 598 | "plt.show()" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "# End of Chapter 13" 608 | ] 609 | } 610 | ], 611 | "metadata": { 612 | "interpreter": { 613 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 614 | }, 615 | "kernelspec": { 616 | "display_name": "Python 
2.7.16 64-bit ('base': conda)", 617 | "name": "python3" 618 | }, 619 | "language_info": { 620 | "codemirror_mode": { 621 | "name": "ipython", 622 | "version": 3 623 | }, 624 | "file_extension": ".py", 625 | "mimetype": "text/x-python", 626 | "name": "python", 627 | "nbconvert_exporter": "python", 628 | "pygments_lexer": "ipython3", 629 | "version": "3.6.2" 630 | } 631 | }, 632 | "nbformat": 4, 633 | "nbformat_minor": 2 634 | } 635 | -------------------------------------------------------------------------------- /Chapter_2_sec_3.1_3.5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# 2.3 Lab: Introduction to R" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "## 2.3.1 Basic Commands" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "slideshow": { 30 | "slide_type": "slide" 31 | } 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "# best practice is to have all the modules imported at the top of the file, but for this one, I will import them when we need them\n", 36 | "import numpy as np # for calculation purpose, let us use np.array \n", 37 | "import random # for the random number generation\n", 38 | "\n", 39 | "x = np.array([1, 3, 2, 5])\n", 40 | "# use print to see the array\n", 41 | "print(x)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "slideshow": { 49 | "slide_type": "slide" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "x = np.array([1, 6, 2])\n", 55 | "print(x)\n", 56 | "y = [1, 4, 3]\n", 57 | "print(y)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# use len() to find length of a vector\n", 67 | "len(x) " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "len(y)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "print(x + y) # please note that we define x and y a little bit differently, but we still can do the calculation " 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# The whos function allows us to look at a list of all of the objects, such as data and functions, that we have saved so far\n", 95 | "%whos" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# reset_selective x\n", 105 | "del x " 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "%whos" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# read the description of a function \n", 124 | "%whos?" 
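, "\n", "np.mean?  # the trailing ? works for library functions too; this shows NumPy's mean docstring" 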
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# create a matrix (here as a nested list) \n", 134 | "x = [[1,2],[3, 4]]\n", 135 | "print(x)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# we could also reshape a one-dimensional array to a matrix\n", 145 | "x = np.array([1, 2, 3, 4])\n", 146 | "print(x)\n", 147 | "x = np.reshape(x, [2,2])\n", 148 | "print(x)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# then we can use the matrix to do some calculations\n", 158 | "np.sqrt(x)\n", 159 | "x**2\n", 160 | "np.square(x)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# use random sampling to generate random numbers/arrays/matrices\n", 170 | "mu, sigma = 0, 1\n", 171 | "x = np.random.normal(mu, sigma, 5)\n", 172 | "y = x + np.random.normal(20, 0.1, 5)\n", 173 | "print(x)\n", 174 | "print(y)\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# more calculations: the correlation matrix of x and y\n", 184 | "np.corrcoef(x, y) " 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# the above returns the correlation matrix; let us see just the correlation coefficient between x and y\n", 194 | "np.corrcoef(x, y)[0,1]" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "# we can use the seed function to set the random seed, so that every time we run the code, we get the same result\n", 206 | "np.random.seed(2333)  # note: we seed np.random itself; random.seed alone would not affect np.random.normal below" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# after setting the seed, this should generate the same result every time we run the notebook\n", 216 | "np.random.normal(mu, sigma, 5) " 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# you could increase the number of samples to see the empirical distribution converge to the theoretical distribution\n", 226 | "mu, sigma = 0, 1\n", 227 | "num_samples = 10\n", 228 | "x = np.random.normal(mu, sigma, num_samples)\n", 229 | "print(np.mean(x))\n", 230 | "print(np.var(x))\n", 231 | "print(np.sqrt(np.var(x)))\n", 232 | "print(np.std(x))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## 2.3.2 Graphics" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "import numpy as np # for calculation purposes, let us use np.array \n", 249 | "import random # for random number generation \n", 250 | "\n", 251 | "x = np.random.normal(0, 1, 100)\n", 252 | "y = np.random.normal(0, 1, 100)\n", 253 | "\n", 254 | "# in python, matplotlib is the most widely used library for plotting \n", 255 | "# matplotlib.pyplot is a collection of command style functions that make matplotlib work like MATLAB.\n", 256 | "import matplotlib.pyplot as plt\n", 257 | "\n", 258 | "\n", 259 | 
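"# 'bo' is a MATLAB-style format string: 'b' sets the color blue and 'o' draws circle markers\n", 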
"plt.plot(x, y, 'bo') # please use plt.plot? to look at more options \n", 260 | "plt.ylabel(\"this is the y-axis\")\n", 261 | "plt.xlabel(\"this is the x-axis\")\n", 262 | "plt.title(\"Plot of X vs Y\")\n", 263 | "plt.savefig('Figure.pdf') # use plt.savefig function to save images\n", 264 | "plt.show() \n" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# note the arange excludes right end of rande specification \n", 274 | "x = np.arange(1, 11) \n", 275 | "print(x) " 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "# note: np.arange actually can result in unexpected results; check np.arange(0.2, 0.6, 0.4) vs np.arange(0.2, 1.6, 1.4)\n", 285 | "print(np.arange(0.2, 0.6, 0.4))\n", 286 | "print(np.arange(0.2, 1.6, 1.4))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# in order to use Pi, math module needs to loaded first\n", 296 | "import math\n", 297 | "x = np.linspace(-math.pi, math.pi, num = 50)\n", 298 | "print(x)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "collapsed": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "import matplotlib.cm as cm\n", 310 | "import matplotlib.mlab as mlab\n", 311 | "y = x\n", 312 | "X, Y = np.meshgrid(x,y)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "%whos" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# same as above, use plt.contour? to explore the options\n", 331 | "f = np.cos(Y)/(1 + np.square(X))\n", 332 | "CS = plt.contour(X, Y, f)\n", 333 | "plt.show()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "# I think imshow looks nicer for heatmap, use 'extent =' fix the x, y axis\n", 343 | "fa = (f - f.T)/2 #f.T for transpose or tranpose(f)\n", 344 | "plt.imshow(fa, extent=(x[0], x[-1], y[0], y[-1])) \n", 345 | "plt.show()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "scrolled": true 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "from mpl_toolkits.mplot3d import axes3d\n", 357 | "fig = plt.figure()\n", 358 | "ax = fig.add_subplot(111, projection='3d')\n", 359 | "ax.plot_wireframe(X, Y, fa)\n", 360 | "plt.show()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## 2.3.3 Indexing Data \n", 368 | "Here we use np array. 
363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## 2.3.3 Indexing Data \n", 368 | "Here we use a np.array. If the data structure is something else, the methods below may not work" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "A = np.arange(1,17,1).reshape(4, 4).transpose()\n", 378 | "print(A)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "# one thing to note here is that in Python, the index starts from 0, not 1\n", 388 | "print(A[2, 3])" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "# try the same index as the book, and we get a different number. The reason is that R starts the index from 1 (Matlab too), but Python starts the index from 0. To select the same number (10) as the book did, we reduce each index by 1\n", 398 | "print(A[1, 2])" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "# to select a submatrix, the non-singleton dimension of your indexing array needs to be aligned with the axis you're indexing into, \n", 408 | "# e.g. for an n x m 2D subarray: A[n by 1 array,1 by m array]\n", 409 | "A[[[0],[2]], [1,3]]" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# this is another way to do that\n", 419 | "A[0:3:2, 1:4:2] " 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "# select all columns in those two rows \n", 429 | "A[0:3:2,:]" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "# select all rows in those two columns \n", 439 | "A[:, 1:4:2] " 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "# the last two examples include either no index for the columns or no index for the rows. These indicate that Python should include all columns or all rows, respectively\n", 449 | "A[0,:]" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "# the '-' sign has a different meaning and a good usage in Python: it means indexing from the end, so -1 means the last element \n", 459 | "A[-1, -1] " 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "# there are other ways to let Python keep all rows except certain indices. For example, we could also use a boolean mask. 
\n", 469 | "ind = np.ones((4,), bool)\n", 470 | "ind[[0,2]] = False\n", 471 | "print(ind)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "A[ind,:]" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "# we do not specify the row or column, the default is the for the row\n", 490 | "A[ind]" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "# we use .shape to get the shape of the matrix \n", 500 | "A.shape" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "collapsed": true 507 | }, 508 | "source": [ 509 | "## 2.3.4 Loading Data\n", 510 | "\n", 511 | "In Python, Pandas is a common used module to read from file into a data frame. I downloaded the Auto.csv from the book website. First, take a look at the csv file. There are headers, missing value is marked by '?'." 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "import pandas as pd \n", 521 | "Auto = pd.read_csv('data/Auto.csv', header=0, na_values='?')" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "# we could use .head to see the first few rows (default = 5) of the data \n", 531 | "Auto.head()" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "# check one record with missing value, and make sure the missing value is correctly imported. \n", 541 | "# Here we use the i.loc to select the row which is different from the indexing method above\n", 542 | "# the reason is that Auto is a pandas dataframe, while the indexing method was for a numpy array\n", 543 | "Auto.iloc[32]" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Use the same .shape function as in ndarray to find out the dimension of the data frame \n", 553 | "Auto.shape" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "# an alternative way to select the first 4 rows. 
\n", 563 | "Auto[:4]" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "# an alternative way to select the first 4 rows and first 2 columns.\n", 573 | "Auto.iloc[:4, :2]" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "# we can use list to find the column names or use .columns\n", 583 | "print(list(Auto))\n", 584 | "print(Auto.columns)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "# Use .isnull and .sum to find out how many NaNs in each variables\n", 594 | "Auto.isnull().sum()" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": { 601 | "collapsed": true 602 | }, 603 | "outputs": [], 604 | "source": [ 605 | "# after the previous steps, there are 397 obs in the data and only 5 with missing values. We can just drop the ones with missing values \n", 606 | "print(Auto.shape)\n", 607 | "Auto = Auto.dropna()\n", 608 | "print(Auto.shape)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "## 2.3.5 Additional Graphical and Numerical Summaries" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "# refer a column of data frame by name, by using a '.'. Ref the options in plt.plot for more.\n", 625 | "plt.plot(Auto.cylinders, Auto.mpg, 'ro')\n", 626 | "plt.show()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "# use .hist to get the histogram of certain variables. column = to specify which variable\n", 636 | "Auto.hist(column = ['cylinders', 'mpg'])\n", 637 | "plt.show()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "# use the .describe() to get a summary of the data frame. Use .describe ( include = 'all' ) for mix types, use describe(include = [np.number]) for numerical columns, use describe(include = ['O']) for objects.\n", 647 | "Auto.describe()" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": { 654 | "collapsed": true 655 | }, 656 | "outputs": [], 657 | "source": [ 658 | "# we can change type of certain variable(s). 
650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": { 654 | "collapsed": true 655 | }, 656 | "outputs": [], 657 | "source": [ 658 | "# we can change the type of certain variable(s). Here we change cylinders into a categorical variable \n", 659 | "Auto['cylinders'] = Auto['cylinders'].astype('category')" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "Auto.describe()" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "Auto.describe(include= 'all')" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": {}, 684 | "outputs": [], 685 | "source": [ 686 | "# End of Chapter 2" 687 | ] 688 | } 689 | ], 690 | "metadata": { 691 | "anaconda-cloud": {}, 692 | "interpreter": { 693 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 694 | }, 695 | "kernelspec": { 696 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 697 | "name": "python3" 698 | }, 699 | "language_info": { 700 | "codemirror_mode": { 701 | "name": "ipython", 702 | "version": 3 703 | }, 704 | "file_extension": ".py", 705 | "mimetype": "text/x-python", 706 | "name": "python", 707 | "nbconvert_exporter": "python", 708 | "pygments_lexer": "ipython3", 709 | "version": "3.6.2" 710 | } 711 | }, 712 | "nbformat": 4, 713 | "nbformat_minor": 1 714 | } 715 | -------------------------------------------------------------------------------- /Chapter_3_sec_6.1_6.7.ipynb: 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 3.6 Lab: Linear Regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 3.6.1 Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# in Python, a module can be imported by a command similar to 'import numpy as np'. \n", 24 | "# it is a good practice to maintain a section at the beginning of the notebook to import all necessary modules.\n", 25 | "# for a new module, you could use pip to install it. \n", 26 | "# for example 'pip install numpy'\n", 27 | "import numpy as np\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import pandas as pd \n", 30 | "import math\n", 31 | "import statsmodels.api as sm\n", 32 | "import statsmodels.formula.api as smf\n", 33 | "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", 34 | "from statsmodels.graphics.regressionplots import *\n", 35 | "from sklearn import datasets, linear_model\n", 36 | "from patsy import dmatrices" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# since in Python, there is no default MASS module or Boston dataset, I will read in the Boston dataset from CSV. 
The data is in the ./data folder.\n", 48 | "Boston = pd.read_csv('data/Boston.csv', header=0)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 3.6.2 Simple Linear Regression" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# use the commands we learned in the previous chapter to examine the data.\n", 65 | "list(Boston) # or Boston.columns" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "Boston.head()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "Boston.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# to have similar formula notation as R, use the following import. \n", 93 | "# One thing to note is the quoted formula string in the fitting step and the '.fit()' at the end.\n", 94 | "# after import statsmodels.formula.api as smf, we use smf to call the model. Of course, there are other ways to run linear regression in Python, such as sklearn.\n", 95 | "lm = smf.ols ('medv~lstat', data = Boston).fit()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "print(lm.summary())" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# use dir() to get a list of all the attributes an object has\n", 114 | "dir(lm)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# we can try a few \n", 124 | "print(lm.params)\n", 125 | "print(lm.conf_int())" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# provide prediction for 3 observations\n", 135 | "lm.predict(pd.DataFrame({'lstat':[5, 10, 15]}))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# plot the fitted line, we only take two extreme points to make the plot\n", 145 | "X_new = pd.DataFrame({'lstat': [Boston.lstat.min(), Boston.lstat.max()]})\n", 146 | "preds = lm.predict(X_new)\n", 147 | "Boston.plot(kind='scatter', x='lstat', y='medv')\n", 148 | "plt.plot(X_new, preds, c='red', linewidth=2)\n", 149 | "plt.show()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "fig, ((ax1, ax2), (ax3, ax4))= plt.subplots(2, 2)\n", 159 | "ax1.plot(Boston.lstat, lm.predict(),'ro')\n", 160 | "ax2.plot(lm.predict(), lm.resid, 'go')\n", 161 | "ax3.plot(lm.predict(), lm.resid_pearson, 'bo')\n", 162 | "plt.show()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# the statistics of the linear regression are mostly stored in lm.get_influence(), for example, the Cook's distances and leverage.\n", 172 | "dir(lm.get_influence())\n", 173 | "# for example, the following identifies the observation with the largest leverage \n", 174 | 
"np.argmax(lm.get_influence().hat_matrix_diag)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# from statsmodels.graphics.regressionplots import * just as a reference\n", 184 | "plot_leverage_resid2(lm)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# as mentioned above. For machine learning models, sklearn is the most common used module, but sklearn is a little bit less on statistics.\n", 194 | "x = pd.DataFrame(Boston.lstat)\n", 195 | "y = Boston.medv\n", 196 | "print(x.shape)\n", 197 | "\n", 198 | "model = linear_model.LinearRegression()\n", 199 | "model.fit(x, y)\n", 200 | "print(model.intercept_)\n", 201 | "print(model.coef_)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## 3.6.3 Multiple Linear Regression" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "# we can still use smg.ols to run multiple linear regression.\n", 220 | "lm = smf.ols ('medv~lstat+age', data = Boston).fit()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "print(lm.summary())" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# if we want to use all the variable. We can use the following trick to manually construct the list. In Python, most of time, you have to manully construct the variable list.\n", 239 | "all_columns = \"+\".join(Boston.columns.difference([\"medv\"]))\n", 240 | "my_formula = \"medv~\" + all_columns\n", 241 | "lm = smf.ols(my_formula, data=Boston).fit()\n", 242 | "print(lm.summary())" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "# unlike R, Python is not fully up speeded to all the statistics. 
245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "# unlike R, Python has not fully caught up on all the statistics. If you want the VIF of the variables in a linear model, you have to code a little bit.\n", 254 | "# from patsy import dmatrices\n", 255 | "# from statsmodels.stats.outliers_influence import variance_inflation_factor" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "y, X = dmatrices(my_formula, data=Boston, return_type='dataframe')\n", 265 | "vif_coeff = {}\n", 266 | "for i in range(X.shape[1]):\n", 267 | " vif_coeff[X.columns[i]] = variance_inflation_factor(np.array(X.dropna()),i)\n", 268 | " \n", 269 | "print(vif_coeff)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## 3.6.4 Interaction Terms" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# we use * to add interaction terms\n", 286 | "lm = smf.ols('medv~lstat * age', data=Boston).fit()\n", 287 | "print(lm.summary())" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "## 3.6.5 Non-linear Transformations of the Predictors " 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "lm_order1 = smf.ols('medv~ lstat', data=Boston).fit()\n", 304 | "lm_order2 = smf.ols('medv~ lstat+ I(lstat ** 2.0)', data=Boston).fit()\n", 305 | "print(lm_order2.summary())" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "fig, ((ax1, ax2), (ax3, ax4))= plt.subplots(2, 2)\n", 315 | "ax1.plot(Boston.lstat, lm_order1.predict(),'ro')\n", 316 | "ax3.plot(lm_order1.predict(), lm_order1.resid, 'go')\n", 317 | "ax4.plot(lm_order1.predict(), lm_order1.resid_pearson, 'bo')\n", 318 | "plt.show()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "### if we add in the second order term, we can see the residuals are more random" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "fig, ((ax1, ax2), (ax3, ax4))= plt.subplots(2, 2)\n", 335 | "ax1.plot(Boston.lstat, lm_order2.predict(),'ro')\n", 336 | "ax2.plot(Boston.lstat ** 2.0, lm_order2.predict(),'ro')\n", 337 | "ax3.plot(lm_order2.predict(), lm_order2.resid, 'go')\n", 338 | "ax4.plot(lm_order2.predict(), lm_order2.resid_pearson, 'bo')\n", 339 | "plt.show()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# there is an anova function built in already in statsmodels. \n", 349 | "# if you know what to do, use the key words to google it and likely you will find a very good answer. \n", 350 | "# here we compare the models with one order of lstat and two orders of lstat. 
\n", 351 | "# by looking at the p value that will reject the null hypothesis that the coefficent of lstat**2 equals 0.\n", 352 | "table = sm.stats.anova_lm(lm_order1, lm_order2)\n", 353 | "print(table)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "lm_log = smf.ols('medv~ np.log(rm)', data=Boston).fit()\n", 363 | "lm_log.summary()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## 3.6.6 Qualitative Predictors \n", 371 | "\n", 372 | "I prepared the Carseats file from .Rdata. And it is saved under the data folder. Let us load them in and explore this dataset." 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": true 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "Carseats = pd.read_csv('data/Carseats.csv', header=0)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "list(Carseats)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "Carseats.dtypes" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "Carseats.head()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "lm_carseats = smf.ols('Sales ~ Income + Advertising + Price + Age', data = Carseats).fit()" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "lm_carseats.summary()" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "# let us create dummy variables using get_dummies, then exclude the first dummy column\n", 440 | "ShelveLoc_dummies = pd.get_dummies(Carseats.ShelveLoc, prefix='ShelveLoc').iloc[:,1:]" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "Carseats_dummy = pd.concat([Carseats, ShelveLoc_dummies], axis=1)\n", 450 | "Carseats_dummy.head()" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "# then the model buliding will be the same with all numerrical variables.\n", 460 | "lm_carseats_dummy = smf.ols('Sales ~ Income + Advertising + Price + Age + ShelveLoc_Good + ShelveLoc_Medium', \n", 461 | " data = Carseats_dummy).fit()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "# the interpretation of the coefficients are holding everything fixed, Medium shelve location is associated with an average\n", 471 | "# increase of sale around 2.0046. 
\n", 472 | "lm_carseats_dummy.summary() " 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "# Compapre the summary of two models, one with explicit encoding of dummy varible, while the other used the built-in function.\n", 484 | "lm_carseats_wo_dummy = smf.ols('Sales ~ Income + Advertising + Price + Age + C(ShelveLoc)', \n", 485 | " data = Carseats).fit()\n", 486 | "lm_carseats_wo_dummy.summary()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## 3.6.7 Writing Functions" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "# let us write a simple function to print current time. \n", 503 | "# yhe key word in Python for user defined function is 'def'. \n", 504 | "# pay attention to the ':'. The difference betwwen R (others) and Python is that Python \n", 505 | "# forces you to obey its indentation rules. For example, the following function won't work because of the extra space in front of 'print'.\n", 506 | "def print_current_time_wrong():\n", 507 | " from datetime import datetime # this is very bad practice !!! \n", 508 | " print(str(datetime.now())) " 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": { 515 | "collapsed": true 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "def print_current_time():\n", 520 | " from datetime import datetime\n", 521 | " print (str(datetime.now())) " 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "print_current_time()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "# End of Chapter 3." 
540 | ] 541 | } 542 | ], 543 | "metadata": { 544 | "anaconda-cloud": {}, 545 | "interpreter": { 546 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 547 | }, 548 | "kernelspec": { 549 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 550 | "name": "python3" 551 | }, 552 | "language_info": { 553 | "codemirror_mode": { 554 | "name": "ipython", 555 | "version": 3 556 | }, 557 | "file_extension": ".py", 558 | "mimetype": "text/x-python", 559 | "name": "python", 560 | "nbconvert_exporter": "python", 561 | "pygments_lexer": "ipython3", 562 | "version": "3.6.2" 563 | } 564 | }, 565 | "nbformat": 4, 566 | "nbformat_minor": 1 567 | } 568 | -------------------------------------------------------------------------------- /Chapter_5_sec_3.1_3.4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 5.3 Lab: Cross-Validation and the Bootstrap" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 5.3.1 The Validation Set Approach" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import scipy\n", 26 | "import pandas as pd \n", 27 | "import math\n", 28 | "import random\n", 29 | "import statsmodels.api as sm\n", 30 | "import statsmodels.formula.api as smf\n", 31 | "from statsmodels.graphics.regressionplots import *\n", 32 | "from sklearn import datasets, linear_model\n", 33 | "from sklearn.model_selection import KFold, cross_val_score\n", 34 | "from sklearn.preprocessing import PolynomialFeatures\n", 35 | "from sklearn.linear_model import LinearRegression\n", 36 | "from sklearn.pipeline import Pipeline\n", 37 | "from collections import OrderedDict" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "Auto = pd.read_csv('data/Auto.csv', header=0, na_values='?')\n", 47 | "Auto = Auto.dropna().reset_index(drop=True) # drop the observation with NA values and reindex the obs from 0\n", 48 | "print(Auto.shape)\n", 49 | "print(Auto.head())" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# split the data into training and record the index of train samples\n", 59 | "np.random.seed(1)\n", 60 | "train = np.random.choice(Auto.shape[0], 196, replace=False)\n", 61 | "select = np.in1d(range(Auto.shape[0]), train)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# start to build the model\n", 71 | "lm = smf.ols ('mpg~horsepower', data = Auto[select]).fit()\n", 72 | "print(lm.summary())" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# to follow the book, get prediction for all the observations in the dataset\n", 82 | "# here we use ~ select to exclude the result of the training samples\n", 83 | "preds = lm.predict(Auto)\n", 84 | "square_error = (Auto['mpg'] - preds)**2\n", 85 | "print('-------- Test error for 1st order model --------')\n", 86 | "print(np.mean(square_error[~select]))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | 
"outputs": [], 94 | "source": [ 95 | "# build a model with 2nd order of features \n", 96 | "lm2 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0)', data = Auto[select]).fit()\n", 97 | "preds = lm2.predict(Auto)\n", 98 | "square_error = (Auto['mpg'] - preds)**2\n", 99 | "print('--------Test error for 2nd order--------')\n", 100 | "print(square_error[~select].mean())" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# build a model with 3rd order of features \n", 110 | "lm3 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0) + I(horsepower ** 3.0)', data = Auto[select]).fit()\n", 111 | "preds = lm3.predict(Auto)\n", 112 | "square_error = (Auto['mpg'] - preds)**2\n", 113 | "print('--------Test rror for 3rd order--------')\n", 114 | "print(np.mean(square_error[~select]))\n", 115 | "\n", 116 | "\"\"\" \n", 117 | "These results are consistent with our previous findings: a model that predicts mpg using a quadratic function of \n", 118 | "horsepower performs better than a model that involves only a linear function of horsepower, \n", 119 | "and there is little evidence in favor of a model that uses a cubic function of horsepower.\n", 120 | "\"\"\"" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# if we look at the summmary for 3rd order regression, \n", 130 | "# the coefficient of the 3rd order term is not statistically significant. \n", 131 | "# I will use this as Supporting evidence for the above claim. \n", 132 | "print(lm3.summary())" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## 5.3.2 Leave-One-Out Cross-Validation\n", 140 | "The LOOCV estimates only keep one sample in the validation data and use the rest of the data to train the model. This way the training model has similar dataset comparing to the model trained on entire dataset." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# OLS fit \n", 150 | "ols_fit = smf.ols ('mpg~horsepower', data = Auto).fit()\n", 151 | "print(ols_fit.params)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "# GLM fit. Compare with OLS fit, the coeffs are the same\n", 161 | "glm_fit = smf.glm('mpg~horsepower', data = Auto).fit()\n", 162 | "print(glm_fit.params)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "# trying CV in Python is not as easy as that in R. 
It will require some manual coding.\n", 174 | "# to use some of the implemented functions in Python, we use sklearn for the linear model \n", 175 | "# from sklearn.model_selection import KFold, cross_val_score\n", 176 | "# from sklearn.preprocessing import PolynomialFeatures\n", 177 | "# from sklearn.linear_model import LinearRegression\n", 178 | "# from sklearn.pipeline import Pipeline" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# let us re-train the model in sklearn\n", 188 | "x = pd.DataFrame(Auto.horsepower)\n", 189 | "y = Auto.mpg\n", 190 | "\n", 191 | "model = LinearRegression()\n", 192 | "model.fit(x, y)\n", 193 | "print(model.intercept_)\n", 194 | "print(model.coef_)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "# LOO uses a number of folds equal to the # of observations. We could also choose another number of folds.\n", 204 | "k_fold = KFold(n_splits=x.shape[0]) \n", 205 | "test = cross_val_score(model, x, y, cv=k_fold, scoring = 'neg_mean_squared_error', n_jobs=-1)\n", 206 | "print(np.mean(-test))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# for higher order polynomial fits, we use the pipeline tool. \n", 216 | "# below shows how to fit polynomials of odd orders from 1 to 19 and show the LOO results\n", 217 | "# this step may take a few mins\n", 218 | "A = OrderedDict()\n", 219 | "n_split = x.shape[0]\n", 220 | "for porder in range(1, 21, 2):\n", 221 | " model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])\n", 222 | " k_fold = KFold(n_splits=n_split) # loo use folds equal to # of observations\n", 223 | " test = cross_val_score(model, x, y, cv=k_fold, scoring = 'neg_mean_squared_error', n_jobs=-1)\n", 224 | " A[str(porder)] = np.mean(-test)\n", 225 | " \n", 226 | "print(A)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## 5.3.3 k-Fold Cross-Validation" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# K-fold validation is exactly the same as LOO with a different n_splits parameter setup. \n", 243 | "# the computation time is much shorter than that of LOOCV (see also the LeaveOneOut aside below).\n", 244 | "np.random.seed(2)\n", 245 | "A = OrderedDict()\n", 246 | "n_split = 10\n", 247 | "for porder in range(1, 21, 2):\n", 248 | " model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])\n", 249 | " k_fold = KFold(n_splits=n_split) \n", 250 | " test = cross_val_score(model, x, y, cv = k_fold, scoring = 'neg_mean_squared_error', n_jobs = -1)\n", 251 | " A[str(porder)] = np.mean(-test)\n", 252 | " \n", 253 | "print(A)" 254 | ] 255 | },
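# [added aside] sklearn also ships a dedicated LeaveOneOut splitter, equivalent to the
# KFold(n_splits=x.shape[0]) used above; a sketch reusing x, y and the linear model from this section.
from sklearn.model_selection import LeaveOneOut
loo_scores = cross_val_score(LinearRegression(), x, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error', n_jobs=-1)
print(np.mean(-loo_scores))   # should match the KFold-based LOOCV number above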
256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## 5.3.4 The Bootstrap\n", 261 | "Bootstrap means sampling with replacement. To eliminate the effect of sample size, the normal practice is to sample the same size as the original dataset, with replacement.\n", 262 | "\n", 263 | "Bootstrap can be used in a lot of other places, such as estimating the accuracy of linear regression coefficients (sketched in the aside below), conducting non-parametric tests (permutation tests), or estimating some complicated probability " 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "Portfolio = pd.read_csv('data/Portfolio.csv', header=0)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# to illustrate the use of the bootstrap on this data, we must first create a function, alpha_fn(), \n", 284 | "# which takes as input the (X, Y) data as well as a vector indicating which observations should be used to estimate alpha.\n", 285 | "def alpha_fn(data, index):\n", 286 | " X = data.X.iloc[index]\n", 287 | " Y = data.Y.iloc[index]\n", 288 | " return (np.var(Y) - np.cov(X,Y)[0,1])/(np.var(X) + np.var(Y) - 2 * np.cov(X, Y)[0,1])" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "alpha_fn(Portfolio, range(0,100))" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "# generate one set of random indices with 100 elements. The array has been sorted to show there are repeated elements.\n", 307 | "np.sort(np.random.choice(range(0, 100), size=100, replace=True))" 308 | ] 309 | },
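# [added aside] as noted in the markdown above, the bootstrap can also estimate the variability of
# regression coefficients; a minimal sketch on the Auto data (boot_fn is a hypothetical helper
# mirroring the book's boot.fn; Auto, np and smf are already loaded in this notebook).
def boot_fn(data, index):
    # refit mpg~horsepower on the bootstrap sample given by 'index'
    return smf.ols('mpg~horsepower', data = data.iloc[index]).fit().params.values

n_obs = Auto.shape[0]
boot_coefs = np.array([boot_fn(Auto, np.random.randint(0, n_obs, n_obs)) for _ in range(1000)])
print(boot_coefs.mean(axis=0))   # bootstrap means of (intercept, slope)
print(boot_coefs.std(axis=0))    # bootstrap standard errors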
\n", 317 | "alpha_fn(Portfolio, np.random.choice(range(0, 100), size=100, replace=True))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "# since I am not aware of boot like function in python, I just defined an ad-hoc function called boot_python()\n", 329 | "def boot_python(data, input_fun, iteration):\n", 330 | " n = Portfolio.shape[0]\n", 331 | " idx = np.random.randint(0, n, (iteration, n))\n", 332 | " stat = np.zeros(iteration)\n", 333 | " for i in range(len(idx)):\n", 334 | " stat[i] = input_fun(data, idx[i])\n", 335 | " \n", 336 | " return {'Mean': np.mean(stat), 'STD': np.std(stat)}\n", 337 | " " 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "boot_python(Portfolio, alpha_fn, 1000)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "# End of Chapter 5" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "anaconda-cloud": {}, 361 | "interpreter": { 362 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 363 | }, 364 | "kernelspec": { 365 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.6.2" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 1 383 | } 384 | -------------------------------------------------------------------------------- /Chapter_7_sec_7.8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 7.8 Lab: Non-linear Modeling " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import scipy\n", 19 | "import pandas as pd \n", 20 | "from sklearn.linear_model import LinearRegression\n", 21 | "from sklearn.metrics import mean_squared_error, r2_score\n", 22 | "from sklearn.preprocessing import PolynomialFeatures\n", 23 | "import statsmodels.api as sm\n", 24 | "from patsy import dmatrix\n", 25 | "\n", 26 | "%matplotlib inline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# in this lab, we will use Wage data. Let us read in the CSV data ans look at a sample of this data.\n", 36 | "Wage = pd.read_csv('data/Wage.csv', header=0, na_values='NA')\n", 37 | "print(Wage.shape)\n", 38 | "print(Wage.head())" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## 7.8.1 Polynomial Regression and Step Functions" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "\"\"\"\n", 55 | "We will examine how to fit a polynomial regression model on the wage dataset. As all the techniques, \n", 56 | "we have multiple ways to do this. 
Here I will use sklearn, as we already used statsmodels.api before in Chapter 3. \n", 57 | "If you are looking for more built-in functions around p-values, significance, confidence intervals, etc., \n", 58 | "I would recommend using statsmodels.api. \n", 59 | "\n", 60 | "scikit-learn does not have built-in error estimates for doing inference, but this problem forces us to \n", 61 | "think about a more general method to find confidence intervals (key word: bootstrap) \n", 62 | "\n", 63 | "Numpy also has a nice function to do polynomial regression: https://www.ritchieng.com/machine-learning-polynomial-regression/\n", 64 | "\"\"\"\n", 65 | "\n", 66 | "n_deg = 4\n", 67 | "X = Wage.age\n", 68 | "y = Wage.wage\n", 69 | "X = X.values.reshape(X.shape[0], 1)\n", 70 | "y = y.values.reshape(y.shape[0], 1)\n", 71 | "\n", 72 | "polynomial_features= PolynomialFeatures(degree=n_deg)\n", 73 | "X_poly = polynomial_features.fit_transform(X)\n", 74 | "\n", 75 | "reg = LinearRegression()\n", 76 | "reg.fit(X_poly, y)\n", 77 | "\n", 78 | "# get coefficients and compare with the numbers \n", 79 | "print(reg.intercept_)\n", 80 | "print(reg.coef_)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# we now create a grid of values for age at which we want predictions, and then call the generic predict() function \n", 90 | "# generate a sequence of age values spanning the range\n", 91 | "age_grid = np.arange(Wage.age.min(), Wage.age.max()).reshape(-1,1)\n", 92 | "\n", 93 | "# generate test data using PolynomialFeatures and fit_transform\n", 94 | "X_test = PolynomialFeatures(degree=n_deg).fit_transform(age_grid)\n", 95 | "\n", 96 | "# predict the value of the generated ages\n", 97 | "y_pred = reg.predict(X_test)\n", 98 | "\n", 99 | "# creating plots\n", 100 | "plt.plot(age_grid, y_pred, color='red')\n", 101 | "plt.show()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "\"\"\"\n", 111 | "Next we need to decide the order of the polynomial.\n", 112 | "In the book, the authors did this by using hypothesis testing. ANOVA using the F-test was explained. \n", 113 | "In order to use the ANOVA function, two models $M_1$ and $M_2$ must be nested models: \n", 114 | "the predictors in $M_1$ must be a subset of the predictors in $M_2$. \n", 115 | "statsmodels.api has a nice built-in function to do that. \n", 116 | "\n", 117 | "As an alternative, we could choose the polynomial degree using cross-validation, as discussed before. \n", 118 | "Actually, the cross-validation approach is more commonly used in practice; a sketch follows after this docstring. \n", 119 | "\"\"\"\n",
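# [added aside] the cross-validation route mentioned just above, sketched with sklearn
# (10-fold CV over polynomial degrees; X and y are the reshaped age/wage arrays defined earlier).
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
for degree in range(1, 6):
    pipe = Pipeline([('poly', PolynomialFeatures(degree=degree)), ('ols', LinearRegression())])
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    mse = -cross_val_score(pipe, X, y.ravel(), cv=cv, scoring='neg_mean_squared_error').mean()
    print(degree, mse)   # pick the degree where the CV error stops improving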
\n", 119 | "\"\"\"\n", 120 | "\n", 121 | "X1 = PolynomialFeatures(1).fit_transform(X)\n", 122 | "X2 = PolynomialFeatures(2).fit_transform(X)\n", 123 | "X3 = PolynomialFeatures(3).fit_transform(X)\n", 124 | "X4 = PolynomialFeatures(4).fit_transform(X)\n", 125 | "X5 = PolynomialFeatures(5).fit_transform(X)\n", 126 | "fit1 = sm.GLS(y, X1).fit()\n", 127 | "fit2 = sm.GLS(y, X2).fit()\n", 128 | "fit3 = sm.GLS(y, X3).fit()\n", 129 | "fit4 = sm.GLS(y, X4).fit()\n", 130 | "fit5 = sm.GLS(y, X5).fit()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "print(sm.stats.anova_lm(fit1, fit2, fit3, fit4, fit5, type=1))\n", 140 | "\n", 141 | "\"\"\"\n", 142 | "The above table, we fit five different models and sequentially compare the simpler model to the more complex model.\n", 143 | "The summary above shows the quadratic model fit2 is significantly better than fit1 at p value of $2.36*10^{-32}$.\n", 144 | "Similarly, the cubic model is significnatly better than the quadratic model ($p = 1.68 * 10^{-3}$).\n", 145 | "The p-value comparing the cubic and degree-4 polynomials, fit3 and fit4, is approximately 0.05 \n", 146 | "while the degree-5 polynomial fit5 seems unnecessary because its p-value is 0.37. \n", 147 | "Hence, either a cubic or a quartic polynomial appear to provide a reasonable fit to the data, \n", 148 | "but lower- or higher-order models are not justified.\n", 149 | "\"\"\"" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# in the book, the authors also discussed logistic regression and the polynomial terms. \n", 159 | "# in python, sm.GLM function provided some functions similar to glm() in R.\n", 160 | "logistic_model = sm.GLM ((y>250), X4, family=sm.families.Binomial())\n", 161 | "logistic_fit = logistic_model.fit()\n", 162 | "print(logistic_fit.summary())" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# in python, we could use the pd.cut() function to fit a step function.\n", 172 | "age_cut, bins = pd.cut(Wage.age, bins=4, retbins=True, right=True)\n", 173 | "age_cut.value_counts(sort=False)\n", 174 | "\n", 175 | "\"\"\" \n", 176 | "Here cut() automatically picked the cutpoints at 33.5, 49, and 64.5 years of age. \n", 177 | "We could also have specified our own cutpoints directly using the breaks option (set bins into a sequence of scalars, e.g. [0, 10, 20, 40, 100]). \n", 178 | "Note in the following code, I manually added a constant column and dropped the lowest value bin (17.938, 33.5] dummy variable.\n", 179 | "\"\"\"" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "age_cut_dummies = pd.get_dummies(age_cut)\n", 189 | "age_cut_dummies = sm.add_constant(age_cut_dummies)\n", 190 | "fit_age_cut = sm.GLM(Wage.wage, age_cut_dummies.drop(age_cut_dummies.columns[1], axis=1)).fit()\n", 191 | "print(fit_age_cut.summary())" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## 7.8.2 Splines" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# in order to fit regression splines in python, we use the spatsy library. 
\n", 208 | "# from patsy import dmatrix\n", 209 | "\n", 210 | "\"\"\" \n", 211 | "In the content of section 7.4, we saw that regression splines can be fit by constructing an appropriate matrix of basis functions. \n", 212 | "The bs() function generates the entire matrix of bs() basis functions for splines with the specified set of knots. \n", 213 | "By default, cubic splines are produced. Here we have prespecified knots at ages 25, 40, and 60. \n", 214 | "This produces a spline with six basis functions. \n", 215 | "\"\"\"\n", 216 | "age_grid = np.arange(Wage.age.min(), Wage.age.max()).reshape(-1,1)\n", 217 | "spline_basis1 = dmatrix(\"bs(Wage.age, knots=(25,40,60), degree=3, include_intercept=False)\", {\"Wage.age\": Wage.age}, return_type='dataframe')" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# now we can fit the model using the spline basis functions\n", 227 | "spline_fit1 = sm.GLM(Wage.wage, spline_basis1).fit()\n", 228 | "spline_fit1.summary()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# another approach is to fix the degree of freedom and let the code to automatically choose the knots.\n", 238 | "spline_basis2 = dmatrix(\"bs(Wage.age, df=6, include_intercept=False)\",\n", 239 | " {\"Wage.age\": Wage.age}, return_type='dataframe')\n", 240 | "spline_fit2 = sm.GLM(Wage.wage, spline_basis2).fit()\n", 241 | "spline_fit2.summary()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "# package patsy also has a nice function to do natural spline using cr()\n", 251 | "spline_basis3 = dmatrix(\"cr(Wage.age, df=4)\", {\"Wage.age\": Wage.age}, return_type='dataframe')\n", 252 | "spline_fit3 = sm.GLM(Wage.wage, spline_basis3).fit()\n", 253 | "spline_fit3.summary()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "# finally, let us make some predictions\n", 265 | "pred1 = spline_fit1.predict(dmatrix(\"bs(age_grid, knots=(25,40,60), include_intercept=False)\",{\"age_grid\": age_grid}, return_type='dataframe'))\n", 266 | "pred2 = spline_fit2.predict(dmatrix(\"bs(age_grid, df=6, include_intercept=False)\",{\"age_grid\": age_grid}, return_type='dataframe'))\n", 267 | "pred3 = spline_fit3.predict(dmatrix(\"cr(age_grid, df=4)\", {\"age_grid\": age_grid}, return_type='dataframe'))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "# plot the splines and error bands\n", 277 | "plt.scatter(Wage.age, Wage.wage, facecolor='None', edgecolor='k', alpha=0.1)\n", 278 | "plt.plot(age_grid, pred1, color='r', label='Cubic spine with knots at [25, 40, 60]')\n", 279 | "plt.plot(age_grid, pred2, color='g', label='Cubic spine with df=6')\n", 280 | "plt.plot(age_grid, pred3, color='b', label='Natural spline df=4')\n", 281 | "plt.legend()\n", 282 | "plt.xlim(15,85)\n", 283 | "plt.ylim(0,350)\n", 284 | "plt.xlabel('age')\n", 285 | "plt.ylabel('wage')\n", 286 | "plt.show()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## 7.8.3 GAMs" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | 
"metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# we now fit a GAM to predict wage using natural spline functions of year and age, treating education as a qualitative (i.e. categorical) predictor.\n", 303 | "age_basis = dmatrix(\"cr(Wage.age, df=5)\", {\"Wage.age\": Wage.age}, return_type='dataframe')\n", 304 | "year_basis = dmatrix(\"cr(Wage.year, df=4)\", {\"Wage.year\": Wage.year}, return_type='dataframe').drop (['Intercept'], axis = 1)\n", 305 | "education_dummies = pd.get_dummies(Wage.education)\n", 306 | "education_dummies = education_dummies.drop([education_dummies.columns[0]], axis = 1)\n", 307 | "\n", 308 | "# we concatenate all the predictors\n", 309 | "x_all = pd.concat([age_basis, year_basis, education_dummies], axis=1)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# fit a model and print the summary\n", 321 | "gam1_fit = sm.OLS(Wage.wage, x_all).fit()\n", 322 | "print(gam1_fit.summary())\n", 323 | "\n", 324 | "\"\"\" \n", 325 | "We could apply similar analysis procedure to this analysis, \n", 326 | "such as ANOVA, construction of a classification model and visually inspecting the model performance.\n", 327 | "\"\"\"" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# End of Chapter 7" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "interpreter": { 342 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 343 | }, 344 | "kernelspec": { 345 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 346 | "name": "python3" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.6.2" 359 | } 360 | }, 361 | "nbformat": 4, 362 | "nbformat_minor": 1 363 | } 364 | -------------------------------------------------------------------------------- /Chapter_8_sec_8.3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 8.3 Lab: Decision Trees" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "from sklearn.model_selection import train_test_split\n", 19 | "from sklearn.tree import DecisionTreeClassifier, export_graphviz, DecisionTreeRegressor, plot_tree\n", 20 | "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", 21 | "from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error\n", 22 | "from sklearn import tree\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "\n", 25 | "%matplotlib inline" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## 8.3.1 Fitting Classification Trees" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "\"\"\" \n", 42 | "The sklearn library has a lot of useful tools for tress. 
We first use classification trees to analyze the Carseats data set.\n", 43 | "In these data, Sales is a continuous variable, and so we begin by recoding it as a binary variable (by thresholding). \n", 44 | "We use the map() function to create a variable, called High, which takes on a value of 'Y' if the Sales variable exceeds 8, \n", 45 | "and takes on a value of 'N' otherwise. In Python, we need to encode categorical variables into dummy/numeric variables.\n", 46 | "\"\"\"\n", 47 | "\n", 48 | "carseats = pd.read_csv('./data/Carseats.csv')\n", 49 | "carseats['High'] = carseats.Sales.map(lambda x: 'Y' if x>8 else 'N')\n", 50 | "carseats.ShelveLoc = pd.factorize(carseats.ShelveLoc)[0]\n", 51 | "carseats.Urban = carseats.Urban.map({'No':0, 'Yes':1})\n", 52 | "carseats.US = carseats.US.map({'No':0, 'Yes':1})\n", 53 | "print(carseats.describe())\n", 54 | "print(carseats.info())" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# we first split the dataset into training (200 samples) and test sets.\n", 66 | "X = carseats.drop(['Sales', 'High'], axis=1)\n", 67 | "y = carseats.High\n", 68 | "train_size = 200\n", 69 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, \n", 70 | " test_size=X.shape[0]-train_size, random_state=0)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# to build a tree, we could use 'gini' or 'entropy' as the split criterion at each node. \n", 80 | "# here we use an example with 'gini' and a few other hyperparameters.\n", 81 | "criteria = \"gini\" \n", 82 | "max_depth = 6 \n", 83 | "min_sample_leaf = 4\n", 84 | "clf_gini = DecisionTreeClassifier(criterion=criteria, random_state=100,\n", 85 | " max_depth=max_depth, min_samples_leaf=min_sample_leaf)\n", 86 | "clf_gini.fit(X_train, y_train)\n", 87 | "print(clf_gini.score(X_train, y_train))\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# one attractive feature of a tree is visualization. \n", 97 | "plt.figure(figsize=(40,20)) # customize according to the size of your tree\n", 98 | "plot_tree(clf_gini, feature_names = X_train.columns)\n", 99 | "plt.show()\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# let us build the confusion matrix to evaluate the model in accuracy for both training and test datasets.\n", 109 | "# we could also compute more metrics such as precision, recall, f1-score, etc.\n", 110 | "y_pred_train = clf_gini.predict(X_train)\n", 111 | "cm = pd.DataFrame(confusion_matrix(y_train, y_pred_train).T, index=['No', 'Yes'], columns=['No', 'Yes'])\n", 112 | "print(cm)\n", 113 | "print(\"Train Accuracy is \", accuracy_score(y_train,y_pred_train)*100)\n", 114 | "\n", 115 | "\n", 116 | "y_pred = clf_gini.predict(X_test)\n", 117 | "cm = pd.DataFrame(confusion_matrix(y_test, y_pred).T, index=['No', 'Yes'], columns=['No', 'Yes'])\n", 118 | "print(cm)\n", 119 | "print(\"Test Accuracy is \", accuracy_score(y_test,y_pred)*100)\n", 120 | "\n", 121 | "\"\"\"\n", 122 | "The test accuracy of our model is significantly lower than our training result; this may indicate overfitting. 
 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## 8.3.2 Fitting Regression Trees" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "\"\"\" \n", 141 | "Another attractive feature of a tree is the ability to solve both classification and regression problems.\n", 142 | "Here we fit a regression tree to the Boston data set. First, we create a training set, and fit the tree to the training data. \n", 143 | "Since scikit-learn does not support backward pruning, let us cap max_depth at 2.\n", 144 | "\"\"\"\n", 145 | "\n", 146 | "# as we move forward, it is good to keep the hyperparameters together for future iterations.\n", 147 | "boston = pd.read_csv('./data/Boston.csv')\n", 148 | "X = boston.drop('medv', axis=1)\n", 149 | "y = boston.medv\n", 150 | "train_size = 0.5 # we used a specific train size before; we can also use a percentage. \n", 151 | "random_state = 0 \n", 152 | "max_depth = 2\n", 153 | "\n", 154 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=random_state)\n", 155 | "regr_tree = DecisionTreeRegressor(max_depth=max_depth)\n", 156 | "regr_tree.fit(X_train, y_train)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "plt.figure(figsize=(40,20)) # customize according to the size of your tree\n", 166 | "plot_tree(regr_tree, feature_names = X_train.columns)\n", 167 | "plt.show()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "y_pred = regr_tree.predict(X_test)\n", 177 | "print(mean_squared_error(y_test, y_pred))\n", 178 | "\"\"\" \n", 179 | "We could look more into the train and test MSE to see whether the current model is overfitting or underfitting, as in the next cell.\n", 180 | "\"\"\"" 181 | ] 182 | },
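{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# as suggested above, a quick check of my own (not from the book's lab): compare the train and test MSE.\n", "# a train MSE that is much lower than the test MSE would point to overfitting.\n", "print('train MSE:', mean_squared_error(y_train, regr_tree.predict(X_train)))\n", "print('test MSE:', mean_squared_error(y_test, regr_tree.predict(X_test)))" ] },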
 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## 8.3.3 Bagging and Random Forests" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "\"\"\"\n", 197 | "Here we apply bagging and random forests (RF) to the Boston data.\n", 198 | "RF is an ensemble method, which means it combines the results of multiple decision trees. \n", 199 | "As a result, RF could help to reduce the variance of the model. \n", 200 | "Similar to decision trees, RF can be used to solve both classification and regression problems.\n", 201 | "\n", 202 | "\n", 203 | "In this exercise, we will use scikit-learn's RandomForestRegressor (the Python counterpart of R's randomForest package). \n", 204 | "The exact results obtained in this section may depend on the version of Python and the version of scikit-learn installed on your computer. \n", 205 | "\"\"\"" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# we reused the previous train and test sets.\n", 215 | "all_features = X_train.shape[1]\n", 216 | "regr_bagging = RandomForestRegressor(max_features=all_features, random_state=1)\n", 217 | "regr_bagging.fit(X_train, y_train)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "y_pred = regr_bagging.predict(X_test)\n", 227 | "print(mean_squared_error(y_test, y_pred))\n", 228 | "\n", 229 | "\"\"\"\n", 230 | "We can compare the test MSE of the bagging model with the test MSE of the single regression tree above.\n", 231 | "Normally, the bagging model is better than the single tree model; the next cell shows a related out-of-bag check.\n", 232 | "\"\"\"" 233 | ] 234 | },
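{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "A small addition of mine (not in the book's lab): because each bagged tree is fit on a bootstrap \n", "sample, the held-out (out-of-bag, OOB) observations give a free estimate of test performance \n", "without touching the test set. For a regressor, oob_score_ is an R^2 value.\n", "\"\"\"\n", "regr_oob = RandomForestRegressor(max_features=all_features, n_estimators=500, oob_score=True, random_state=1)\n", "regr_oob.fit(X_train, y_train)\n", "print('OOB R^2:', regr_oob.oob_score_)" ] },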
 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "\"\"\"\n", 242 | "We can grow a random forest in exactly the same way, except that we'll use a smaller value of the max_features argument. \n", 243 | "Theoretically, randomly selecting a subset of features reduces the correlation between the trees and thus can reduce the variance of the model.\n", 244 | "\"\"\"\n", 245 | "# here we'll use max_features = 3 (close to square root of all features as a rule of thumb)\n", 246 | "regr_rf = RandomForestRegressor(max_features=3, random_state=1)\n", 247 | "regr_rf.fit(X_train, y_train)\n", 248 | "\n", 249 | "y_pred = regr_rf.predict(X_test)\n", 250 | "print(mean_squared_error(y_test, y_pred))\n", 251 | "\n", 252 | "\"\"\" \n", 253 | "The test set MSE is even lower; this indicates that random forests yielded an improvement over bagging in this case.\n", 254 | "\"\"\"" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# RF provides multiple ways to interpret the model. One way is to see the importance of each feature.\n", 264 | "Importance = pd.DataFrame({'Importance':regr_rf.feature_importances_*100}, index=X_train.columns)\n", 265 | "Importance.sort_values(by='Importance', axis=0, ascending=True).plot(kind='barh', color='r', )\n", 266 | "plt.xlabel('Variable Importance')\n", 267 | "plt.gca().legend_ = None" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## 8.3.4 Boosting" 275 | ] 276 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "\"\"\"\n", 291 | "Boosting is another ensemble method. Gradient Boosting is a popular method, \n", 292 | "and other well-known methods such as AdaBoost, XGBoost, and LightGBM are built on the same idea.\n", 293 | "\n", 294 | "Here we use the GradientBoostingRegressor package. The argument n_estimators=500 indicates that we want 500 trees, \n", 295 | "and the option max_depth=4 limits the depth of each tree. See the manual for more details.\n", 296 | "\"\"\"\n", 297 | "\n", 298 | "regr_boost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.02, max_depth=4, random_state=1)\n", 299 | "regr_boost.fit(X_train, y_train)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "# let us check the feature importance and MSE.\n", 309 | "feature_importance = regr_boost.feature_importances_*100\n", 310 | "rel_imp = pd.Series(feature_importance, index=X_train.columns).sort_values(inplace=False)\n", 311 | "rel_imp.T.plot(kind='barh', color='r', )\n", 312 | "plt.xlabel('Variable Importance')\n", 313 | "plt.gca().legend_ = None" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "y_pred = regr_boost.predict(X_test)\n", 323 | "print(mean_squared_error(y_test,y_pred))" 324 | ] 325 | },
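{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Another check of my own (not in the book's lab): staged_predict() yields the prediction after \n", "each boosting iteration, so we can watch the test MSE as trees are added and spot overfitting.\n", "\"\"\"\n", "test_mse = [mean_squared_error(y_test, y_hat) for y_hat in regr_boost.staged_predict(X_test)]\n", "plt.plot(np.arange(1, len(test_mse) + 1), test_mse)\n", "plt.xlabel('Number of trees')\n", "plt.ylabel('Test MSE')\n", "plt.show()" ] },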
 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "## 8.3.5 Bayesian Additive Regression Trees\n", 331 | "As of now (2021), I was not able to find a good package for BART in Python. Please reach out if you have a package that works.\n", 332 | "**[To do: find a package for BART in Python.]**" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "\"\"\"\n", 342 | "Here we discuss Bayesian additive regression trees (BART), another ensemble method that uses decision trees as its building blocks.\n", 343 | "BART is related to both random forests and boosting: each tree is constructed in a random manner as in bagging and random forests, \n", 344 | "and each tree tries to capture signal not yet accounted for by the current model, as in boosting. \n", 345 | "The main novelty in BART is the way in which new trees are generated.\n", 346 | "\"\"\"" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "# End of Chapter 8" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "interpreter": { 361 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 362 | }, 363 | "kernelspec": { 364 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 365 | "name": "python3" 366 | }, 367 | "language_info": { 368 | "codemirror_mode": { 369 | "name": "ipython", 370 | "version": 3 371 | }, 372 | "file_extension": ".py", 373 | "mimetype": "text/x-python", 374 | "name": "python", 375 | "nbconvert_exporter": "python", 376 | "pygments_lexer": "ipython3", 377 | "version": "3.6.2" 378 | } 379 | }, 380 | "nbformat": 4, 381 | "nbformat_minor": 2 382 | } 383 | -------------------------------------------------------------------------------- /Chapter_9_sec_9.6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 9.6 Lab: Support Vector Machines" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "\n", 21 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 22 | "from sklearn.svm import SVC, LinearSVC\n", 23 | "from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report\n", 24 | "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", 25 | "\n", 26 | "import json\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# support function to plot the decision boundary of svc and highlight the support vectors\n", 37 | "def plot_decision_boundary(svc, X, y, h=0.021, pad=0.21):\n", 38 | "    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad\n", 39 | "    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad\n", 40 | "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", 41 | "    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])\n", 42 | "    Z = Z.reshape(xx.shape)\n", 43 | "    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.2)\n", 44 | "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)\n", 45 | "\n", 46 | "    # highlight the support vectors\n", 47 | "    sv = svc.support_vectors_\n", 48 | "    plt.scatter(sv[:,0], sv[:,1], c='k', marker='*', s=21, linewidths=1)\n", 49 | "    plt.xlim(x_min, x_max)\n", 50 | "    plt.ylim(y_min, y_max)\n", 51 | "    plt.xlabel('X1')\n", 52 | "    plt.ylabel('X2')\n", 53 | "    plt.show()\n", 54 | "    print('Number of support vectors: ', svc.support_.size)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## 9.6.1 Support Vector Classifier" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# we start by generating a random dataset: following the book, we generate a dataset with 20 observations,\n", 71 | "# 2 features. 
And we divide these into two classes.\n", 72 | "# set seed \n", 73 | "np.random.seed(21)\n", 74 | "X = np.random.randn(20, 2)\n", 75 | "y = np.repeat([-1,1], 10)\n", 76 | "X[y==1] = X[y==1] + 1\n", 77 | "\n", 78 | "plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)\n", 79 | "plt.xlabel('X1')\n", 80 | "plt.ylabel('X2')\n", 81 | "plt.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Support Vector Classifier (i.e. support vector machine with linear kernel)\n", 91 | "svc1 = SVC(C=10, kernel='linear')\n", 92 | "svc1.fit(X, y)\n", 93 | "\n", 94 | "plot_decision_boundary(svc1, X, y)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# as mentioned before, we could use dir() to see the methods of the class\n", 104 | "# I did not find a good way to print out the summary of the SVC model; the next cell shows a partial workaround for the linear kernel.\n", 105 | "dir(svc1)" 106 | ] 107 | },
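{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a partial workaround of my own: with kernel='linear' the fitted hyperplane can be inspected \n", "# directly (the coef_ and intercept_ attributes only exist for the linear kernel).\n", "print('coefficients:', svc1.coef_)\n", "print('intercept:', svc1.intercept_)" ] },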
 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# we could take a look at the default parameters of the SVC model\n", 115 | "svc1.get_params()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# we now use a small cost (C = 0.1). When a smaller value of the cost parameter is used, \n", 125 | "# we obtain a larger number of support vectors, because the margin is now wider. \n", 126 | "svc2 = SVC(C=0.1, kernel='linear')\n", 127 | "svc2.fit(X, y)\n", 128 | "\n", 129 | "plot_decision_boundary(svc2, X, y)\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# we could also try to tune the cost parameter (C) of the SVC model using GridSearchCV\n", 139 | "# in this function, we need to specify cross validation folds and the metric to use for evaluation\n", 140 | "tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 5, 10, 100]}]\n", 141 | "clf = GridSearchCV(SVC(kernel='linear'), tuned_parameters, cv=10, scoring='accuracy', return_train_score=True)\n", 142 | "clf.fit(X, y)\n", 143 | "clf.cv_results_" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# let us see the best parameters. \n", 153 | "# This is different from the results in the book; it is very likely due to the random generation of the data\n", 154 | "clf.best_params_" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# we use the same generation process to generate test data\n", 164 | "X_test = np.random.randn(20, 2)\n", 165 | "y_test = np.repeat([-1,1], 10)\n", 166 | "X_test[y_test==1] = X_test[y_test==1] + 1\n", 167 | "\n", 168 | "plt.scatter(X_test[:,0], X_test[:,1], s=70, c=y_test, cmap=plt.cm.Paired)\n", 169 | "plt.xlabel('X1')\n", 170 | "plt.ylabel('X2')\n", 171 | "plt.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# train a model with the optimal parameters found above\n", 181 | "svc3 = SVC(C=1, kernel='linear')\n", 182 | "svc3.fit(X, y)\n", 183 | "\n", 184 | "y_pred = svc3.predict(X_test)\n", 185 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svc3.classes_, columns=svc3.classes_)" 186 | ] 187 | },
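{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# in addition to the confusion matrix, we can report the test accuracy \n", "# (accuracy_score was imported above; this cell is my own addition).\n", "print('test accuracy', accuracy_score(y_test, y_pred))" ] },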
 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "# now we make our data linearly separable. In the book, they add another 0.5 to separate the data. \n", 195 | "# here we start from the data generation process to avoid confusion.\n", 196 | "np.random.seed(21)\n", 197 | "X = np.random.randn(20, 2)\n", 198 | "y = np.repeat([-1,1], 10)\n", 199 | "X[y==1] = X[y==1] + 2.5\n", 200 | "\n", 201 | "plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)\n", 202 | "plt.xlabel('X1')\n", 203 | "plt.ylabel('X2')\n", 204 | "plt.show()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "X_test = np.random.randn(20, 2)\n", 214 | "y_test = np.repeat([-1,1], 10)\n", 215 | "X_test[y_test==1] = X_test[y_test==1] + 2.5\n", 216 | "\n", 217 | "plt.scatter(X_test[:,0], X_test[:,1], s=70, c=y_test, cmap=plt.cm.Paired)\n", 218 | "plt.xlabel('X1')\n", 219 | "plt.ylabel('X2')\n", 220 | "plt.show()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# here the data seems linearly separable. We could use a bigger cost parameter (C = 100) to train the model.\n", 230 | "svc4 = SVC(C=100, kernel='linear')\n", 231 | "svc4.fit(X, y)\n", 232 | "\n", 233 | "plot_decision_boundary(svc4, X, y)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "y_pred = svc4.predict(X_test)\n", 243 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svc4.classes_, columns=svc4.classes_)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## 9.6.2 Support Vector Machine" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "# generating random dataset\n", 260 | "np.random.seed(21)\n", 261 | "X = np.random.randn(200,2)\n", 262 | "X[:100] = X[:100] + 2\n", 263 | "X[100:150] = X[100:150] - 2\n", 264 | "y = np.concatenate([np.repeat(-1, 150), np.repeat(1,50)])\n", 265 | "\n", 266 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)\n", 267 | "\n", 268 | "plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)\n", 269 | "plt.xlabel('X1')\n", 270 | "plt.ylabel('X2')\n", 271 | "plt.show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# in python, we can use the same SVC model and the kernel argument to specify the kernel\n", 281 | "# for the rbf kernel, we need to specify the gamma parameter\n", 282 | "svm = SVC(C=1.0, kernel='rbf', gamma=1)\n", 283 | "svm.fit(X_train, y_train)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "plot_decision_boundary(svm, X_train, y_train)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "y_pred = svm.predict(X_test)\n", 302 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svm.classes_, columns=svm.classes_)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# increasing the C parameter gives the model more flexibility\n", 312 | "svm2 = SVC(C=100, kernel='rbf', gamma=1.0)\n", 313 | "svm2.fit(X_train, y_train)\n", 314 | "plot_decision_boundary(svm2, X_train, y_train)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "\"\"\"\n", 324 | "The decision boundary above seems to overfit. We can compute the test accuracy of the model to\n", 325 | "see whether that is the case. 
\n", 326 | "\n", 327 | "The model (C = 1) yields a test accuracy of 0.85; the model (C = 100) yields a test accuracy of 0.77.\n", 328 | "\"\"\"\n", 329 | "y_pred = svm2.predict(X_test)\n", 330 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svm2.classes_, columns=svm2.classes_)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# set the parameters by cross-validation\n", 340 | "tuned_parameters = [{'C': [0.01, 0.1, 1, 10, 100],\n", 341 | "                     'gamma': [0.5, 1, 2, 3, 4]}]\n", 342 | "clf = GridSearchCV(SVC(kernel='rbf'), tuned_parameters, cv=10, scoring='accuracy', return_train_score=True)\n", 343 | "clf.fit(X_train, y_train)\n", 344 | "clf.cv_results_" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# let us see the best parameters.\n", 354 | "clf.best_params_" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "# confusion matrix for the best model\n", 364 | "confusion_matrix(y_test, clf.best_estimator_.predict(X_test))" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "# calculate the test accuracy\n", 374 | "clf.best_estimator_.score(X_test, y_test)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## 9.6.3 ROC Curves" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "svm3 = SVC(C=1, kernel='rbf', gamma=2)\n", 391 | "svm3.fit(X_train, y_train)\n", 392 | "\n", 393 | "# we train another, more flexible model\n", 394 | "svm4 = SVC(C=1, kernel='rbf', gamma=50)\n", 395 | "svm4.fit(X_train, y_train)\n", 396 | "\n", 397 | "y_train_score3 = svm3.decision_function(X_train)\n", 398 | "y_train_score4 = svm4.decision_function(X_train)\n", 399 | "\n", 400 | "false_pos_rate3, true_pos_rate3, _ = roc_curve(y_train, y_train_score3)\n", 401 | "roc_auc3 = auc(false_pos_rate3, true_pos_rate3)\n", 402 | "\n", 403 | "false_pos_rate4, true_pos_rate4, _ = roc_curve(y_train, y_train_score4)\n", 404 | "roc_auc4 = auc(false_pos_rate4, true_pos_rate4)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(14,6))\n", 414 | "ax1.plot(false_pos_rate3, true_pos_rate3, label='SVM $\gamma = 2$ ROC curve (area = %0.2f)' % roc_auc3, color='b')\n", 415 | "ax1.plot(false_pos_rate4, true_pos_rate4, label='SVM $\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')\n", 416 | "ax1.set_title('Training Data')\n", 417 | "\n", 418 | "y_test_score3 = svm3.decision_function(X_test)\n", 419 | "y_test_score4 = svm4.decision_function(X_test)\n", 420 | "\n", 421 | "false_pos_rate3, true_pos_rate3, _ = roc_curve(y_test, y_test_score3)\n", 422 | "roc_auc3 = auc(false_pos_rate3, true_pos_rate3)\n", 423 | "\n", 424 | "false_pos_rate4, true_pos_rate4, _ = roc_curve(y_test, y_test_score4)\n", 425 | "roc_auc4 = auc(false_pos_rate4, true_pos_rate4)\n", 426 | "\n", 427 | "ax2.plot(false_pos_rate3, true_pos_rate3, label='SVM $\gamma = 2$ ROC curve (area = %0.2f)' % roc_auc3, color='b')\n", 428 | "ax2.plot(false_pos_rate4, true_pos_rate4, label='SVM 
$\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')\n", 429 | "ax2.set_title('Test Data')\n", 430 | "\n", 431 | "for ax in fig.axes:\n", 432 | "    ax.plot([0, 1], [0, 1], 'k--')\n", 433 | "    ax.set_xlim([-0.05, 1.0])\n", 434 | "    ax.set_ylim([0.0, 1.05])\n", 435 | "    ax.set_xlabel('False Positive Rate')\n", 436 | "    ax.set_ylabel('True Positive Rate')\n", 437 | "    ax.legend(loc=\"lower right\")\n", 438 | "\n", 439 | "\"\"\" \n", 440 | "From the plots below, we can see that the model with gamma = 50 is overfitting the training data \n", 441 | "(i.e. the training metric is much better than the test metric).\n", 442 | "\"\"\"" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "## 9.6.4 SVM with Multiple Classes" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "# generate the previously used random dataset\n", 459 | "np.random.seed(21)\n", 460 | "X = np.random.randn(200,2)\n", 461 | "X[:100] = X[:100] + 2\n", 462 | "X[100:150] = X[100:150] - 2\n", 463 | "y = np.concatenate([np.repeat(-1, 150), np.repeat(1,50)])\n", 464 | "\n", 465 | "# adding another class to the dataset, I used a different offset to separate the classes better\n", 466 | "XX = np.vstack([X, np.random.randn(50,2)])\n", 467 | "yy = np.hstack([y, np.repeat(0,50)])\n", 468 | "XX[yy==0, 1] = XX[yy==0, 1] + 6\n", 469 | "\n", 470 | "plt.scatter(XX[:,0], XX[:,1], s=70, c=yy, cmap=plt.cm.prism)\n", 471 | "plt.xlabel('XX1')\n", 472 | "plt.ylabel('XX2')" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "# fit the svm model \n", 482 | "svm5 = SVC(C=10, kernel='rbf', gamma=1)\n", 483 | "svm5.fit(XX, yy)\n", 484 | "plot_decision_boundary(svm5, XX, yy)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "## 9.6.5 Application to Gene Expression Data" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "# I saved the gene expression data as a json file; in python we can load the json file using the json library\n", 501 | "# after reading it in, we can treat the data the same as a dictionary and use the keys to access it\n", 502 | "# import json\n", 503 | "f = open('./data/Khan.json')\n", 504 | "Khan = json.load(f)\n", 505 | "print(Khan.keys())" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "X_train = np.array(Khan['xtrain'])\n", 515 | "y_train = np.array(Khan['ytrain'])\n", 516 | "X_test = np.array(Khan['xtest'])\n", 517 | "y_test = np.array(Khan['ytest'])" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "# take a look at the data; we will notice there are 4 classes\n", 527 | "np.unique(y_train)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "svm6 = SVC(C=10, kernel='linear')\n", 537 | "svm6.fit(X_train, y_train)" 538 | ] 539 | },
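{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a quick sanity check of my own (not in the book's lab): the Khan data has far more features \n", "# (gene expression measurements) than observations, which foreshadows the result discussed below.\n", "print('train:', X_train.shape, 'test:', X_test.shape)" ] },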
 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "\"\"\" \n", 547 | "We see below that the model is perfect on training data. In fact, this is not surprising, \n", 548 | "because the large number of variables relative to the number of observations implies that \n", 549 | "it is easy to find hyperplanes that fully separate the classes. We are most interested not \n", 550 | "in the support vector classifier’s performance on the training observations, but rather its \n", 551 | "performance on the test observations.\n", 552 | "\"\"\"\n", 553 | "print('train accuracy', svm6.score(X_train, y_train))\n", 554 | "y_pred = svm6.predict(X_test)\n", 555 | "print('test accuracy', accuracy_score(y_test, y_pred))" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "# End of Chapter 9" 565 | ] 566 | } 567 | ], 568 | "metadata": { 569 | "interpreter": { 570 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 571 | }, 572 | "kernelspec": { 573 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 574 | "name": "python3" 575 | }, 576 | "language_info": { 577 | "codemirror_mode": { 578 | "name": "ipython", 579 | "version": 3 580 | }, 581 | "file_extension": ".py", 582 | "mimetype": "text/x-python", 583 | "name": "python", 584 | "nbconvert_exporter": "python", 585 | "pygments_lexer": "ipython3", 586 | "version": "3.6.2" 587 | } 588 | }, 589 | "nbformat": 4, 590 | "nbformat_minor": 2 591 | } 592 | -------------------------------------------------------------------------------- /ISLR_v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/ISLR_v1.pdf -------------------------------------------------------------------------------- /ISLR_v2_2021_Nov.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/ISLR_v2_2021_Nov.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ISL_python 2 | ### An Introduction to Statistical Learning with Applications in PYTHON 3 |  4 | I love the book *An Introduction to Statistical Learning with Applications in R* by Gareth James • Daniela Witten • Trevor Hastie and Robert Tibshirani. This book has been super helpful for me.  5 |  6 | In this repository, I have implemented the same/similar functionality in Python. The code is in a script format to show the thought process. Hope this could help the book reach a broader audience. *Don't let language barriers stop you from exploring something fun and useful.*  7 |  8 | Please refer to https://www.statlearning.com/ for more details. In the 2nd edition, the authors introduced an R library, ISLR2, with all the datasets used in the book. 
9 | 10 | ### Setup for this repository: 11 | * Python==3.6.2 12 | * ipykernal==4.10.0 13 | * numpy==1.19.2 14 | * matplotlib==3.3.4 15 | * pandas==1.1.5 16 | * statsmodels==0.12.2 17 | * scikit-learn==0.21.1 18 | * patsy==0.5.1 19 | * scipy==1.5.2 20 | * seaborn==0.11.2 21 | * json==2.0.9 22 | * tensorflow==2.0.0 23 | * keras==2.3.1 24 | * lifelines==0.26.3 25 | * math 26 | * random 27 | * collections 28 | * itertools 29 | 30 | 31 | ### Special thanks to Bommy 32 | 33 | __ 34 | (___()'`; 35 | /, /` 36 | \\"--\\ 37 | 38 | Reference: https://www.asciiart.eu/animals/dogs 39 | -------------------------------------------------------------------------------- /data/Auto.csv: -------------------------------------------------------------------------------- 1 | mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name 2 | 18,8,307,130,3504,12,70,1,chevrolet chevelle malibu 3 | 15,8,350,165,3693,11.5,70,1,buick skylark 320 4 | 18,8,318,150,3436,11,70,1,plymouth satellite 5 | 16,8,304,150,3433,12,70,1,amc rebel sst 6 | 17,8,302,140,3449,10.5,70,1,ford torino 7 | 15,8,429,198,4341,10,70,1,ford galaxie 500 8 | 14,8,454,220,4354,9,70,1,chevrolet impala 9 | 14,8,440,215,4312,8.5,70,1,plymouth fury iii 10 | 14,8,455,225,4425,10,70,1,pontiac catalina 11 | 15,8,390,190,3850,8.5,70,1,amc ambassador dpl 12 | 15,8,383,170,3563,10,70,1,dodge challenger se 13 | 14,8,340,160,3609,8,70,1,plymouth 'cuda 340 14 | 15,8,400,150,3761,9.5,70,1,chevrolet monte carlo 15 | 14,8,455,225,3086,10,70,1,buick estate wagon (sw) 16 | 24,4,113,95,2372,15,70,3,toyota corona mark ii 17 | 22,6,198,95,2833,15.5,70,1,plymouth duster 18 | 18,6,199,97,2774,15.5,70,1,amc hornet 19 | 21,6,200,85,2587,16,70,1,ford maverick 20 | 27,4,97,88,2130,14.5,70,3,datsun pl510 21 | 26,4,97,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan 22 | 25,4,110,87,2672,17.5,70,2,peugeot 504 23 | 24,4,107,90,2430,14.5,70,2,audi 100 ls 24 | 25,4,104,95,2375,17.5,70,2,saab 99e 25 | 26,4,121,113,2234,12.5,70,2,bmw 2002 26 | 21,6,199,90,2648,15,70,1,amc gremlin 27 | 10,8,360,215,4615,14,70,1,ford f250 28 | 10,8,307,200,4376,15,70,1,chevy c20 29 | 11,8,318,210,4382,13.5,70,1,dodge d200 30 | 9,8,304,193,4732,18.5,70,1,hi 1200d 31 | 27,4,97,88,2130,14.5,71,3,datsun pl510 32 | 28,4,140,90,2264,15.5,71,1,chevrolet vega 2300 33 | 25,4,113,95,2228,14,71,3,toyota corona 34 | 25,4,98,?,2046,19,71,1,ford pinto 35 | 19,6,232,100,2634,13,71,1,amc gremlin 36 | 16,6,225,105,3439,15.5,71,1,plymouth satellite custom 37 | 17,6,250,100,3329,15.5,71,1,chevrolet chevelle malibu 38 | 19,6,250,88,3302,15.5,71,1,ford torino 500 39 | 18,6,232,100,3288,15.5,71,1,amc matador 40 | 14,8,350,165,4209,12,71,1,chevrolet impala 41 | 14,8,400,175,4464,11.5,71,1,pontiac catalina brougham 42 | 14,8,351,153,4154,13.5,71,1,ford galaxie 500 43 | 14,8,318,150,4096,13,71,1,plymouth fury iii 44 | 12,8,383,180,4955,11.5,71,1,dodge monaco (sw) 45 | 13,8,400,170,4746,12,71,1,ford country squire (sw) 46 | 13,8,400,175,5140,12,71,1,pontiac safari (sw) 47 | 18,6,258,110,2962,13.5,71,1,amc hornet sportabout (sw) 48 | 22,4,140,72,2408,19,71,1,chevrolet vega (sw) 49 | 19,6,250,100,3282,15,71,1,pontiac firebird 50 | 18,6,250,88,3139,14.5,71,1,ford mustang 51 | 23,4,122,86,2220,14,71,1,mercury capri 2000 52 | 28,4,116,90,2123,14,71,2,opel 1900 53 | 30,4,79,70,2074,19.5,71,2,peugeot 304 54 | 30,4,88,76,2065,14.5,71,2,fiat 124b 55 | 31,4,71,65,1773,19,71,3,toyota corolla 1200 56 | 35,4,72,69,1613,18,71,3,datsun 1200 57 | 27,4,97,60,1834,19,71,2,volkswagen model 111 58 | 26,4,91,70,1955,20.5,71,1,plymouth 
cricket 59 | 24,4,113,95,2278,15.5,72,3,toyota corona hardtop 60 | 25,4,97.5,80,2126,17,72,1,dodge colt hardtop 61 | 23,4,97,54,2254,23.5,72,2,volkswagen type 3 62 | 20,4,140,90,2408,19.5,72,1,chevrolet vega 63 | 21,4,122,86,2226,16.5,72,1,ford pinto runabout 64 | 13,8,350,165,4274,12,72,1,chevrolet impala 65 | 14,8,400,175,4385,12,72,1,pontiac catalina 66 | 15,8,318,150,4135,13.5,72,1,plymouth fury iii 67 | 14,8,351,153,4129,13,72,1,ford galaxie 500 68 | 17,8,304,150,3672,11.5,72,1,amc ambassador sst 69 | 11,8,429,208,4633,11,72,1,mercury marquis 70 | 13,8,350,155,4502,13.5,72,1,buick lesabre custom 71 | 12,8,350,160,4456,13.5,72,1,oldsmobile delta 88 royale 72 | 13,8,400,190,4422,12.5,72,1,chrysler newport royal 73 | 19,3,70,97,2330,13.5,72,3,mazda rx2 coupe 74 | 15,8,304,150,3892,12.5,72,1,amc matador (sw) 75 | 13,8,307,130,4098,14,72,1,chevrolet chevelle concours (sw) 76 | 13,8,302,140,4294,16,72,1,ford gran torino (sw) 77 | 14,8,318,150,4077,14,72,1,plymouth satellite custom (sw) 78 | 18,4,121,112,2933,14.5,72,2,volvo 145e (sw) 79 | 22,4,121,76,2511,18,72,2,volkswagen 411 (sw) 80 | 21,4,120,87,2979,19.5,72,2,peugeot 504 (sw) 81 | 26,4,96,69,2189,18,72,2,renault 12 (sw) 82 | 22,4,122,86,2395,16,72,1,ford pinto (sw) 83 | 28,4,97,92,2288,17,72,3,datsun 510 (sw) 84 | 23,4,120,97,2506,14.5,72,3,toyouta corona mark ii (sw) 85 | 28,4,98,80,2164,15,72,1,dodge colt (sw) 86 | 27,4,97,88,2100,16.5,72,3,toyota corolla 1600 (sw) 87 | 13,8,350,175,4100,13,73,1,buick century 350 88 | 14,8,304,150,3672,11.5,73,1,amc matador 89 | 13,8,350,145,3988,13,73,1,chevrolet malibu 90 | 14,8,302,137,4042,14.5,73,1,ford gran torino 91 | 15,8,318,150,3777,12.5,73,1,dodge coronet custom 92 | 12,8,429,198,4952,11.5,73,1,mercury marquis brougham 93 | 13,8,400,150,4464,12,73,1,chevrolet caprice classic 94 | 13,8,351,158,4363,13,73,1,ford ltd 95 | 14,8,318,150,4237,14.5,73,1,plymouth fury gran sedan 96 | 13,8,440,215,4735,11,73,1,chrysler new yorker brougham 97 | 12,8,455,225,4951,11,73,1,buick electra 225 custom 98 | 13,8,360,175,3821,11,73,1,amc ambassador brougham 99 | 18,6,225,105,3121,16.5,73,1,plymouth valiant 100 | 16,6,250,100,3278,18,73,1,chevrolet nova custom 101 | 18,6,232,100,2945,16,73,1,amc hornet 102 | 18,6,250,88,3021,16.5,73,1,ford maverick 103 | 23,6,198,95,2904,16,73,1,plymouth duster 104 | 26,4,97,46,1950,21,73,2,volkswagen super beetle 105 | 11,8,400,150,4997,14,73,1,chevrolet impala 106 | 12,8,400,167,4906,12.5,73,1,ford country 107 | 13,8,360,170,4654,13,73,1,plymouth custom suburb 108 | 12,8,350,180,4499,12.5,73,1,oldsmobile vista cruiser 109 | 18,6,232,100,2789,15,73,1,amc gremlin 110 | 20,4,97,88,2279,19,73,3,toyota carina 111 | 21,4,140,72,2401,19.5,73,1,chevrolet vega 112 | 22,4,108,94,2379,16.5,73,3,datsun 610 113 | 18,3,70,90,2124,13.5,73,3,maxda rx3 114 | 19,4,122,85,2310,18.5,73,1,ford pinto 115 | 21,6,155,107,2472,14,73,1,mercury capri v6 116 | 26,4,98,90,2265,15.5,73,2,fiat 124 sport coupe 117 | 15,8,350,145,4082,13,73,1,chevrolet monte carlo s 118 | 16,8,400,230,4278,9.5,73,1,pontiac grand prix 119 | 29,4,68,49,1867,19.5,73,2,fiat 128 120 | 24,4,116,75,2158,15.5,73,2,opel manta 121 | 20,4,114,91,2582,14,73,2,audi 100ls 122 | 19,4,121,112,2868,15.5,73,2,volvo 144ea 123 | 15,8,318,150,3399,11,73,1,dodge dart custom 124 | 24,4,121,110,2660,14,73,2,saab 99le 125 | 20,6,156,122,2807,13.5,73,3,toyota mark ii 126 | 11,8,350,180,3664,11,73,1,oldsmobile omega 127 | 20,6,198,95,3102,16.5,74,1,plymouth duster 128 | 21,6,200,?,2875,17,74,1,ford maverick 129 | 19,6,232,100,2901,16,74,1,amc hornet 
130 | 15,6,250,100,3336,17,74,1,chevrolet nova 131 | 31,4,79,67,1950,19,74,3,datsun b210 132 | 26,4,122,80,2451,16.5,74,1,ford pinto 133 | 32,4,71,65,1836,21,74,3,toyota corolla 1200 134 | 25,4,140,75,2542,17,74,1,chevrolet vega 135 | 16,6,250,100,3781,17,74,1,chevrolet chevelle malibu classic 136 | 16,6,258,110,3632,18,74,1,amc matador 137 | 18,6,225,105,3613,16.5,74,1,plymouth satellite sebring 138 | 16,8,302,140,4141,14,74,1,ford gran torino 139 | 13,8,350,150,4699,14.5,74,1,buick century luxus (sw) 140 | 14,8,318,150,4457,13.5,74,1,dodge coronet custom (sw) 141 | 14,8,302,140,4638,16,74,1,ford gran torino (sw) 142 | 14,8,304,150,4257,15.5,74,1,amc matador (sw) 143 | 29,4,98,83,2219,16.5,74,2,audi fox 144 | 26,4,79,67,1963,15.5,74,2,volkswagen dasher 145 | 26,4,97,78,2300,14.5,74,2,opel manta 146 | 31,4,76,52,1649,16.5,74,3,toyota corona 147 | 32,4,83,61,2003,19,74,3,datsun 710 148 | 28,4,90,75,2125,14.5,74,1,dodge colt 149 | 24,4,90,75,2108,15.5,74,2,fiat 128 150 | 26,4,116,75,2246,14,74,2,fiat 124 tc 151 | 24,4,120,97,2489,15,74,3,honda civic 152 | 26,4,108,93,2391,15.5,74,3,subaru 153 | 31,4,79,67,2000,16,74,2,fiat x1.9 154 | 19,6,225,95,3264,16,75,1,plymouth valiant custom 155 | 18,6,250,105,3459,16,75,1,chevrolet nova 156 | 15,6,250,72,3432,21,75,1,mercury monarch 157 | 15,6,250,72,3158,19.5,75,1,ford maverick 158 | 16,8,400,170,4668,11.5,75,1,pontiac catalina 159 | 15,8,350,145,4440,14,75,1,chevrolet bel air 160 | 16,8,318,150,4498,14.5,75,1,plymouth grand fury 161 | 14,8,351,148,4657,13.5,75,1,ford ltd 162 | 17,6,231,110,3907,21,75,1,buick century 163 | 16,6,250,105,3897,18.5,75,1,chevroelt chevelle malibu 164 | 15,6,258,110,3730,19,75,1,amc matador 165 | 18,6,225,95,3785,19,75,1,plymouth fury 166 | 21,6,231,110,3039,15,75,1,buick skyhawk 167 | 20,8,262,110,3221,13.5,75,1,chevrolet monza 2+2 168 | 13,8,302,129,3169,12,75,1,ford mustang ii 169 | 29,4,97,75,2171,16,75,3,toyota corolla 170 | 23,4,140,83,2639,17,75,1,ford pinto 171 | 20,6,232,100,2914,16,75,1,amc gremlin 172 | 23,4,140,78,2592,18.5,75,1,pontiac astro 173 | 24,4,134,96,2702,13.5,75,3,toyota corona 174 | 25,4,90,71,2223,16.5,75,2,volkswagen dasher 175 | 24,4,119,97,2545,17,75,3,datsun 710 176 | 18,6,171,97,2984,14.5,75,1,ford pinto 177 | 29,4,90,70,1937,14,75,2,volkswagen rabbit 178 | 19,6,232,90,3211,17,75,1,amc pacer 179 | 23,4,115,95,2694,15,75,2,audi 100ls 180 | 23,4,120,88,2957,17,75,2,peugeot 504 181 | 22,4,121,98,2945,14.5,75,2,volvo 244dl 182 | 25,4,121,115,2671,13.5,75,2,saab 99le 183 | 33,4,91,53,1795,17.5,75,3,honda civic cvcc 184 | 28,4,107,86,2464,15.5,76,2,fiat 131 185 | 25,4,116,81,2220,16.9,76,2,opel 1900 186 | 25,4,140,92,2572,14.9,76,1,capri ii 187 | 26,4,98,79,2255,17.7,76,1,dodge colt 188 | 27,4,101,83,2202,15.3,76,2,renault 12tl 189 | 17.5,8,305,140,4215,13,76,1,chevrolet chevelle malibu classic 190 | 16,8,318,150,4190,13,76,1,dodge coronet brougham 191 | 15.5,8,304,120,3962,13.9,76,1,amc matador 192 | 14.5,8,351,152,4215,12.8,76,1,ford gran torino 193 | 22,6,225,100,3233,15.4,76,1,plymouth valiant 194 | 22,6,250,105,3353,14.5,76,1,chevrolet nova 195 | 24,6,200,81,3012,17.6,76,1,ford maverick 196 | 22.5,6,232,90,3085,17.6,76,1,amc hornet 197 | 29,4,85,52,2035,22.2,76,1,chevrolet chevette 198 | 24.5,4,98,60,2164,22.1,76,1,chevrolet woody 199 | 29,4,90,70,1937,14.2,76,2,vw rabbit 200 | 33,4,91,53,1795,17.4,76,3,honda civic 201 | 20,6,225,100,3651,17.7,76,1,dodge aspen se 202 | 18,6,250,78,3574,21,76,1,ford granada ghia 203 | 18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj 204 | 
17.5,6,258,95,3193,17.8,76,1,amc pacer d/l 205 | 29.5,4,97,71,1825,12.2,76,2,volkswagen rabbit 206 | 32,4,85,70,1990,17,76,3,datsun b-210 207 | 28,4,97,75,2155,16.4,76,3,toyota corolla 208 | 26.5,4,140,72,2565,13.6,76,1,ford pinto 209 | 20,4,130,102,3150,15.7,76,2,volvo 245 210 | 13,8,318,150,3940,13.2,76,1,plymouth volare premier v8 211 | 19,4,120,88,3270,21.9,76,2,peugeot 504 212 | 19,6,156,108,2930,15.5,76,3,toyota mark ii 213 | 16.5,6,168,120,3820,16.7,76,2,mercedes-benz 280s 214 | 16.5,8,350,180,4380,12.1,76,1,cadillac seville 215 | 13,8,350,145,4055,12,76,1,chevy c10 216 | 13,8,302,130,3870,15,76,1,ford f108 217 | 13,8,318,150,3755,14,76,1,dodge d100 218 | 31.5,4,98,68,2045,18.5,77,3,honda accord cvcc 219 | 30,4,111,80,2155,14.8,77,1,buick opel isuzu deluxe 220 | 36,4,79,58,1825,18.6,77,2,renault 5 gtl 221 | 25.5,4,122,96,2300,15.5,77,1,plymouth arrow gs 222 | 33.5,4,85,70,1945,16.8,77,3,datsun f-10 hatchback 223 | 17.5,8,305,145,3880,12.5,77,1,chevrolet caprice classic 224 | 17,8,260,110,4060,19,77,1,oldsmobile cutlass supreme 225 | 15.5,8,318,145,4140,13.7,77,1,dodge monaco brougham 226 | 15,8,302,130,4295,14.9,77,1,mercury cougar brougham 227 | 17.5,6,250,110,3520,16.4,77,1,chevrolet concours 228 | 20.5,6,231,105,3425,16.9,77,1,buick skylark 229 | 19,6,225,100,3630,17.7,77,1,plymouth volare custom 230 | 18.5,6,250,98,3525,19,77,1,ford granada 231 | 16,8,400,180,4220,11.1,77,1,pontiac grand prix lj 232 | 15.5,8,350,170,4165,11.4,77,1,chevrolet monte carlo landau 233 | 15.5,8,400,190,4325,12.2,77,1,chrysler cordoba 234 | 16,8,351,149,4335,14.5,77,1,ford thunderbird 235 | 29,4,97,78,1940,14.5,77,2,volkswagen rabbit custom 236 | 24.5,4,151,88,2740,16,77,1,pontiac sunbird coupe 237 | 26,4,97,75,2265,18.2,77,3,toyota corolla liftback 238 | 25.5,4,140,89,2755,15.8,77,1,ford mustang ii 2+2 239 | 30.5,4,98,63,2051,17,77,1,chevrolet chevette 240 | 33.5,4,98,83,2075,15.9,77,1,dodge colt m/m 241 | 30,4,97,67,1985,16.4,77,3,subaru dl 242 | 30.5,4,97,78,2190,14.1,77,2,volkswagen dasher 243 | 22,6,146,97,2815,14.5,77,3,datsun 810 244 | 21.5,4,121,110,2600,12.8,77,2,bmw 320i 245 | 21.5,3,80,110,2720,13.5,77,3,mazda rx-4 246 | 43.1,4,90,48,1985,21.5,78,2,volkswagen rabbit custom diesel 247 | 36.1,4,98,66,1800,14.4,78,1,ford fiesta 248 | 32.8,4,78,52,1985,19.4,78,3,mazda glc deluxe 249 | 39.4,4,85,70,2070,18.6,78,3,datsun b210 gx 250 | 36.1,4,91,60,1800,16.4,78,3,honda civic cvcc 251 | 19.9,8,260,110,3365,15.5,78,1,oldsmobile cutlass salon brougham 252 | 19.4,8,318,140,3735,13.2,78,1,dodge diplomat 253 | 20.2,8,302,139,3570,12.8,78,1,mercury monarch ghia 254 | 19.2,6,231,105,3535,19.2,78,1,pontiac phoenix lj 255 | 20.5,6,200,95,3155,18.2,78,1,chevrolet malibu 256 | 20.2,6,200,85,2965,15.8,78,1,ford fairmont (auto) 257 | 25.1,4,140,88,2720,15.4,78,1,ford fairmont (man) 258 | 20.5,6,225,100,3430,17.2,78,1,plymouth volare 259 | 19.4,6,232,90,3210,17.2,78,1,amc concord 260 | 20.6,6,231,105,3380,15.8,78,1,buick century special 261 | 20.8,6,200,85,3070,16.7,78,1,mercury zephyr 262 | 18.6,6,225,110,3620,18.7,78,1,dodge aspen 263 | 18.1,6,258,120,3410,15.1,78,1,amc concord d/l 264 | 19.2,8,305,145,3425,13.2,78,1,chevrolet monte carlo landau 265 | 17.7,6,231,165,3445,13.4,78,1,buick regal sport coupe (turbo) 266 | 18.1,8,302,139,3205,11.2,78,1,ford futura 267 | 17.5,8,318,140,4080,13.7,78,1,dodge magnum xe 268 | 30,4,98,68,2155,16.5,78,1,chevrolet chevette 269 | 27.5,4,134,95,2560,14.2,78,3,toyota corona 270 | 27.2,4,119,97,2300,14.7,78,3,datsun 510 271 | 30.9,4,105,75,2230,14.5,78,1,dodge omni 272 | 
21.1,4,134,95,2515,14.8,78,3,toyota celica gt liftback 273 | 23.2,4,156,105,2745,16.7,78,1,plymouth sapporo 274 | 23.8,4,151,85,2855,17.6,78,1,oldsmobile starfire sx 275 | 23.9,4,119,97,2405,14.9,78,3,datsun 200-sx 276 | 20.3,5,131,103,2830,15.9,78,2,audi 5000 277 | 17,6,163,125,3140,13.6,78,2,volvo 264gl 278 | 21.6,4,121,115,2795,15.7,78,2,saab 99gle 279 | 16.2,6,163,133,3410,15.8,78,2,peugeot 604sl 280 | 31.5,4,89,71,1990,14.9,78,2,volkswagen scirocco 281 | 29.5,4,98,68,2135,16.6,78,3,honda accord lx 282 | 21.5,6,231,115,3245,15.4,79,1,pontiac lemans v6 283 | 19.8,6,200,85,2990,18.2,79,1,mercury zephyr 6 284 | 22.3,4,140,88,2890,17.3,79,1,ford fairmont 4 285 | 20.2,6,232,90,3265,18.2,79,1,amc concord dl 6 286 | 20.6,6,225,110,3360,16.6,79,1,dodge aspen 6 287 | 17,8,305,130,3840,15.4,79,1,chevrolet caprice classic 288 | 17.6,8,302,129,3725,13.4,79,1,ford ltd landau 289 | 16.5,8,351,138,3955,13.2,79,1,mercury grand marquis 290 | 18.2,8,318,135,3830,15.2,79,1,dodge st. regis 291 | 16.9,8,350,155,4360,14.9,79,1,buick estate wagon (sw) 292 | 15.5,8,351,142,4054,14.3,79,1,ford country squire (sw) 293 | 19.2,8,267,125,3605,15,79,1,chevrolet malibu classic (sw) 294 | 18.5,8,360,150,3940,13,79,1,chrysler lebaron town @ country (sw) 295 | 31.9,4,89,71,1925,14,79,2,vw rabbit custom 296 | 34.1,4,86,65,1975,15.2,79,3,maxda glc deluxe 297 | 35.7,4,98,80,1915,14.4,79,1,dodge colt hatchback custom 298 | 27.4,4,121,80,2670,15,79,1,amc spirit dl 299 | 25.4,5,183,77,3530,20.1,79,2,mercedes benz 300d 300 | 23,8,350,125,3900,17.4,79,1,cadillac eldorado 301 | 27.2,4,141,71,3190,24.8,79,2,peugeot 504 302 | 23.9,8,260,90,3420,22.2,79,1,oldsmobile cutlass salon brougham 303 | 34.2,4,105,70,2200,13.2,79,1,plymouth horizon 304 | 34.5,4,105,70,2150,14.9,79,1,plymouth horizon tc3 305 | 31.8,4,85,65,2020,19.2,79,3,datsun 210 306 | 37.3,4,91,69,2130,14.7,79,2,fiat strada custom 307 | 28.4,4,151,90,2670,16,79,1,buick skylark limited 308 | 28.8,6,173,115,2595,11.3,79,1,chevrolet citation 309 | 26.8,6,173,115,2700,12.9,79,1,oldsmobile omega brougham 310 | 33.5,4,151,90,2556,13.2,79,1,pontiac phoenix 311 | 41.5,4,98,76,2144,14.7,80,2,vw rabbit 312 | 38.1,4,89,60,1968,18.8,80,3,toyota corolla tercel 313 | 32.1,4,98,70,2120,15.5,80,1,chevrolet chevette 314 | 37.2,4,86,65,2019,16.4,80,3,datsun 310 315 | 28,4,151,90,2678,16.5,80,1,chevrolet citation 316 | 26.4,4,140,88,2870,18.1,80,1,ford fairmont 317 | 24.3,4,151,90,3003,20.1,80,1,amc concord 318 | 19.1,6,225,90,3381,18.7,80,1,dodge aspen 319 | 34.3,4,97,78,2188,15.8,80,2,audi 4000 320 | 29.8,4,134,90,2711,15.5,80,3,toyota corona liftback 321 | 31.3,4,120,75,2542,17.5,80,3,mazda 626 322 | 37,4,119,92,2434,15,80,3,datsun 510 hatchback 323 | 32.2,4,108,75,2265,15.2,80,3,toyota corolla 324 | 46.6,4,86,65,2110,17.9,80,3,mazda glc 325 | 27.9,4,156,105,2800,14.4,80,1,dodge colt 326 | 40.8,4,85,65,2110,19.2,80,3,datsun 210 327 | 44.3,4,90,48,2085,21.7,80,2,vw rabbit c (diesel) 328 | 43.4,4,90,48,2335,23.7,80,2,vw dasher (diesel) 329 | 36.4,5,121,67,2950,19.9,80,2,audi 5000s (diesel) 330 | 30,4,146,67,3250,21.8,80,2,mercedes-benz 240d 331 | 44.6,4,91,67,1850,13.8,80,3,honda civic 1500 gl 332 | 40.9,4,85,?,1835,17.3,80,2,renault lecar deluxe 333 | 33.8,4,97,67,2145,18,80,3,subaru dl 334 | 29.8,4,89,62,1845,15.3,80,2,vokswagen rabbit 335 | 32.7,6,168,132,2910,11.4,80,3,datsun 280-zx 336 | 23.7,3,70,100,2420,12.5,80,3,mazda rx-7 gs 337 | 35,4,122,88,2500,15.1,80,2,triumph tr7 coupe 338 | 23.6,4,140,?,2905,14.3,80,1,ford mustang cobra 339 | 32.4,4,107,72,2290,17,80,3,honda accord 340 | 
27.2,4,135,84,2490,15.7,81,1,plymouth reliant 341 | 26.6,4,151,84,2635,16.4,81,1,buick skylark 342 | 25.8,4,156,92,2620,14.4,81,1,dodge aries wagon (sw) 343 | 23.5,6,173,110,2725,12.6,81,1,chevrolet citation 344 | 30,4,135,84,2385,12.9,81,1,plymouth reliant 345 | 39.1,4,79,58,1755,16.9,81,3,toyota starlet 346 | 39,4,86,64,1875,16.4,81,1,plymouth champ 347 | 35.1,4,81,60,1760,16.1,81,3,honda civic 1300 348 | 32.3,4,97,67,2065,17.8,81,3,subaru 349 | 37,4,85,65,1975,19.4,81,3,datsun 210 mpg 350 | 37.7,4,89,62,2050,17.3,81,3,toyota tercel 351 | 34.1,4,91,68,1985,16,81,3,mazda glc 4 352 | 34.7,4,105,63,2215,14.9,81,1,plymouth horizon 4 353 | 34.4,4,98,65,2045,16.2,81,1,ford escort 4w 354 | 29.9,4,98,65,2380,20.7,81,1,ford escort 2h 355 | 33,4,105,74,2190,14.2,81,2,volkswagen jetta 356 | 34.5,4,100,?,2320,15.8,81,2,renault 18i 357 | 33.7,4,107,75,2210,14.4,81,3,honda prelude 358 | 32.4,4,108,75,2350,16.8,81,3,toyota corolla 359 | 32.9,4,119,100,2615,14.8,81,3,datsun 200sx 360 | 31.6,4,120,74,2635,18.3,81,3,mazda 626 361 | 28.1,4,141,80,3230,20.4,81,2,peugeot 505s turbo diesel 362 | 30.7,6,145,76,3160,19.6,81,2,volvo diesel 363 | 25.4,6,168,116,2900,12.6,81,3,toyota cressida 364 | 24.2,6,146,120,2930,13.8,81,3,datsun 810 maxima 365 | 22.4,6,231,110,3415,15.8,81,1,buick century 366 | 26.6,8,350,105,3725,19,81,1,oldsmobile cutlass ls 367 | 20.2,6,200,88,3060,17.1,81,1,ford granada gl 368 | 17.6,6,225,85,3465,16.6,81,1,chrysler lebaron salon 369 | 28,4,112,88,2605,19.6,82,1,chevrolet cavalier 370 | 27,4,112,88,2640,18.6,82,1,chevrolet cavalier wagon 371 | 34,4,112,88,2395,18,82,1,chevrolet cavalier 2-door 372 | 31,4,112,85,2575,16.2,82,1,pontiac j2000 se hatchback 373 | 29,4,135,84,2525,16,82,1,dodge aries se 374 | 27,4,151,90,2735,18,82,1,pontiac phoenix 375 | 24,4,140,92,2865,16.4,82,1,ford fairmont futura 376 | 36,4,105,74,1980,15.3,82,2,volkswagen rabbit l 377 | 37,4,91,68,2025,18.2,82,3,mazda glc custom l 378 | 31,4,91,68,1970,17.6,82,3,mazda glc custom 379 | 38,4,105,63,2125,14.7,82,1,plymouth horizon miser 380 | 36,4,98,70,2125,17.3,82,1,mercury lynx l 381 | 36,4,120,88,2160,14.5,82,3,nissan stanza xe 382 | 36,4,107,75,2205,14.5,82,3,honda accord 383 | 34,4,108,70,2245,16.9,82,3,toyota corolla 384 | 38,4,91,67,1965,15,82,3,honda civic 385 | 32,4,91,67,1965,15.7,82,3,honda civic (auto) 386 | 38,4,91,67,1995,16.2,82,3,datsun 310 gx 387 | 25,6,181,110,2945,16.4,82,1,buick century limited 388 | 38,6,262,85,3015,17,82,1,oldsmobile cutlass ciera (diesel) 389 | 26,4,156,92,2585,14.5,82,1,chrysler lebaron medallion 390 | 22,6,232,112,2835,14.7,82,1,ford granada l 391 | 32,4,144,96,2665,13.9,82,3,toyota celica gt 392 | 36,4,135,84,2370,13,82,1,dodge charger 2.2 393 | 27,4,151,90,2950,17.3,82,1,chevrolet camaro 394 | 27,4,140,86,2790,15.6,82,1,ford mustang gl 395 | 44,4,97,52,2130,24.6,82,2,vw pickup 396 | 32,4,135,84,2295,11.6,82,1,dodge rampage 397 | 28,4,120,79,2625,18.6,82,1,ford ranger 398 | 31,4,119,82,2720,19.4,82,1,chevy s-10 399 | -------------------------------------------------------------------------------- /data/BrainCancer.csv: -------------------------------------------------------------------------------- 1 | "sex","diagnosis","loc","ki","gtv","stereo","status","time" 2 | "Female","Meningioma","Infratentorial",90,6.11,"SRS",0,57.64 3 | "Male","HG glioma","Supratentorial",90,19.35,"SRT",1,8.98 4 | "Female","Meningioma","Infratentorial",70,7.95,"SRS",0,26.46 5 | "Female","LG glioma","Supratentorial",80,7.61,"SRT",1,47.8 6 | "Male","HG glioma","Supratentorial",90,5.06,"SRT",1,6.3 7 | 
"Female","Meningioma","Supratentorial",80,4.82,"SRS",0,52.75 8 | "Male","Meningioma","Supratentorial",80,3.19,"SRT",0,55.8 9 | "Male","LG glioma","Supratentorial",80,12.37,"SRT",0,42.1 10 | "Female","Meningioma","Supratentorial",70,12.16,"SRT",0,34.66 11 | "Male","HG glioma","Supratentorial",100,2.53,"SRT",0,11.48 12 | "Male","LG glioma","Supratentorial",80,0.14,"SRT",1,35.93 13 | "Female","Meningioma","Infratentorial",90,6.54,"SRS",0,34.26 14 | "Female","Meningioma","Infratentorial",90,0.63,"SRS",0,32.98 15 | "Male",NA,"Supratentorial",90,6.38,"SRT",0,50.85 16 | "Female","Meningioma","Supratentorial",60,9.18,"SRT",0,41.44 17 | "Female","HG glioma","Supratentorial",70,11.38,"SRS",1,7.05 18 | "Female","Other","Infratentorial",60,24,"SRT",1,6.82 19 | "Male","HG glioma","Supratentorial",90,10.8,"SRT",0,82.56 20 | "Male","Meningioma","Supratentorial",80,13.49,"SRS",1,6.92 21 | "Female","Meningioma","Supratentorial",90,2.5,"SRT",0,30.16 22 | "Female","Meningioma","Supratentorial",80,2.82,"SRS",0,24.39 23 | "Male","HG glioma","Supratentorial",70,14.44,"SRT",1,14 24 | "Female","Other","Infratentorial",80,2.11,"SRS",0,10.49 25 | "Female","Meningioma","Infratentorial",100,2.13,"SRS",1,51.02 26 | "Female","Meningioma","Supratentorial",70,6.48,"SRT",1,33.41 27 | "Male","LG glioma","Supratentorial",90,4.23,"SRT",1,25.02 28 | "Male","Other","Supratentorial",60,34.64,"SRT",1,11.57 29 | "Male","HG glioma","Supratentorial",70,33.69,"SRT",1,0.07 30 | "Male","Meningioma","Supratentorial",60,3.81,"SRT",0,36.1 31 | "Female","Meningioma","Supratentorial",90,4.72,"SRS",0,65.02 32 | "Female","LG glioma","Supratentorial",80,0.85,"SRS",1,6.1 33 | "Male","Meningioma","Supratentorial",90,2.56,"SRS",0,44.39 34 | "Female","Other","Infratentorial",70,13.45,"SRT",1,10.82 35 | "Male","Other","Infratentorial",80,6.81,"SRS",0,57.11 36 | "Female","Meningioma","Supratentorial",90,7.3,"SRT",0,5.51 37 | "Female","Other","Supratentorial",70,14.26,"SRT",0,7.18 38 | "Female","Meningioma","Supratentorial",80,6.6,"SRT",0,14.75 39 | "Male","HG glioma","Supratentorial",90,9.95,"SRT",1,6.23 40 | "Male","Other","Infratentorial",80,12.51,"SRT",1,29.7 41 | "Female","Meningioma","Supratentorial",90,2.54,"SRT",0,45.74 42 | "Female","Meningioma","Supratentorial",80,1.57,"SRT",0,2.03 43 | "Male","HG glioma","Supratentorial",90,0.28,"SRT",1,16.43 44 | "Female","Meningioma","Supratentorial",70,6.7,"SRT",0,14.56 45 | "Male","Meningioma","Supratentorial",80,12.63,"SRT",1,4.16 46 | "Male","Other","Infratentorial",90,3.12,"SRT",0,18.95 47 | "Male","Meningioma","Supratentorial",60,7.09,"SRS",1,31.25 48 | "Male","HG glioma","Supratentorial",80,29.27,"SRT",0,5.15 49 | "Female","Meningioma","Supratentorial",80,26.31,"SRT",1,39.54 50 | "Male","Meningioma","Supratentorial",70,0.97,"SRT",1,1.41 51 | "Female","LG glioma","Supratentorial",80,0.19,"SRS",0,11.51 52 | "Female","HG glioma","Supratentorial",90,0.04,"SRT",0,31.67 53 | "Female","Meningioma","Infratentorial",90,9.24,"SRT",0,26.85 54 | "Male","HG glioma","Supratentorial",90,2.5,"SRT",1,9.77 55 | "Male","Meningioma","Infratentorial",80,24.41,"SRT",0,39.54 56 | "Female","HG glioma","Supratentorial",80,0.63,"SRT",1,16.92 57 | "Male","Other","Infratentorial",90,0.48,"SRS",0,54.43 58 | "Male","HG glioma","Infratentorial",80,0.22,"SRS",0,33.67 59 | "Male","HG glioma","Supratentorial",80,3.75,"SRT",1,19.9 60 | "Female","Other","Supratentorial",80,11.83,"SRT",1,22.03 61 | "Female","Meningioma","Supratentorial",90,2.47,"SRT",0,17.57 62 | "Female","HG glioma","Supratentorial",80,12.08,"SRT",1,7.25 63 | 
"Male","Meningioma","Supratentorial",80,11.51,"SRT",1,14.62 64 | "Female","HG glioma","Supratentorial",40,22.87,"SRT",1,3.38 65 | "Male","Meningioma","Supratentorial",80,4.77,"SRT",0,67.38 66 | "Male","LG glioma","Supratentorial",80,9.58,"SRT",0,78.75 67 | "Female","Meningioma","Supratentorial",100,4,"SRT",0,52.23 68 | "Female","HG glioma","Supratentorial",80,7.59,"SRT",1,4.56 69 | "Male","Other","Infratentorial",70,0.01,"SRS",0,23.67 70 | "Female","Meningioma","Supratentorial",80,6.93,"SRS",0,10.1 71 | "Female","Meningioma","Supratentorial",70,3.63,"SRT",0,32.82 72 | "Male","Meningioma","Supratentorial",70,8.45,"SRT",0,19.41 73 | "Male","Meningioma","Supratentorial",80,20.93,"SRT",1,31.15 74 | "Male","LG glioma","Supratentorial",90,2.64,"SRT",0,20.13 75 | "Female","HG glioma","Supratentorial",80,0.19,"SRT",1,11.02 76 | "Male","Other","Supratentorial",100,24.91,"SRT",0,19.74 77 | "Female","Meningioma","Supratentorial",80,31.74,"SRT",0,57.25 78 | "Female","Meningioma","Supratentorial",80,2.39,"SRS",0,73.74 79 | "Female","Meningioma","Supratentorial",90,7.26,"SRT",0,49.05 80 | "Female","Meningioma","Supratentorial",100,9.66,"SRT",0,39.25 81 | "Female","Meningioma","Infratentorial",70,2.94,"SRS",0,1.54 82 | "Female","HG glioma","Supratentorial",80,15.45,"SRT",1,46.16 83 | "Female","Other","Supratentorial",90,1.82,"SRT",0,47.11 84 | "Male","LG glioma","Infratentorial",90,30.41,"SRT",0,1.18 85 | "Male","HG glioma","Supratentorial",80,0.16,"SRT",1,20.69 86 | "Male","HG glioma","Supratentorial",80,19.81,"SRT",1,6.39 87 | "Male","Meningioma","Supratentorial",90,2.5,"SRT",0,32.82 88 | "Male","Meningioma","Supratentorial",90,2.02,"SRS",0,42.07 89 | "Male","Other","Infratentorial",80,0.11,"SRT",0,13.9 90 | -------------------------------------------------------------------------------- /data/Carseats.csv: -------------------------------------------------------------------------------- 1 | Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US 9.5,138,73,11,276,120,Bad,42,17,Yes,Yes 11.22,111,48,16,260,83,Good,65,10,Yes,Yes 10.06,113,35,10,269,80,Medium,59,12,Yes,Yes 7.4,117,100,4,466,97,Medium,55,14,Yes,Yes 4.15,141,64,3,340,128,Bad,38,13,Yes,No 10.81,124,113,13,501,72,Bad,78,16,No,Yes 6.63,115,105,0,45,108,Medium,71,15,Yes,No 11.85,136,81,15,425,120,Good,67,10,Yes,Yes 6.54,132,110,0,108,124,Medium,76,10,No,No 4.69,132,113,0,131,124,Medium,76,17,No,Yes 9.01,121,78,9,150,100,Bad,26,10,No,Yes 11.96,117,94,4,503,94,Good,50,13,Yes,Yes 3.98,122,35,2,393,136,Medium,62,18,Yes,No 10.96,115,28,11,29,86,Good,53,18,Yes,Yes 11.17,107,117,11,148,118,Good,52,18,Yes,Yes 8.71,149,95,5,400,144,Medium,76,18,No,No 7.58,118,32,0,284,110,Good,63,13,Yes,No 12.29,147,74,13,251,131,Good,52,10,Yes,Yes 13.91,110,110,0,408,68,Good,46,17,No,Yes 8.73,129,76,16,58,121,Medium,69,12,Yes,Yes 6.41,125,90,2,367,131,Medium,35,18,Yes,Yes 12.13,134,29,12,239,109,Good,62,18,No,Yes 5.08,128,46,6,497,138,Medium,42,13,Yes,No 5.87,121,31,0,292,109,Medium,79,10,Yes,No 10.14,145,119,16,294,113,Bad,42,12,Yes,Yes 14.9,139,32,0,176,82,Good,54,11,No,No 8.33,107,115,11,496,131,Good,50,11,No,Yes 5.27,98,118,0,19,107,Medium,64,17,Yes,No 2.99,103,74,0,359,97,Bad,55,11,Yes,Yes 7.81,104,99,15,226,102,Bad,58,17,Yes,Yes 13.55,125,94,0,447,89,Good,30,12,Yes,No 8.25,136,58,16,241,131,Medium,44,18,Yes,Yes 6.2,107,32,12,236,137,Good,64,10,No,Yes 8.77,114,38,13,317,128,Good,50,16,Yes,Yes 2.67,115,54,0,406,128,Medium,42,17,Yes,Yes 11.07,131,84,11,29,96,Medium,44,17,No,Yes 8.89,122,76,0,270,100,Good,60,18,No,No 
4.95,121,41,5,412,110,Medium,54,10,Yes,Yes 6.59,109,73,0,454,102,Medium,65,15,Yes,No 3.24,130,60,0,144,138,Bad,38,10,No,No 2.07,119,98,0,18,126,Bad,73,17,No,No 7.96,157,53,0,403,124,Bad,58,16,Yes,No 10.43,77,69,0,25,24,Medium,50,18,Yes,No 4.12,123,42,11,16,134,Medium,59,13,Yes,Yes 4.16,85,79,6,325,95,Medium,69,13,Yes,Yes 4.56,141,63,0,168,135,Bad,44,12,Yes,Yes 12.44,127,90,14,16,70,Medium,48,15,No,Yes 4.38,126,98,0,173,108,Bad,55,16,Yes,No 3.91,116,52,0,349,98,Bad,69,18,Yes,No 10.61,157,93,0,51,149,Good,32,17,Yes,No 1.42,99,32,18,341,108,Bad,80,16,Yes,Yes 4.42,121,90,0,150,108,Bad,75,16,Yes,No 7.91,153,40,3,112,129,Bad,39,18,Yes,Yes 6.92,109,64,13,39,119,Medium,61,17,Yes,Yes 4.9,134,103,13,25,144,Medium,76,17,No,Yes 6.85,143,81,5,60,154,Medium,61,18,Yes,Yes 11.91,133,82,0,54,84,Medium,50,17,Yes,No 0.91,93,91,0,22,117,Bad,75,11,Yes,No 5.42,103,93,15,188,103,Bad,74,16,Yes,Yes 5.21,118,71,4,148,114,Medium,80,13,Yes,No 8.32,122,102,19,469,123,Bad,29,13,Yes,Yes 7.32,105,32,0,358,107,Medium,26,13,No,No 1.82,139,45,0,146,133,Bad,77,17,Yes,Yes 8.47,119,88,10,170,101,Medium,61,13,Yes,Yes 7.8,100,67,12,184,104,Medium,32,16,No,Yes 4.9,122,26,0,197,128,Medium,55,13,No,No 8.85,127,92,0,508,91,Medium,56,18,Yes,No 9.01,126,61,14,152,115,Medium,47,16,Yes,Yes 13.39,149,69,20,366,134,Good,60,13,Yes,Yes 7.99,127,59,0,339,99,Medium,65,12,Yes,No 9.46,89,81,15,237,99,Good,74,12,Yes,Yes 6.5,148,51,16,148,150,Medium,58,17,No,Yes 5.52,115,45,0,432,116,Medium,25,15,Yes,No 12.61,118,90,10,54,104,Good,31,11,No,Yes 6.2,150,68,5,125,136,Medium,64,13,No,Yes 8.55,88,111,23,480,92,Bad,36,16,No,Yes 10.64,102,87,10,346,70,Medium,64,15,Yes,Yes 7.7,118,71,12,44,89,Medium,67,18,No,Yes 4.43,134,48,1,139,145,Medium,65,12,Yes,Yes 9.14,134,67,0,286,90,Bad,41,13,Yes,No 8.01,113,100,16,353,79,Bad,68,11,Yes,Yes 7.52,116,72,0,237,128,Good,70,13,Yes,No 11.62,151,83,4,325,139,Good,28,17,Yes,Yes 4.42,109,36,7,468,94,Bad,56,11,Yes,Yes 2.23,111,25,0,52,121,Bad,43,18,No,No 8.47,125,103,0,304,112,Medium,49,13,No,No 8.7,150,84,9,432,134,Medium,64,15,Yes,No 11.7,131,67,7,272,126,Good,54,16,No,Yes 6.56,117,42,7,144,111,Medium,62,10,Yes,Yes 7.95,128,66,3,493,119,Medium,45,16,No,No 5.33,115,22,0,491,103,Medium,64,11,No,No 4.81,97,46,11,267,107,Medium,80,15,Yes,Yes 4.53,114,113,0,97,125,Medium,29,12,Yes,No 8.86,145,30,0,67,104,Medium,55,17,Yes,No 8.39,115,97,5,134,84,Bad,55,11,Yes,Yes 5.58,134,25,10,237,148,Medium,59,13,Yes,Yes 9.48,147,42,10,407,132,Good,73,16,No,Yes 7.45,161,82,5,287,129,Bad,33,16,Yes,Yes 12.49,122,77,24,382,127,Good,36,16,No,Yes 4.88,121,47,3,220,107,Bad,56,16,No,Yes 4.11,113,69,11,94,106,Medium,76,12,No,Yes 6.2,128,93,0,89,118,Medium,34,18,Yes,No 5.3,113,22,0,57,97,Medium,65,16,No,No 5.07,123,91,0,334,96,Bad,78,17,Yes,Yes 4.62,121,96,0,472,138,Medium,51,12,Yes,No 5.55,104,100,8,398,97,Medium,61,11,Yes,Yes 0.16,102,33,0,217,139,Medium,70,18,No,No 8.55,134,107,0,104,108,Medium,60,12,Yes,No 3.47,107,79,2,488,103,Bad,65,16,Yes,No 8.98,115,65,0,217,90,Medium,60,17,No,No 9,128,62,7,125,116,Medium,43,14,Yes,Yes 6.62,132,118,12,272,151,Medium,43,14,Yes,Yes 6.67,116,99,5,298,125,Good,62,12,Yes,Yes 6.01,131,29,11,335,127,Bad,33,12,Yes,Yes 9.31,122,87,9,17,106,Medium,65,13,Yes,Yes 8.54,139,35,0,95,129,Medium,42,13,Yes,No 5.08,135,75,0,202,128,Medium,80,10,No,No 8.8,145,53,0,507,119,Medium,41,12,Yes,No 7.57,112,88,2,243,99,Medium,62,11,Yes,Yes 7.37,130,94,8,137,128,Medium,64,12,Yes,Yes 6.87,128,105,11,249,131,Medium,63,13,Yes,Yes 11.67,125,89,10,380,87,Bad,28,10,Yes,Yes 6.88,119,100,5,45,108,Medium,75,10,Yes,Yes 
8.19,127,103,0,125,155,Good,29,15,No,Yes 8.87,131,113,0,181,120,Good,63,14,Yes,No 9.34,89,78,0,181,49,Medium,43,15,No,No 11.27,153,68,2,60,133,Good,59,16,Yes,Yes 6.52,125,48,3,192,116,Medium,51,14,Yes,Yes 4.96,133,100,3,350,126,Bad,55,13,Yes,Yes 4.47,143,120,7,279,147,Bad,40,10,No,Yes 8.41,94,84,13,497,77,Medium,51,12,Yes,Yes 6.5,108,69,3,208,94,Medium,77,16,Yes,No 9.54,125,87,9,232,136,Good,72,10,Yes,Yes 7.62,132,98,2,265,97,Bad,62,12,Yes,Yes 3.67,132,31,0,327,131,Medium,76,16,Yes,No 6.44,96,94,14,384,120,Medium,36,18,No,Yes 5.17,131,75,0,10,120,Bad,31,18,No,No 6.52,128,42,0,436,118,Medium,80,11,Yes,No 10.27,125,103,12,371,109,Medium,44,10,Yes,Yes 12.3,146,62,10,310,94,Medium,30,13,No,Yes 6.03,133,60,10,277,129,Medium,45,18,Yes,Yes 6.53,140,42,0,331,131,Bad,28,15,Yes,No 7.44,124,84,0,300,104,Medium,77,15,Yes,No 0.53,122,88,7,36,159,Bad,28,17,Yes,Yes 9.09,132,68,0,264,123,Good,34,11,No,No 8.77,144,63,11,27,117,Medium,47,17,Yes,Yes 3.9,114,83,0,412,131,Bad,39,14,Yes,No 10.51,140,54,9,402,119,Good,41,16,No,Yes 7.56,110,119,0,384,97,Medium,72,14,No,Yes 11.48,121,120,13,140,87,Medium,56,11,Yes,Yes 10.49,122,84,8,176,114,Good,57,10,No,Yes 10.77,111,58,17,407,103,Good,75,17,No,Yes 7.64,128,78,0,341,128,Good,45,13,No,No 5.93,150,36,7,488,150,Medium,25,17,No,Yes 6.89,129,69,10,289,110,Medium,50,16,No,Yes 7.71,98,72,0,59,69,Medium,65,16,Yes,No 7.49,146,34,0,220,157,Good,51,16,Yes,No 10.21,121,58,8,249,90,Medium,48,13,No,Yes 12.53,142,90,1,189,112,Good,39,10,No,Yes 9.32,119,60,0,372,70,Bad,30,18,No,No 4.67,111,28,0,486,111,Medium,29,12,No,No 2.93,143,21,5,81,160,Medium,67,12,No,Yes 3.63,122,74,0,424,149,Medium,51,13,Yes,No 5.68,130,64,0,40,106,Bad,39,17,No,No 8.22,148,64,0,58,141,Medium,27,13,No,Yes 0.37,147,58,7,100,191,Bad,27,15,Yes,Yes 6.71,119,67,17,151,137,Medium,55,11,Yes,Yes 6.71,106,73,0,216,93,Medium,60,13,Yes,No 7.3,129,89,0,425,117,Medium,45,10,Yes,No 11.48,104,41,15,492,77,Good,73,18,Yes,Yes 8.01,128,39,12,356,118,Medium,71,10,Yes,Yes 12.49,93,106,12,416,55,Medium,75,15,Yes,Yes 9.03,104,102,13,123,110,Good,35,16,Yes,Yes 6.38,135,91,5,207,128,Medium,66,18,Yes,Yes 0,139,24,0,358,185,Medium,79,15,No,No 7.54,115,89,0,38,122,Medium,25,12,Yes,No 5.61,138,107,9,480,154,Medium,47,11,No,Yes 10.48,138,72,0,148,94,Medium,27,17,Yes,Yes 10.66,104,71,14,89,81,Medium,25,14,No,Yes 7.78,144,25,3,70,116,Medium,77,18,Yes,Yes 4.94,137,112,15,434,149,Bad,66,13,Yes,Yes 7.43,121,83,0,79,91,Medium,68,11,Yes,No 4.74,137,60,4,230,140,Bad,25,13,Yes,No 5.32,118,74,6,426,102,Medium,80,18,Yes,Yes 9.95,132,33,7,35,97,Medium,60,11,No,Yes 10.07,130,100,11,449,107,Medium,64,10,Yes,Yes 8.68,120,51,0,93,86,Medium,46,17,No,No 6.03,117,32,0,142,96,Bad,62,17,Yes,No 8.07,116,37,0,426,90,Medium,76,15,Yes,No 12.11,118,117,18,509,104,Medium,26,15,No,Yes 8.79,130,37,13,297,101,Medium,37,13,No,Yes 6.67,156,42,13,170,173,Good,74,14,Yes,Yes 7.56,108,26,0,408,93,Medium,56,14,No,No 13.28,139,70,7,71,96,Good,61,10,Yes,Yes 7.23,112,98,18,481,128,Medium,45,11,Yes,Yes 4.19,117,93,4,420,112,Bad,66,11,Yes,Yes 4.1,130,28,6,410,133,Bad,72,16,Yes,Yes 2.52,124,61,0,333,138,Medium,76,16,Yes,No 3.62,112,80,5,500,128,Medium,69,10,Yes,Yes 6.42,122,88,5,335,126,Medium,64,14,Yes,Yes 5.56,144,92,0,349,146,Medium,62,12,No,No 5.94,138,83,0,139,134,Medium,54,18,Yes,No 4.1,121,78,4,413,130,Bad,46,10,No,Yes 2.05,131,82,0,132,157,Bad,25,14,Yes,No 8.74,155,80,0,237,124,Medium,37,14,Yes,No 5.68,113,22,1,317,132,Medium,28,12,Yes,No 4.97,162,67,0,27,160,Medium,77,17,Yes,Yes 8.19,111,105,0,466,97,Bad,61,10,No,No 7.78,86,54,0,497,64,Bad,33,12,Yes,No 
3.02,98,21,11,326,90,Bad,76,11,No,Yes 4.36,125,41,2,357,123,Bad,47,14,No,Yes 9.39,117,118,14,445,120,Medium,32,15,Yes,Yes 12.04,145,69,19,501,105,Medium,45,11,Yes,Yes 8.23,149,84,5,220,139,Medium,33,10,Yes,Yes 4.83,115,115,3,48,107,Medium,73,18,Yes,Yes 2.34,116,83,15,170,144,Bad,71,11,Yes,Yes 5.73,141,33,0,243,144,Medium,34,17,Yes,No 4.34,106,44,0,481,111,Medium,70,14,No,No 9.7,138,61,12,156,120,Medium,25,14,Yes,Yes 10.62,116,79,19,359,116,Good,58,17,Yes,Yes 10.59,131,120,15,262,124,Medium,30,10,Yes,Yes 6.43,124,44,0,125,107,Medium,80,11,Yes,No 7.49,136,119,6,178,145,Medium,35,13,Yes,Yes 3.45,110,45,9,276,125,Medium,62,14,Yes,Yes 4.1,134,82,0,464,141,Medium,48,13,No,No 6.68,107,25,0,412,82,Bad,36,14,Yes,No 7.8,119,33,0,245,122,Good,56,14,Yes,No 8.69,113,64,10,68,101,Medium,57,16,Yes,Yes 5.4,149,73,13,381,163,Bad,26,11,No,Yes 11.19,98,104,0,404,72,Medium,27,18,No,No 5.16,115,60,0,119,114,Bad,38,14,No,No 8.09,132,69,0,123,122,Medium,27,11,No,No 13.14,137,80,10,24,105,Good,61,15,Yes,Yes 8.65,123,76,18,218,120,Medium,29,14,No,Yes 9.43,115,62,11,289,129,Good,56,16,No,Yes 5.53,126,32,8,95,132,Medium,50,17,Yes,Yes 9.32,141,34,16,361,108,Medium,69,10,Yes,Yes 9.62,151,28,8,499,135,Medium,48,10,Yes,Yes 7.36,121,24,0,200,133,Good,73,13,Yes,No 3.89,123,105,0,149,118,Bad,62,16,Yes,Yes 10.31,159,80,0,362,121,Medium,26,18,Yes,No 12.01,136,63,0,160,94,Medium,38,12,Yes,No 4.68,124,46,0,199,135,Medium,52,14,No,No 7.82,124,25,13,87,110,Medium,57,10,Yes,Yes 8.78,130,30,0,391,100,Medium,26,18,Yes,No 10,114,43,0,199,88,Good,57,10,No,Yes 6.9,120,56,20,266,90,Bad,78,18,Yes,Yes 5.04,123,114,0,298,151,Bad,34,16,Yes,No 5.36,111,52,0,12,101,Medium,61,11,Yes,Yes 5.05,125,67,0,86,117,Bad,65,11,Yes,No 9.16,137,105,10,435,156,Good,72,14,Yes,Yes 3.72,139,111,5,310,132,Bad,62,13,Yes,Yes 8.31,133,97,0,70,117,Medium,32,16,Yes,No 5.64,124,24,5,288,122,Medium,57,12,No,Yes 9.58,108,104,23,353,129,Good,37,17,Yes,Yes 7.71,123,81,8,198,81,Bad,80,15,Yes,Yes 4.2,147,40,0,277,144,Medium,73,10,Yes,No 8.67,125,62,14,477,112,Medium,80,13,Yes,Yes 3.47,108,38,0,251,81,Bad,72,14,No,No 5.12,123,36,10,467,100,Bad,74,11,No,Yes 7.67,129,117,8,400,101,Bad,36,10,Yes,Yes 5.71,121,42,4,188,118,Medium,54,15,Yes,Yes 6.37,120,77,15,86,132,Medium,48,18,Yes,Yes 7.77,116,26,6,434,115,Medium,25,17,Yes,Yes 6.95,128,29,5,324,159,Good,31,15,Yes,Yes 5.31,130,35,10,402,129,Bad,39,17,Yes,Yes 9.1,128,93,12,343,112,Good,73,17,No,Yes 5.83,134,82,7,473,112,Bad,51,12,No,Yes 6.53,123,57,0,66,105,Medium,39,11,Yes,No 5.01,159,69,0,438,166,Medium,46,17,Yes,No 11.99,119,26,0,284,89,Good,26,10,Yes,No 4.55,111,56,0,504,110,Medium,62,16,Yes,No 12.98,113,33,0,14,63,Good,38,12,Yes,No 10.04,116,106,8,244,86,Medium,58,12,Yes,Yes 7.22,135,93,2,67,119,Medium,34,11,Yes,Yes 6.67,107,119,11,210,132,Medium,53,11,Yes,Yes 6.93,135,69,14,296,130,Medium,73,15,Yes,Yes 7.8,136,48,12,326,125,Medium,36,16,Yes,Yes 7.22,114,113,2,129,151,Good,40,15,No,Yes 3.42,141,57,13,376,158,Medium,64,18,Yes,Yes 2.86,121,86,10,496,145,Bad,51,10,Yes,Yes 11.19,122,69,7,303,105,Good,45,16,No,Yes 7.74,150,96,0,80,154,Good,61,11,Yes,No 5.36,135,110,0,112,117,Medium,80,16,No,No 6.97,106,46,11,414,96,Bad,79,17,No,No 7.6,146,26,11,261,131,Medium,39,10,Yes,Yes 7.53,117,118,11,429,113,Medium,67,18,No,Yes 6.88,95,44,4,208,72,Bad,44,17,Yes,Yes 6.98,116,40,0,74,97,Medium,76,15,No,No 8.75,143,77,25,448,156,Medium,43,17,Yes,Yes 9.49,107,111,14,400,103,Medium,41,11,No,Yes 6.64,118,70,0,106,89,Bad,39,17,Yes,No 11.82,113,66,16,322,74,Good,76,15,Yes,Yes 11.28,123,84,0,74,89,Good,59,10,Yes,No 
12.66,148,76,3,126,99,Good,60,11,Yes,Yes 4.21,118,35,14,502,137,Medium,79,10,No,Yes 8.21,127,44,13,160,123,Good,63,18,Yes,Yes 3.07,118,83,13,276,104,Bad,75,10,Yes,Yes 10.98,148,63,0,312,130,Good,63,15,Yes,No 9.4,135,40,17,497,96,Medium,54,17,No,Yes 8.57,116,78,1,158,99,Medium,45,11,Yes,Yes 7.41,99,93,0,198,87,Medium,57,16,Yes,Yes 5.28,108,77,13,388,110,Bad,74,14,Yes,Yes 10.01,133,52,16,290,99,Medium,43,11,Yes,Yes 11.93,123,98,12,408,134,Good,29,10,Yes,Yes 8.03,115,29,26,394,132,Medium,33,13,Yes,Yes 4.78,131,32,1,85,133,Medium,48,12,Yes,Yes 5.9,138,92,0,13,120,Bad,61,12,Yes,No 9.24,126,80,19,436,126,Medium,52,10,Yes,Yes 11.18,131,111,13,33,80,Bad,68,18,Yes,Yes 9.53,175,65,29,419,166,Medium,53,12,Yes,Yes 6.15,146,68,12,328,132,Bad,51,14,Yes,Yes 6.8,137,117,5,337,135,Bad,38,10,Yes,Yes 9.33,103,81,3,491,54,Medium,66,13,Yes,No 7.72,133,33,10,333,129,Good,71,14,Yes,Yes 6.39,131,21,8,220,171,Good,29,14,Yes,Yes 15.63,122,36,5,369,72,Good,35,10,Yes,Yes 6.41,142,30,0,472,136,Good,80,15,No,No 10.08,116,72,10,456,130,Good,41,14,No,Yes 6.97,127,45,19,459,129,Medium,57,11,No,Yes 5.86,136,70,12,171,152,Medium,44,18,Yes,Yes 7.52,123,39,5,499,98,Medium,34,15,Yes,No 9.16,140,50,10,300,139,Good,60,15,Yes,Yes 10.36,107,105,18,428,103,Medium,34,12,Yes,Yes 2.66,136,65,4,133,150,Bad,53,13,Yes,Yes 11.7,144,69,11,131,104,Medium,47,11,Yes,Yes 4.69,133,30,0,152,122,Medium,53,17,Yes,No 6.23,112,38,17,316,104,Medium,80,16,Yes,Yes 3.15,117,66,1,65,111,Bad,55,11,Yes,Yes 11.27,100,54,9,433,89,Good,45,12,Yes,Yes 4.99,122,59,0,501,112,Bad,32,14,No,No 10.1,135,63,15,213,134,Medium,32,10,Yes,Yes 5.74,106,33,20,354,104,Medium,61,12,Yes,Yes 5.87,136,60,7,303,147,Medium,41,10,Yes,Yes 7.63,93,117,9,489,83,Bad,42,13,Yes,Yes 6.18,120,70,15,464,110,Medium,72,15,Yes,Yes 5.17,138,35,6,60,143,Bad,28,18,Yes,No 8.61,130,38,0,283,102,Medium,80,15,Yes,No 5.97,112,24,0,164,101,Medium,45,11,Yes,No 11.54,134,44,4,219,126,Good,44,15,Yes,Yes 7.5,140,29,0,105,91,Bad,43,16,Yes,No 7.38,98,120,0,268,93,Medium,72,10,No,No 7.81,137,102,13,422,118,Medium,71,10,No,Yes 5.99,117,42,10,371,121,Bad,26,14,Yes,Yes 8.43,138,80,0,108,126,Good,70,13,No,Yes 4.81,121,68,0,279,149,Good,79,12,Yes,No 8.97,132,107,0,144,125,Medium,33,13,No,No 6.88,96,39,0,161,112,Good,27,14,No,No 12.57,132,102,20,459,107,Good,49,11,Yes,Yes 9.32,134,27,18,467,96,Medium,49,14,No,Yes 8.64,111,101,17,266,91,Medium,63,17,No,Yes 10.44,124,115,16,458,105,Medium,62,16,No,Yes 13.44,133,103,14,288,122,Good,61,17,Yes,Yes 9.45,107,67,12,430,92,Medium,35,12,No,Yes 5.3,133,31,1,80,145,Medium,42,18,Yes,Yes 7.02,130,100,0,306,146,Good,42,11,Yes,No 3.58,142,109,0,111,164,Good,72,12,Yes,No 13.36,103,73,3,276,72,Medium,34,15,Yes,Yes 4.17,123,96,10,71,118,Bad,69,11,Yes,Yes 3.13,130,62,11,396,130,Bad,66,14,Yes,Yes 8.77,118,86,7,265,114,Good,52,15,No,Yes 8.68,131,25,10,183,104,Medium,56,15,No,Yes 5.25,131,55,0,26,110,Bad,79,12,Yes,Yes 10.26,111,75,1,377,108,Good,25,12,Yes,No 10.5,122,21,16,488,131,Good,30,14,Yes,Yes 6.53,154,30,0,122,162,Medium,57,17,No,No 5.98,124,56,11,447,134,Medium,53,12,No,Yes 14.37,95,106,0,256,53,Good,52,17,Yes,No 10.71,109,22,10,348,79,Good,74,14,No,Yes 10.26,135,100,22,463,122,Medium,36,14,Yes,Yes 7.68,126,41,22,403,119,Bad,42,12,Yes,Yes 9.08,152,81,0,191,126,Medium,54,16,Yes,No 7.8,121,50,0,508,98,Medium,65,11,No,No 5.58,137,71,0,402,116,Medium,78,17,Yes,No 9.44,131,47,7,90,118,Medium,47,12,Yes,Yes 7.9,132,46,4,206,124,Medium,73,11,Yes,No 16.27,141,60,19,319,92,Good,44,11,Yes,Yes 6.81,132,61,0,263,125,Medium,41,12,No,No 6.11,133,88,3,105,119,Medium,79,12,Yes,Yes 
5.81,125,111,0,404,107,Bad,54,15,Yes,No 9.64,106,64,10,17,89,Medium,68,17,Yes,Yes 3.9,124,65,21,496,151,Bad,77,13,Yes,Yes 4.95,121,28,19,315,121,Medium,66,14,Yes,Yes 9.35,98,117,0,76,68,Medium,63,10,Yes,No 12.85,123,37,15,348,112,Good,28,12,Yes,Yes 5.87,131,73,13,455,132,Medium,62,17,Yes,Yes 5.32,152,116,0,170,160,Medium,39,16,Yes,No 8.67,142,73,14,238,115,Medium,73,14,No,Yes 8.14,135,89,11,245,78,Bad,79,16,Yes,Yes 8.44,128,42,8,328,107,Medium,35,12,Yes,Yes 5.47,108,75,9,61,111,Medium,67,12,Yes,Yes 6.1,153,63,0,49,124,Bad,56,16,Yes,No 4.53,129,42,13,315,130,Bad,34,13,Yes,Yes 5.57,109,51,10,26,120,Medium,30,17,No,Yes 5.35,130,58,19,366,139,Bad,33,16,Yes,Yes 12.57,138,108,17,203,128,Good,33,14,Yes,Yes 6.14,139,23,3,37,120,Medium,55,11,No,Yes 7.41,162,26,12,368,159,Medium,40,18,Yes,Yes 5.94,100,79,7,284,95,Bad,50,12,Yes,Yes 9.71,134,37,0,27,120,Good,49,16,Yes,Yes -------------------------------------------------------------------------------- /data/Khan.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/data/Khan.rda -------------------------------------------------------------------------------- /data/NCI60.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/data/NCI60.rda -------------------------------------------------------------------------------- /data/NCI60_labs.csv: -------------------------------------------------------------------------------- 1 | "x" 2 | "CNS" 3 | "CNS" 4 | "CNS" 5 | "RENAL" 6 | "BREAST" 7 | "CNS" 8 | "CNS" 9 | "BREAST" 10 | "NSCLC" 11 | "NSCLC" 12 | "RENAL" 13 | "RENAL" 14 | "RENAL" 15 | "RENAL" 16 | "RENAL" 17 | "RENAL" 18 | "RENAL" 19 | "BREAST" 20 | "NSCLC" 21 | "RENAL" 22 | "UNKNOWN" 23 | "OVARIAN" 24 | "MELANOMA" 25 | "PROSTATE" 26 | "OVARIAN" 27 | "OVARIAN" 28 | "OVARIAN" 29 | "OVARIAN" 30 | "OVARIAN" 31 | "PROSTATE" 32 | "NSCLC" 33 | "NSCLC" 34 | "NSCLC" 35 | "LEUKEMIA" 36 | "K562B-repro" 37 | "K562A-repro" 38 | "LEUKEMIA" 39 | "LEUKEMIA" 40 | "LEUKEMIA" 41 | "LEUKEMIA" 42 | "LEUKEMIA" 43 | "COLON" 44 | "COLON" 45 | "COLON" 46 | "COLON" 47 | "COLON" 48 | "COLON" 49 | "COLON" 50 | "MCF7A-repro" 51 | "BREAST" 52 | "MCF7D-repro" 53 | "BREAST" 54 | "NSCLC" 55 | "NSCLC" 56 | "NSCLC" 57 | "MELANOMA" 58 | "BREAST" 59 | "BREAST" 60 | "MELANOMA" 61 | "MELANOMA" 62 | "MELANOMA" 63 | "MELANOMA" 64 | "MELANOMA" 65 | "MELANOMA" 66 | -------------------------------------------------------------------------------- /data/Portfolio.csv: -------------------------------------------------------------------------------- 1 | "X","Y" 2 | -0.895250889141557,-0.234923525765402 3 | -1.5624543274753,-0.885175993044695 4 | -0.417089883126492,0.271888018049829 5 | 1.04435572526951,-0.734197504067649 6 | -0.315568406681027,0.841983429961188 7 | -1.73712384902476,-2.03719104074984 8 | 1.96641315717111,1.45295666192369 9 | 2.1528678980109,-0.434138628179502 10 | -0.0812080267602958,1.45080850218963 11 | -0.891781794029037,0.821016234539977 12 | -0.293201702010266,-1.04239112183501 13 | 0.50577917106943,0.608477825846609 14 | 0.52675125409276,-0.222493343282789 15 | 1.06646932095091,1.2313566752569 16 | 0.294015895063748,0.628589480036184 17 | 0.0425492997633765,-1.26757361755317 18 | 1.83096958062302,-0.572751605498511 19 | -0.32693749887808,-0.487472465045569 20 | 0.521480415807099,2.56598528732423 21 | 
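Portfolio.csv above holds the simulated two-asset returns that ISLR uses to illustrate the bootstrap. As a hedged sketch (not code from the notebooks; the `alpha` helper name is mine), the statistic of interest is the variance-minimizing allocation alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y)):

```python
import numpy as np
import pandas as pd

portfolio = pd.read_csv('data/Portfolio.csv')  # columns "X" and "Y"

def alpha(X, Y):
    # Weight a that minimizes Var(a*X + (1 - a)*Y) for two risky assets.
    cov = np.cov(X, Y)  # 2x2 sample covariance matrix
    return (cov[1, 1] - cov[0, 1]) / (cov[0, 0] + cov[1, 1] - 2 * cov[0, 1])

print(alpha(portfolio.X, portfolio.Y))
```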
--------------------------------------------------------------------------------
/data/Publication.csv:
"posres","multi","clinend","mech","sampsize","budget","impact","time","status"
0,0,1,"R01",39876,8.0169405,44.016,11.20328542,1
[... 243 further rows; 244 clinical trials with time-to-publication outcomes, used in the Chapter 11 survival-analysis lab ...]
0,0,0,"R01",501,3.539633,4.84,4.468172485,1 215 | 1,0,0,"R44",300,0.84868,0,22.24229979,0 216 | 1,0,0,"R01",415,5.702727,53.298,9.429158111,1 217 | 1,0,0,"R44",308,1.367797,0,22.86652977,0 218 | 0,0,0,"R01",26,0.638626,0,4.993839836,0 219 | 0,1,0,"U01",92,9.422223,30.011,4.303901437,1 220 | 1,0,0,"U01",390,5.381624,53.298,9.067761807,1 221 | 1,0,0,"R21",51,0.4125,0,22.99794661,0 222 | 0,0,0,"R21",22,0.390914,0,8.180698152,0 223 | 1,0,0,"R01",18,1.208598,6.606,38.11088296,1 224 | 1,0,0,"R01",266,2.26775,4.844,41.0349076,1 225 | 1,0,0,"R01",21,1.576176,14.739,21.51950719,1 226 | 1,0,0,"R21",120,0.362247,0,18.4312115,0 227 | 0,0,0,"K23",24,0.67217,0,28.94455852,0 228 | 1,0,1,"U54",31,5.649036,38.278,7.983572895,1 229 | 0,0,0,"R01",132,2.481322,0,2.98973306,0 230 | 0,0,0,"R01",55,1.41009,0,4.993839836,0 231 | 1,0,0,"R21",30,0.407,0,19.97535934,0 232 | 0,0,0,"R01",60,1.591641,0,6.012320329,0 233 | 1,0,0,"RC2",150,4.136888,0,30.39014374,0 234 | 1,0,0,"R44",1800,0.988677,3.299,35.02258727,1 235 | 1,0,0,"R21",181,0.37509,0,4.008213552,0 236 | 0,0,0,"R01",66,1.15875,0,5.979466119,0 237 | 0,0,0,"P50",100,2.531065,0,11.86036961,0 238 | 1,0,0,"K23",99,0.601928,0,13.04312115,0 239 | 1,0,0,"R01",247,1.3054805,0,9.691991786,1 240 | 1,0,0,"R01",247,1.3054805,0,16.45995893,1 241 | 0,0,0,"R01",4105,2.703653,5.355,65.01848049,1 242 | 1,0,0,"R44",181,1.117084,0,66.98973306,0 243 | 0,0,0,"K23",104,0.472321,0,9.987679671,0 244 | 0,0,0,"R21",69,0.40471,0,21.97946612,0 245 | 1,0,0,"R01",1699,2.957751,0,4.632443532,0 246 | -------------------------------------------------------------------------------- /data/Readme_datalist: -------------------------------------------------------------------------------- 1 | Auto 2 | Bikeshare 3 | Caravan 4 | Carseats 5 | College 6 | Default 7 | Hitters 8 | Khan (json file) 9 | NCI60 (json file) 10 | OJ 11 | Portfolio 12 | Smarket 13 | Wage 14 | Weekly 15 | -------------------------------------------------------------------------------- /data/USArrests.csv: -------------------------------------------------------------------------------- 1 | "","Murder","Assault","UrbanPop","Rape" 2 | "Alabama",13.2,236,58,21.2 3 | "Alaska",10,263,48,44.5 4 | "Arizona",8.1,294,80,31 5 | "Arkansas",8.8,190,50,19.5 6 | "California",9,276,91,40.6 7 | "Colorado",7.9,204,78,38.7 8 | "Connecticut",3.3,110,77,11.1 9 | "Delaware",5.9,238,72,15.8 10 | "Florida",15.4,335,80,31.9 11 | "Georgia",17.4,211,60,25.8 12 | "Hawaii",5.3,46,83,20.2 13 | "Idaho",2.6,120,54,14.2 14 | "Illinois",10.4,249,83,24 15 | "Indiana",7.2,113,65,21 16 | "Iowa",2.2,56,57,11.3 17 | "Kansas",6,115,66,18 18 | "Kentucky",9.7,109,52,16.3 19 | "Louisiana",15.4,249,66,22.2 20 | "Maine",2.1,83,51,7.8 21 | "Maryland",11.3,300,67,27.8 22 | "Massachusetts",4.4,149,85,16.3 23 | "Michigan",12.1,255,74,35.1 24 | "Minnesota",2.7,72,66,14.9 25 | "Mississippi",16.1,259,44,17.1 26 | "Missouri",9,178,70,28.2 27 | "Montana",6,109,53,16.4 28 | "Nebraska",4.3,102,62,16.5 29 | "Nevada",12.2,252,81,46 30 | "New Hampshire",2.1,57,56,9.5 31 | "New Jersey",7.4,159,89,18.8 32 | "New Mexico",11.4,285,70,32.1 33 | "New York",11.1,254,86,26.1 34 | "North Carolina",13,337,45,16.1 35 | "North Dakota",0.8,45,44,7.3 36 | "Ohio",7.3,120,75,21.4 37 | "Oklahoma",6.6,151,68,20 38 | "Oregon",4.9,159,67,29.3 39 | "Pennsylvania",6.3,106,72,14.9 40 | "Rhode Island",3.4,174,87,8.3 41 | "South Carolina",14.4,279,48,22.5 42 | "South Dakota",3.8,86,45,12.8 43 | "Tennessee",13.2,188,59,26.9 44 | "Texas",12.7,201,80,25.5 45 | "Utah",3.2,120,80,22.9 46 | "Vermont",2.2,48,32,11.2 
47 | "Virginia",8.5,156,63,20.7 48 | "Washington",4,145,73,26.2 49 | "West Virginia",5.7,81,39,9.3 50 | "Wisconsin",2.6,53,66,10.8 51 | "Wyoming",6.8,161,60,15.6 52 | -------------------------------------------------------------------------------- /data/dog_test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/data/dog_test.jpg --------------------------------------------------------------------------------