├── Chapter_10_sec_10.9.ipynb ├── Chapter_11_sec_11.8.ipynb ├── Chapter_12_sec_12.5.ipynb ├── Chapter_13_sec_13.6.ipynb ├── Chapter_2_sec_3.1_3.5.ipynb ├── Chapter_3_sec_6.1_6.7.ipynb ├── Chapter_4_sec_7.1_7.7.ipynb ├── Chapter_5_sec_3.1_3.4.ipynb ├── Chapter_6_sec_6.5.ipynb ├── Chapter_7_sec_7.8.ipynb ├── Chapter_8_sec_8.3.ipynb ├── Chapter_9_sec_9.6.ipynb ├── ISLR_v1.pdf ├── ISLR_v2_2021_Nov.pdf ├── README.md └── data ├── Auto.csv ├── Bikeshare.csv ├── Boston.csv ├── BrainCancer.csv ├── Caravan.csv ├── Carseats.csv ├── College.csv ├── Default.csv ├── Fund.csv ├── Hitters.csv ├── Khan.json ├── Khan.rda ├── NCI60.json ├── NCI60.rda ├── NCI60_data.csv ├── NCI60_labs.csv ├── OJ.csv ├── Portfolio.csv ├── Publication.csv ├── Readme_datalist ├── Smarket.csv ├── USArrests.csv ├── Wage.csv ├── Weekly.csv └── dog_test.jpg /Chapter_11_sec_11.8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 11.8 Lab: Survival Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "\n", 20 | "from lifelines import KaplanMeierFitter\n", 21 | "from lifelines.statistics import logrank_test\n", 22 | "from lifelines import CoxPHFitter\n", 23 | "\n", 24 | "%matplotlib inline" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "\"\"\" \n", 34 | "I am also new to this topic, so let us learn those concept together. Feedbacks are welcome.\n", 35 | "Survival analysis is a statistical method used to estimate the survival function of a population.\n", 36 | "These arise in the analysis of a unique kind of outcome variable: the analysis time until an event occurs.\n", 37 | "\"\"\"" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## 11.8.1 Brain Cancer Data" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "BrainCancer = pd.read_csv('data/BrainCancer.csv', header=0)\n", 54 | "# use some options in .describe() to get a quick overview of the data\n", 55 | "BrainCancer.describe(include = 'object')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "BrainCancer.head()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# I did a bit of google search and found the package lifelines.\n", 74 | "# % pip install lifelines" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# create a kmf object\n", 84 | "kmf = KaplanMeierFitter() " 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# fit the data into the model\n", 94 | "kmf.fit(BrainCancer.time, BrainCancer.status,label='Kaplan Meier Estimate')\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# create an estimate\n", 104 | "kmf.plot(ci_show=True) ## ci_show is meant for Confidence 
interval, which is the shaded area in the plot." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "kmf1 = KaplanMeierFitter() ## instantiate the class to create an object\n", 114 | "\n", 115 | "## two cohorts are compared: Cohort 1 = Female, Cohort 2 = Male \n", 116 | "groups = BrainCancer['sex'] \n", 117 | "T = BrainCancer.time\n", 118 | "E = BrainCancer.status \n", 119 | "i1 = (groups == 'Female') \n", 120 | "i2 = (groups == 'Male') \n", 121 | "\n", 122 | "\n", 123 | "## fit the model for the 1st cohort\n", 124 | "kmf1.fit(T[i1], E[i1], label='Female')\n", 125 | "a1 = kmf1.plot()\n", 126 | "\n", 127 | "## fit the model for the 2nd cohort\n", 128 | "kmf1.fit(T[i2], E[i2], label='Male')\n", 129 | "kmf1.plot(ax=a1)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# we can perform a log-rank test to compare the survival of males to females.\n", 139 | "results = logrank_test(T[i1], T[i2], event_observed_A=E[i1], event_observed_B=E[i2])\n", 140 | "results.print_summary()\n", 141 | "\"\"\" \n", 142 | "The resulting p-value is 0.23, indicating no evidence of a difference in survival between the two sexes.\n", 143 | "This can also be seen from the overlapping confidence intervals. \n", 144 | "\"\"\"" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "df_dummy = pd.get_dummies(BrainCancer, drop_first=True)\n", 154 | "df_dummy.head()\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# use the Cox Proportional Hazards model\n", 164 | "cph1 = CoxPHFitter() \n", 165 | "cph1.fit(df_dummy[['status', 'sex_Male', 'time']], 'time', event_col='status') \n", 166 | "cph1.print_summary()\n", 167 | "\"\"\" \n", 168 | "I was not able to use the optional 'formula' argument of fit() due to compatibility issues with the newer version; instead, I sliced the dataframe to contain only the 3 columns ('status', 'sex_Male', 'time').\n", 169 | "Here the p-value is 0.233. Regardless of which test we use, we see that there is no clear evidence for a difference in survival between males and females.\n", 170 | "\"\"\"" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# use the Cox Proportional Hazards model with more features \n", 180 | "cph2 = CoxPHFitter() \n", 181 | "cph2.fit(df_dummy, 'time', event_col='status') \n", 182 | "cph2.print_summary()\n", 183 | "\"\"\" \n", 184 | "After adjusting for the other predictors, larger values of the Karnofsky index, ki, are associated with lower risk, \n", 185 | "i.e. 
longer survival.\n", 186 | "\"\"\"" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## 11.8.2 Publication Data" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "Publication = pd.read_csv('data/Publication.csv', header=0)\n", 203 | "Publication.head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "kmf1 = KaplanMeierFitter() ## instantiate the class to create an object\n", 213 | "\n", 214 | "## two cohorts are compared: Cohort 1 = posres 0 (negative result), Cohort 2 = posres 1 (positive result)\n", 215 | "groups = Publication['posres'] \n", 216 | "T = Publication.time\n", 217 | "E = Publication.status \n", 218 | "i1 = (groups == 0) \n", 219 | "i2 = (groups == 1) \n", 220 | "\n", 221 | "## fit the model for the 1st cohort\n", 222 | "kmf1.fit(T[i1], E[i1], label='Negative Results')\n", 223 | "a1 = kmf1.plot()\n", 224 | "\n", 225 | "## fit the model for the 2nd cohort\n", 226 | "kmf1.fit(T[i2], E[i2], label='Positive Results')\n", 227 | "kmf1.plot(ax=a1)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# we can perform a log-rank test to compare time until publication for negative- vs. positive-result studies.\n", 237 | "results = logrank_test(T[i1], T[i2], event_observed_A=E[i1], event_observed_B=E[i2])\n", 238 | "results.print_summary()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "df_dummy = pd.get_dummies(Publication, drop_first=True)\n", 248 | "df_dummy.head()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "# use the Cox Proportional Hazards model\n", 258 | "cph3 = CoxPHFitter() \n", 259 | "cph3.fit(df_dummy[['status', 'posres', 'time']], 'time', event_col='status') \n", 260 | "cph3.print_summary()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "cph4 = CoxPHFitter() \n", 270 | "cph4.fit(df_dummy[['status', 'posres', 'time', 'multi', 'clinend', 'sampsize', 'budget', 'impact']], 'time', event_col='status') \n", 271 | "cph4.print_summary()\n", 272 | "\"\"\" \n", 273 | "After we control for other features, posres becomes an important factor (well, at least statistically significant).\n", 274 | "We see that there are a number of statistically significant variables, \n", 275 | "including whether the trial focused on a clinical endpoint (clinend), the impact of the study (impact),\n", 276 | "and whether the study had positive or negative results (posres).\n", 277 | "\"\"\"" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "## 11.8.3 Call Center Data" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "np.random.seed(1)\n", 294 | "N = 2000\n", 295 | "Operators = np.random.choice(range(5, 16), N)\n", 296 | "Center = np.random.choice([\"A\", \"B\", \"C\"], N)\n", 297 | "Time = np.random.choice([\"Morn.\", \"After.\", \"Even.\"], N)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | 
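"# aside (my addition, not part of the book's lab): a quick sanity check on the simulated covariates;\n", "# each call center should receive roughly N/3 of the calls\n", "print(pd.Series(Center).value_counts())\n", "\n", 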
"# we generate a similar random data set\n", 307 | "X_pre = pd.DataFrame({\"Operators\": Operators, \"Center\": Center, \"Time\": Time})\n", 308 | "X = pd.get_dummies(X_pre, drop_first=True)\n", 309 | "X.head()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "true_coeff = np.array([0.04, -0.3, 0, 0.2, -0.2])\n", 319 | "# well, I was not able to fully following the simulation in the book. \n", 320 | "# I think the highlevel idea is to use those coefficients to generate a dataset and show the model fit could \n", 321 | "# sucessfully recover the coefficients." 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# this simulation code is wrong. And I will come back and fix it. \n", 331 | "X['y'] = 350*np.exp(np.sum(-X*true_coeff,axis=1).tolist() + np.random.normal(0, 0.005, N))\n", 332 | "X['answered'] = np.where( X['y'] < 300 , 1, 0)\n", 333 | "X.head()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "# use Cox Proportional Hazards model\n", 343 | "cph5 = CoxPHFitter() \n", 344 | "cph5.fit(X, 'y', event_col='answered') \n", 345 | "cph5.print_summary()\n", 346 | "\"\"\" \n", 347 | "Since the simulation is wrong, so the summary is not correct. \n", 348 | "But at least we can see the p-values for Operatator, Center = B, Time = Even. and Time = Morn are very small, \n", 349 | "and they are directly related to the ground truth coefficients.\n", 350 | "\"\"\"" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "# End of Chapter 11" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | "interpreter": { 365 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 366 | }, 367 | "kernelspec": { 368 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.6.2" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 2 386 | } 387 | -------------------------------------------------------------------------------- /Chapter_12_sec_12.5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 12.5 Lab: Unsupervised Learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from numpy.linalg import svd\n", 19 | "import matplotlib as mpl\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "\n", 22 | "from sklearn.preprocessing import StandardScaler\n", 23 | "from sklearn.decomposition import PCA\n", 24 | "from sklearn.cluster import KMeans\n", 25 | "from scipy.cluster import hierarchy\n", 26 | "\n", 27 | "%matplotlib inline\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## 12.5.1 Principal Components Analysis" 35 | ] 36 | }, 37 | { 38 | "cell_type": 
"code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "USArrests = pd.read_csv('./data/USArrests.csv', index_col=0)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "USArrests.head()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# pandas has a built-in function to get the mean and variance of each column\n", 62 | "print(USArrests.mean())\n", 63 | "print(USArrests.var())" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "sc = StandardScaler()\n", 73 | "X = pd.DataFrame(sc.fit_transform(USArrests), index=USArrests.index, columns=USArrests.columns)\n", 74 | "# The loading vectors (i.e. these are the projection of the data onto the principal components)\n", 75 | "pca_loadings = pd.DataFrame(PCA().fit(X).components_.T, index=USArrests.columns, columns=['V1', 'V2', 'V3', 'V4'])\n", 76 | "pca_loadings\n", 77 | "\n", 78 | "\"\"\" \n", 79 | "Depends on the version of python/module, you may see a flipped loading vector in signs. \n", 80 | "This is normal because the orientation of the principal components is not deterministic. \n", 81 | "\"\"\"\n", 82 | "# X1=pd.DataFrame(sc.inverse_transform(X), index=USArrests.index, columns=USArrests.columns)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# fit the PCA model and transform X to get the principal components\n", 92 | "pca = PCA()\n", 93 | "df_plot = pd.DataFrame(pca.fit_transform(X), columns=['PC1', 'PC2', 'PC3', 'PC4'], index=X.index)\n", 94 | "df_plot" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "fig , ax1 = plt.subplots(figsize=(9,7))\n", 104 | "\n", 105 | "ax1.set_xlim(-3.5,3.5)\n", 106 | "ax1.set_ylim(-3.5,3.5)\n", 107 | "\n", 108 | "# plot Principal Components 1 and 2\n", 109 | "for i in df_plot.index:\n", 110 | " ax1.annotate(i, (df_plot.PC1.loc[i], -df_plot.PC2.loc[i]), ha='center')\n", 111 | "\n", 112 | "# plot reference lines\n", 113 | "ax1.hlines(0,-3.5,3.5, linestyles='dotted', colors='grey')\n", 114 | "ax1.vlines(0,-3.5,3.5, linestyles='dotted', colors='grey')\n", 115 | "\n", 116 | "ax1.set_xlabel('First Principal Component')\n", 117 | "ax1.set_ylabel('Second Principal Component')\n", 118 | " \n", 119 | "# plot Principal Component loading vectors, using a second y-axis.\n", 120 | "ax2 = ax1.twinx().twiny() \n", 121 | "\n", 122 | "ax2.set_ylim(-1,1)\n", 123 | "ax2.set_xlim(-1,1)\n", 124 | "ax2.tick_params(axis='y', colors='orange')\n", 125 | "ax2.set_xlabel('Principal Component loading vectors', color='orange')\n", 126 | "\n", 127 | "# plot labels for vectors. 
Variable 'a' is a small offset parameter to separate arrow tip and text.\n", 128 | "a = 1.07 \n", 129 | "for i in pca_loadings[['V1', 'V2']].index:\n", 130 | " ax2.annotate(i, (pca_loadings.V1.loc[i]*a, -pca_loadings.V2.loc[i]*a), color='orange')\n", 131 | "\n", 132 | "# plot vectors\n", 133 | "ax2.arrow(0,0,pca_loadings.V1[0], -pca_loadings.V2[0])\n", 134 | "ax2.arrow(0,0,pca_loadings.V1[1], -pca_loadings.V2[1])\n", 135 | "ax2.arrow(0,0,pca_loadings.V1[2], -pca_loadings.V2[2])\n", 136 | "ax2.arrow(0,0,pca_loadings.V1[3], -pca_loadings.V2[3])\n", 137 | "plt.show()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# in previous chapter, we talked about PCR. In that case, we could use the downstream task's (i.e. regression RMSE) \n", 147 | "# performance to select the hyperparameters (i.e. # number of PCs).\n", 148 | "# here let us use the portion of explained variance to select the number of PCs. Those info is available in the pca object.\n", 149 | "print(pca.explained_variance_)\n", 150 | "print(pca.explained_variance_ratio_)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "plt.figure(figsize=(7,5))\n", 160 | "\n", 161 | "plt.plot([1,2,3,4], pca.explained_variance_ratio_, '-o', label='Individual component')\n", 162 | "plt.plot([1,2,3,4], np.cumsum(pca.explained_variance_ratio_), '-s', label='Cumulative')\n", 163 | "\n", 164 | "plt.ylabel('Proportion of Variance Explained')\n", 165 | "plt.xlabel('Principal Component')\n", 166 | "plt.xlim(0.75,4.25)\n", 167 | "plt.ylim(0,1.05)\n", 168 | "plt.xticks([1,2,3,4])\n", 169 | "plt.legend(loc=2)\n", 170 | "plt.show()\n", 171 | "\n", 172 | "\"\"\"\n", 173 | "In this case, if we want to preserve 80% of variance of the data, we need to select 2 PCs.\n", 174 | "\"\"\"" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## 12.5.2 Matrix Completion" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "\"\"\"\n", 191 | "I am happy seeing this lab added. 
SVD seems pretty heavy in theory/math, but this has lots of application in real problems, \n", 192 | "such as recommendation systems, clustering, outlier smoothing, and so on.\n", 193 | "\"\"\"" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "# run sigular value decomposition on the data (SVD)\n", 203 | "u, s, vh = svd(X, full_matrices=False)\n", 204 | "u.shape, s.shape, vh.shape" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# this vh will be the principal components similar to pca.components_ (up to an unimportant sign flip)\n", 214 | "# The matrix u is equivalent to the matrix of standardized scores, and the standard deviations are in the vector s.\n", 215 | "print(vh)\n", 216 | "print ('-------')\n", 217 | "print(pca.components_)\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# reconstruction based on full SVD\n", 227 | "np.allclose(X, np.dot(u * s, vh))" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# reconstruction based on reduced SVD\n", 237 | "num_components = 3\n", 238 | "recovered = pd.DataFrame(np.dot(u[:, :num_components] * s[:num_components,], vh[:num_components,:]))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "print(recovered.head(n=2))\n", 248 | "print(X.head(n=2))\n", 249 | "\n", 250 | "\"\"\"\n", 251 | "Change the num_components from 1 to 4 and see how the reconstruction error changes.\n", 252 | "\"\"\"" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## 12.5.3 Clustering" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "K-means clustering " 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "# generate data\n", 276 | "np.random.seed(21)\n", 277 | "X = np.random.standard_normal((50,2))\n", 278 | "X[:25,0] = X[:25,0]+3\n", 279 | "X[:25,1] = X[:25,1]-4" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "n_clusters = 2\n", 289 | "km1 = KMeans(n_clusters=n_clusters, n_init=20)\n", 290 | "km1.fit(X)\n", 291 | "\n", 292 | "n_clusters = 3\n", 293 | "km2 = KMeans(n_clusters=n_clusters, n_init=20)\n", 294 | "km2.fit(X)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "print(km1.labels_)\n", 304 | "print(dir(km1)) # we can use dir to see other saved attributes" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,5))\n", 314 | "\n", 315 | "ax1.scatter(X[:,0], X[:,1], s=40, c=km1.labels_, cmap=plt.cm.prism) \n", 316 | "ax1.set_title('K-Means Clustering Results with K=2')\n", 317 | "ax1.scatter(km1.cluster_centers_[:,0], km1.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)\n", 318 | "\n", 319 | 
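"# aside (my addition): km1.inertia_ stores the total within-cluster sum of squares that\n", "# K-means minimizes; comparing it across several values of K gives the usual elbow plot\n", 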
"ax2.scatter(X[:,0], X[:,1], s=40, c=km2.labels_, cmap=plt.cm.prism) \n", 320 | "ax2.set_title('K-Means Clustering Results with K=3')\n", 321 | "ax2.scatter(km2.cluster_centers_[:,0], km2.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)\n", 322 | "plt.show()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "Hierarchical Clustering" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))\n", 339 | "\n", 340 | "for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'],\n", 341 | " [ax1,ax2,ax3]):\n", 342 | " cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)\n", 343 | "\n", 344 | "ax1.set_title('Complete Linkage')\n", 345 | "ax2.set_title('Average Linkage')\n", 346 | "ax3.set_title('Single Linkage')\n", 347 | "plt.show()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## 12.5.4 NCI60 Data Example" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "PCA on the NCI60 Data" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "# I was not able to make the json work, so I went back to R and saved the data and label separately.\n", 371 | "X = pd.read_csv('./data/NCI60_data.csv')\n", 372 | "y = pd.read_csv('./data/NCI60_labs.csv')" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "pca2 = PCA()\n", 382 | "X_standardized = StandardScaler().fit_transform(X)\n", 383 | "df2_plot = pd.DataFrame(pca2.fit_transform(X_standardized))" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,6))\n", 393 | "\n", 394 | "color_idx = pd.factorize(y.iloc[:, 0])[0]\n", 395 | "cmap = plt.cm.hsv\n", 396 | "\n", 397 | "# left plot\n", 398 | "ax1.scatter(df2_plot.iloc[:,0], -df2_plot.iloc[:,1], c=color_idx, cmap=cmap, alpha=0.5, s=50)\n", 399 | "ax1.set_ylabel('Principal Component 2')\n", 400 | "\n", 401 | "# right plot\n", 402 | "ax2.scatter(df2_plot.iloc[:,0], df2_plot.iloc[:,2], c=color_idx, cmap=cmap, alpha=0.5, s=50)\n", 403 | "ax2.set_ylabel('Principal Component 3')\n", 404 | "\n", 405 | "# custom legend for the classes (y) since we do not create scatter plots per class (which could have their own labels).\n", 406 | "handles = []\n", 407 | "labels = pd.factorize(y.iloc[:, 0].unique())\n", 408 | "norm = mpl.colors.Normalize(vmin=0.0, vmax=14.0)\n", 409 | "\n", 410 | "for i, v in zip(labels[0], labels[1]):\n", 411 | " handles.append(mpl.patches.Patch(color=cmap(norm(i)), label=v, alpha=0.5))\n", 412 | "\n", 413 | "ax2.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)\n", 414 | "\n", 415 | "# xlabel for both plots\n", 416 | "for ax in fig.axes:\n", 417 | " ax.set_xlabel('Principal Component 1') " 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "pd.DataFrame([df2_plot.iloc[:,:5].std(axis=0, ddof=0).array,\n", 427 | " pca2.explained_variance_ratio_[:5],\n", 428 | " 
np.cumsum(pca2.explained_variance_ratio_[:5])],\n", 429 | " index=['Standard Deviation', 'Proportion of Variance', 'Cumulative Proportion'],\n", 430 | " columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "df2_plot.iloc[:,:10].var(axis=0, ddof=0).plot(kind='bar', rot=0)\n", 440 | "plt.ylabel('Variances')\n", 441 | "plt.show()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "fig , (ax1,ax2) = plt.subplots(1,2, figsize=(15,5))\n", 451 | "\n", 452 | "# left plot\n", 453 | "ax1.plot(pca2.explained_variance_ratio_, '-o')\n", 454 | "ax1.set_ylabel('Proportion of Variance Explained')\n", 455 | "ax1.set_ylim(ymin=-0.01)\n", 456 | "\n", 457 | "# right plot\n", 458 | "ax2.plot(np.cumsum(pca2.explained_variance_ratio_), '-ro')\n", 459 | "ax2.set_ylabel('Cumulative Proportion of Variance Explained')\n", 460 | "ax2.set_ylim(ymax=1.05)\n", 461 | "\n", 462 | "for ax in fig.axes:\n", 463 | " ax.set_xlabel('Principal Component')\n", 464 | " ax.set_xlim(-1,65) " 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "Clustering the Observations of the NCI60 Data" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "sc = StandardScaler()\n", 481 | "X_standardized = pd.DataFrame(sc.fit_transform(X), index=y.iloc[:, 0], columns=X.columns)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(20,20))\n", 491 | "\n", 492 | "for linkage, cluster, ax in zip([hierarchy.complete(X_standardized), hierarchy.average(X), hierarchy.single(X_standardized)],\n", 493 | " ['c1','c2','c3'],\n", 494 | " [ax1,ax2,ax3]):\n", 495 | " cluster = hierarchy.dendrogram(linkage, labels=X_standardized.index, orientation='right', color_threshold=0, leaf_font_size=10, ax=ax)\n", 496 | "\n", 497 | "ax1.set_title('Complete Linkage')\n", 498 | "ax2.set_title('Average Linkage')\n", 499 | "ax3.set_title('Single Linkage')\n", 500 | "plt.show()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "plt.figure(figsize=(10,20))\n", 510 | "cut4 = hierarchy.dendrogram(hierarchy.complete(X_standardized),\n", 511 | " labels=X_standardized.index, orientation='right', color_threshold=140, leaf_font_size=10)\n", 512 | "plt.vlines(140,0,plt.gca().yaxis.get_data_interval()[1], colors='r', linestyles='dashed')\n", 513 | "plt.show()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "Kmeans" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "np.random.seed(21)\n", 530 | "km3 = KMeans(n_clusters=4, n_init=50)\n", 531 | "km3.fit(X_standardized)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "km3.labels_" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "Combine with PCA" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | 
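"execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a sketch of my own (not in the original notebook): rather than clustering on the full\n", "# standardized data, run K-means on the first five principal component score vectors\n", "km4 = KMeans(n_clusters=4, n_init=50)\n", "km4.fit(df2_plot.iloc[:, :5])\n", "print(km4.labels_)  # compare with km3.labels_ obtained on the full standardized data" ] }, { "cell_type": "code", 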
"execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "plt.figure(figsize=(10,20))\n", 557 | "pca_cluster = hierarchy.dendrogram(hierarchy.complete(X_standardized), labels=X_standardized.index,\n", 558 | "orientation='right', color_threshold=100, leaf_font_size=10)" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "# Hierarchy based on Principal Components 1 to 5\n", 568 | "plt.figure(figsize=(10,20))\n", 569 | "pca_cluster = hierarchy.dendrogram(hierarchy.complete(df2_plot.iloc[:,:5]), labels=X_standardized.index,\n", 570 | "orientation='right', color_threshold=100, leaf_font_size=10)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "# End of Chapter 12" 580 | ] 581 | } 582 | ], 583 | "metadata": { 584 | "interpreter": { 585 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 586 | }, 587 | "kernelspec": { 588 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 589 | "name": "python3" 590 | }, 591 | "language_info": { 592 | "codemirror_mode": { 593 | "name": "ipython", 594 | "version": 3 595 | }, 596 | "file_extension": ".py", 597 | "mimetype": "text/x-python", 598 | "name": "python", 599 | "nbconvert_exporter": "python", 600 | "pygments_lexer": "ipython3", 601 | "version": "3.6.2" 602 | } 603 | }, 604 | "nbformat": 4, 605 | "nbformat_minor": 2 606 | } 607 | -------------------------------------------------------------------------------- /Chapter_13_sec_13.6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 13.6 Lab: Multiple Testing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "from scipy import stats as st\n", 20 | "from sklearn.metrics import confusion_matrix\n", 21 | "from statsmodels.sandbox.stats.multicomp import multipletests\n", 22 | "from statsmodels.stats.multicomp import pairwise_tukeyhsd\n", 23 | "from statsmodels.sandbox.stats.multicomp import TukeyHSDResults\n", 24 | "\n", 25 | "import json\n", 26 | "\n", 27 | "%matplotlib inline\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## 13.6.1 Review of Hypothesis Tests" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "np.random.seed(21)\n", 44 | "X = np.random.normal(loc=0.0, scale=1.0, size=(10, 100))\n", 45 | "offset = 0.5\n", 46 | "X[:,:50] = X[:,:50] + offset" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# here I used scipy. During google search, I came across bioinfokit module, could explore more. 
\n", 56 | "result=st.ttest_1samp(a = X[:, 0], popmean = 0)\n", 57 | "print(result.pvalue)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# let us run the same t-test for all 100 columns\n", 67 | "p_values = []\n", 68 | "decision = []\n", 69 | "for i in range(100):\n", 70 | " result=st.ttest_1samp(a = X[:, i], popmean = 0)\n", 71 | " p_values.append(result.pvalue)\n", 72 | " if result.pvalue < 0.05:\n", 73 | " decision.append('Reject H0')\n", 74 | " else:\n", 75 | " decision.append('Do not reject H0')\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# after computing the p-values, we can use the ground truth to evaluate the performance\n", 85 | "ground_truth = np.repeat(['Reject H0', 'Do not reject H0'], [50, 50], axis=0)\n", 86 | "labels = ['Reject H0', 'Do not reject H0']\n", 87 | "cm = confusion_matrix (ground_truth, decision, labels=labels)\n", 88 | "print(cm)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "fig = plt.figure()\n", 98 | "ax = fig.add_subplot(111)\n", 99 | "cax = ax.matshow(cm)\n", 100 | "fig.colorbar(cax)\n", 101 | "ax.set_xticklabels([''] + labels)\n", 102 | "ax.set_yticklabels([''] + labels)\n", 103 | "plt.xlabel('One sample t-test')\n", 104 | "plt.ylabel('Ground truth')\n", 105 | "plt.show()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# we could make the offset larger (from 0.5 to 1) and see the change to the confusion matrix\n", 115 | "offset = 1\n", 116 | "X[:,:50] = X[:,:50] + offset\n", 117 | "\n", 118 | "p_values = []\n", 119 | "decision = []\n", 120 | "for i in range(100):\n", 121 | " result=st.ttest_1samp(a = X[:, i], popmean = 0)\n", 122 | " p_values.append(result.pvalue)\n", 123 | " if result.pvalue < 0.05:\n", 124 | " decision.append('Reject H0')\n", 125 | " else:\n", 126 | " decision.append('Do not reject H0')\n", 127 | "\n", 128 | "\n", 129 | "ground_truth = np.repeat(['Reject H0', 'Do not reject H0'], [50, 50], axis=0)\n", 130 | "labels = ['Reject H0', 'Do not reject H0']\n", 131 | "cm = confusion_matrix (ground_truth, decision, labels=labels)\n", 132 | "print(cm)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## 13.6.2 The Family-Wise Error Rate" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "m = range(500)\n", 149 | "fwe1 = list(map(lambda x:1 - pow(1 - 0.05,x),m))\n", 150 | "fwe2 = list(map(lambda x:1 - pow(1 - 0.01,x),m))\n", 151 | "fwe3 = list(map(lambda x:1 - pow(1 - 0.001,x),m))" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "plt.plot(m, fwe1, label = \"0.05\")\n", 161 | "plt.plot(m, fwe2, label = \"0.01\")\n", 162 | "plt.plot(m, fwe3, label = \"0.001\")\n", 163 | "plt.xlabel('Number of tests in log scale')\n", 164 | "plt.ylabel('FWE')\n", 165 | "plt.xscale(\"log\")\n", 166 | "plt.legend()\n", 167 | "plt.show()\n", 168 | "\n", 169 | "\"\"\" \n", 170 | "We see that setting α = 0.05 results in a high FWER even for moderate m. 
\n", 171 | "With α = 0.01, we can test no more than five null hypotheses before the FWER exceeds 0.05. \n", 172 | "Only for very small values, such as α = 0.001, do we manage to ensure a small FWER, \n", 173 | "at least for moderately-sized m.\n", 174 | "\n", 175 | "Of course, the problem with setting α to such a low value is that we are likely to \n", 176 | "make a number of Type II errors: in other words, our power is very low.\n", 177 | "\"\"\"" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "Fund = pd.read_csv('data/Fund.csv')" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "Fund.head()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# we will do the one sample t test for the first manager\n", 205 | "result=st.ttest_1samp(a = Fund['Manager1'], popmean = 0)\n", 206 | "print(result.pvalue)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "p_values = []\n", 216 | "manager_number = 5 \n", 217 | "\n", 218 | "for i in range(manager_number):\n", 219 | " result=st.ttest_1samp(a = Fund.iloc[:,i], popmean = 0)\n", 220 | " p_values.append(result.pvalue)\n", 221 | "\n", 222 | "print(p_values)\n", 223 | "\n", 224 | "\"\"\" \n", 225 | "The p-values are low for Managers One and Three, and high for the other three managers. \n", 226 | "However, we cannot simply reject H01 and H03, since this would fail to account for \n", 227 | "the multiple testing that we have performed. \n", 228 | "Instead, we will conduct Bonferroni’s method and Holm’s method to control the FWER.\n", 229 | "\"\"\"" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# we could bonferroni to adjust the raw p-values and take care of family wise error rate\n", 239 | "reject, p_values_corrected, alphacSidak, alphacBonf = multipletests(p_values, method = 'bonferroni')\n", 240 | "print(p_values_corrected)\n", 241 | "\"\"\" \n", 242 | "Therefore, using Bonferroni’s method, \n", 243 | "we are able to reject the null hypothesis only for Manager One while controlling the FWER at 0.05.\n", 244 | "This information is also available in the variable reject.\n", 245 | "\"\"\"\n", 246 | "print(reject)\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Bonferroni's method is more conservative. 
We could apply holm's method to control the FWER\n", 256 | "reject, p_values_corrected, alphacSidak, alphacBonf = multipletests(p_values, method = 'holm')\n", 257 | "print(p_values_corrected)\n", 258 | "print(reject)\n", 259 | "\"\"\" \n", 260 | "By contrast, using Holm’s method, the adjusted p-values indicate that we can reject the null hypotheses \n", 261 | "for Both Managers One and Three at a FWER of 0.05.\n", 262 | "\"\"\"" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# we can see the average for each manager \n", 272 | "Fund.mean(axis=0)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "# next, we could test whether 2 managers are significantly different. For example Manager 1 and Manager 2\n", 282 | "result=st.ttest_rel(a = Fund['Manager1'], b = Fund['Manager2'])\n", 283 | "print(result.pvalue)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "\"\"\" \n", 293 | "However, we decided to perform this test only after examining the data and \n", 294 | "noting that Managers One and Two had the highest and lowest mean performances. \n", 295 | "In a sense, this means that we have implicitly performed a manual selection \n", 296 | "from the 5(5 − 1)/2 = 10 hypothesis tests, rather than just one. \n", 297 | "Hence, we use Tukey’s method in order to adjust for multiple testing. \n", 298 | "\"\"\"\n", 299 | "returns = Fund.iloc[:, :5].to_numpy().flatten(order='F') # we flatten by col (i.e. order='F')\n", 300 | "manager = np.repeat(['1', '2', '3', '4', '5'], repeats=Fund.shape[0])\n", 301 | "\n", 302 | "# perform Tukey's test\n", 303 | "tukey = pairwise_tukeyhsd(endog=returns, groups=manager, alpha=0.05)\n", 304 | "\n", 305 | "print(tukey)\n", 306 | "\n", 307 | "\"\"\" \n", 308 | "Notice that the p-value for the difference between Managers One and Two has increased from 0.038 to 0.186, \n", 309 | "so there is no longer clear evidence of a difference between the managers’ performances.\n", 310 | "\n", 311 | "\"\"\"" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "## 13.6.3 The False Discovery Rate" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "p_values = []\n", 328 | "manager_number = Fund.shape[1]\n", 329 | "\n", 330 | "for i in range(manager_number):\n", 331 | " result=st.ttest_1samp(a = Fund.iloc[:,i], popmean = 0)\n", 332 | " p_values.append(result.pvalue)\n", 333 | "\n", 334 | "print(p_values[0:10])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "\"\"\" \n", 344 | "There are far too many managers to consider trying to control the FWER. \n", 345 | "Instead, we focus on controlling the FDR: that is, the expected fraction of rejected null \n", 346 | "hypotheses that are actually false positives. 
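The Benjamini-Hochberg procedure controls the FDR at level q by sorting the m p-values and rejecting every null hypothesis up to the largest j with p_(j) <= (j/m)q. 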
\n", 347 | "\"\"\"\n", 348 | "\n", 349 | "reject, p_values_corrected, alphacSidak, alphacBonf = multipletests(p_values, method = 'fdr_bh')\n", 350 | "print(p_values_corrected[0:10])\n", 351 | "\n", 352 | "\"\"\" \n", 353 | "The q-values output by the Benjamini-Hochberg procedure can be interpreted as the smallest \n", 354 | "FDR threshold at which we would reject a particular null hypothesis.\n", 355 | "\n", 356 | "For instance, a q-value of 0.1 indicates that we can reject the corresponding null hypothesis\n", 357 | "at an FDR of 10% or greater, but that we cannot reject the null hypothesis at an FDR below 10%.\n", 358 | "\"\"\"" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# we would find that 146 of the 2,000 fund managers have a p_values_corrected below 0.1\n", 368 | "sum(p_values_corrected <= .1)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "# if we use bonferroni method, we will find None\n", 378 | "sum(np.array(p_values) <= .1/Fund.shape[1])" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "## 13.6.4 A Re-Sampling Approach" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "# I saved the gene expression data as a json file, in python we could load the json file using the json library\n", 395 | "# after reading in the data, we can use the data is same as a dictionary, we can use the keys to access the data\n", 396 | "\n", 397 | "f = open('./data/Khan.json')\n", 398 | "Khan = json.load(f)\n", 399 | "\n", 400 | "X_train = np.array(Khan['xtrain'])\n", 401 | "y_train = np.array(Khan['ytrain'])\n", 402 | "X_test = np.array(Khan['xtest'])\n", 403 | "y_test = np.array(Khan['ytest'])" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "x = np.concatenate((X_train, X_test), axis=0)\n", 413 | "y = np.concatenate((y_train, y_test), axis=0)\n", 414 | "unique, counts = np.unique(y, return_counts=True)\n", 415 | "print(counts)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "# x1: take the x for cancer type == 2\n", 425 | "# x2: take the x for cancer type == 4\n", 426 | "x1 = x[y==2, :]\n", 427 | "x2 = x[y==4, :]\n", 428 | "n1 = x1.shape[0]\n", 429 | "n2 = x2.shape[0]\n", 430 | "print(n1)\n", 431 | "print(n2)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "# performing a standard two-sample t-test on the 11th (gene_index = 10 in python) gene produces a test-statistic \n", 441 | "gene_index = 10\n", 442 | "original_result=st.ttest_ind(a=x1[:,gene_index], b=x2[:,gene_index], equal_var=True)\n", 443 | "print(original_result.statistic)\n", 444 | "print(original_result.pvalue)\n", 445 | "\n", 446 | "\"\"\" \n", 447 | "The 2 sample t-test produces a test-statistic of −2.09 and an associated p-value of 0.0412, \n", 448 | "suggesting modest evidence of a difference in mean expression levels between the two cancer types.\n", 449 | "\"\"\"" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 
| "outputs": [], 457 | "source": [ 458 | "\"\"\" \n", 459 | "Instead of doing a parameterized 2 sample t-test, we could do a non-parameterized test(i.e. permutation test).\n", 460 | "we can randomly split the 54 patients (in cancer group 2 and 4) into two groups of 29 and 25 \n", 461 | "(same as the original split),and compute a new test statistic. \n", 462 | "Under the null hypothesis of no difference between the groups, this new test statistic should have \n", 463 | "the same distribution as our original one. \n", 464 | "Repeating this process many (i.e.10,000) times allows us to approximate the null distribution of the test statistic. \n", 465 | "We compute the fraction of the time that our observed test statistic exceeds the test statistics obtained \n", 466 | "via re-sampling.\n", 467 | "\"\"\"\n", 468 | "\n", 469 | "np.random.seed(21)\n", 470 | "iteration = 10000\n", 471 | "test_stats = []\n", 472 | "x_temp = np.concatenate((x1[:,gene_index], x2[:,gene_index]), axis=0)\n", 473 | "\n", 474 | "for i in range(iteration):\n", 475 | " np.random.shuffle(x_temp)\n", 476 | " result_temp = st.ttest_ind(a=x_temp[:n1], b=x_temp[-n2:], equal_var=True)\n", 477 | " test_stats.append(result_temp.statistic)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "print(np.mean((np.abs(test_stats) >= np.abs(original_result.statistic))))\n", 487 | "\n", 488 | "\"\"\" \n", 489 | "This fraction is our re-sampling-based p-value. It is almost identical to the p-value of 0.0412 \n", 490 | "obtained using the theoretical null distribution.\n", 491 | "\n", 492 | "The reason for this is that the parametrized distribution is a pretty good assumption in this case\n", 493 | "To see this, we can plot the histogram of the re-sampled statistics vs. parametrized distribution. \n", 494 | "\n", 495 | "We could try other genes (i.e. 
gene_index = 876) to see its theoretical and re-sampling null distributions are \n", 496 | "quite different\n", 497 | "\"\"\"" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# construct the t distribution \n", 507 | "df = n1 + n2 - 2\n", 508 | "rv = st.t(df)\n", 509 | "x = np.linspace(-4.2, 4.2, 1000)\n", 510 | "\n", 511 | "\n", 512 | "plt.hist(test_stats, 100, density=True, facecolor='g', alpha=0.75)\n", 513 | "plt.plot(x, rv.pdf(x), 'k-', lw=2)\n", 514 | "plt.xlabel('Null Distribution of Test Statistic')\n", 515 | "plt.ylabel('Probability')\n", 516 | "plt.title('Histogram of re-sample stats')\n", 517 | "plt.xlim(-4.2, 4.2)\n", 518 | "plt.grid(True)\n", 519 | "plt.show()" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "# we could do this for 100 and see how FDR works under re-sample \n", 529 | "# it would be good to use small iterations to make sure the code runs okay \n", 530 | "num_gene = 100\n", 531 | "iteration = 500\n", 532 | "test_stats_matrix = []\n", 533 | "test_stats_origin = []\n", 534 | "\n", 535 | "for j in range(num_gene):\n", 536 | " gene_index = j \n", 537 | " x_temp = np.concatenate((x1[:,gene_index], x2[:,gene_index]), axis=0)\n", 538 | " result_origin = st.ttest_ind(a=x1[:,gene_index], b=x2[:,gene_index], equal_var=True)\n", 539 | " test_stats_origin.append(result_origin.statistic)\n", 540 | " test_stats = []\n", 541 | " for i in range(iteration):\n", 542 | " np.random.shuffle(x_temp)\n", 543 | " result_temp = st.ttest_ind(a=x_temp[:n1], b=x_temp[-n2:], equal_var=True)\n", 544 | " test_stats.append(result_temp.statistic)\n", 545 | " \n", 546 | " test_stats_matrix.append(test_stats)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [ 555 | "test_stats_origin_sorted = np.sort(np.abs(test_stats_origin))" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "Rs = []\n", 565 | "Vs = []\n", 566 | "FDRs = []\n", 567 | "for j in range(num_gene):\n", 568 | " R = np.sum(np.abs(test_stats_origin) >= test_stats_origin_sorted[j])\n", 569 | " V = np.sum(np.abs(test_stats_matrix) >= test_stats_origin_sorted[j]) / iteration\n", 570 | " Rs.append(R)\n", 571 | " Vs.append(V)\n", 572 | " FDRs.append(V*1.0/R)\n", 573 | "\n", 574 | "Rs = np.array(Rs)\n", 575 | "Vs = np.array(Vs)\n", 576 | "FDRs = np.array(FDRs) " 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "print(np.max(Rs[FDRs <= .1]))\n", 586 | "print(np.max(Rs[FDRs <= .2]))" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "plt.plot(Rs, FDRs, 'k-', lw=2)\n", 596 | "plt.xlabel('Number of Rejections')\n", 597 | "plt.ylabel('False Discovery Rate')\n", 598 | "plt.show()" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "# End of Chapter 13" 608 | ] 609 | } 610 | ], 611 | "metadata": { 612 | "interpreter": { 613 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 614 | }, 615 | "kernelspec": { 616 | "display_name": "Python 
2.7.16 64-bit ('base': conda)", 617 | "name": "python3" 618 | }, 619 | "language_info": { 620 | "codemirror_mode": { 621 | "name": "ipython", 622 | "version": 3 623 | }, 624 | "file_extension": ".py", 625 | "mimetype": "text/x-python", 626 | "name": "python", 627 | "nbconvert_exporter": "python", 628 | "pygments_lexer": "ipython3", 629 | "version": "3.6.2" 630 | } 631 | }, 632 | "nbformat": 4, 633 | "nbformat_minor": 2 634 | } 635 | -------------------------------------------------------------------------------- /Chapter_2_sec_3.1_3.5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# 2.3 Lab: Introduction to R" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "## 2.3.1 Basic Commands" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "slideshow": { 30 | "slide_type": "slide" 31 | } 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "# best practice is to have all the modules imported at the top of the file, but for this one, I will import them when we need them\n", 36 | "import numpy as np # for calculation purpose, let us use np.array \n", 37 | "import random # for the random number generation\n", 38 | "\n", 39 | "x = np.array([1, 3, 2, 5])\n", 40 | "# use print to see the array\n", 41 | "print(x)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "slideshow": { 49 | "slide_type": "slide" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "x = np.array([1, 6, 2])\n", 55 | "print(x)\n", 56 | "y = [1, 4, 3]\n", 57 | "print(y)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# use len() to find length of a vector\n", 67 | "len(x) " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "len(y)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "print(x + y) # please note that we define x and y a little bit differently, but we still can do the calculation " 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# The whos function allows us to look at a list of all of the objects, such as data and functions, that we have saved so far\n", 95 | "%whos" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# reset_selective x\n", 105 | "del x " 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "%whos" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# read the description of a function \n", 124 | "%whos?" 
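, "\n", "np.mean?  # the trailing ? works for library functions too; this shows NumPy's mean docstring" 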
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# create a matrix (here as a nested list) \n", 134 | "x = [[1,2],[3, 4]]\n", 135 | "print(x)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# we could also reshape a one-dimensional array to a matrix\n", 145 | "x = np.array([1, 2, 3, 4])\n", 146 | "print(x)\n", 147 | "x = np.reshape(x, [2,2])\n", 148 | "print(x)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# then we can use the matrix to do some calculations\n", 158 | "np.sqrt(x)\n", 159 | "x**2\n", 160 | "np.square(x)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# use random sampling to generate random numbers/arrays/matrices\n", 170 | "mu, sigma = 0, 1\n", 171 | "x = np.random.normal(mu, sigma, 5)\n", 172 | "y = x + np.random.normal(20, 0.1, 5)\n", 173 | "print(x)\n", 174 | "print(y)\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# more calculations: the correlation matrix of x and y\n", 184 | "np.corrcoef(x, y) " 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# the above returns the correlation matrix; let us see just the correlation coefficient between x and y\n", 194 | "np.corrcoef(x, y)[0,1]" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "# we can use the seed function to set the random seed, so that every time we run the code, we get the same result\n", 206 | "np.random.seed(2333)  # note: we seed np.random itself; random.seed alone would not affect np.random.normal below" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# after setting the seed, this should generate the same result every time we run the notebook\n", 216 | "np.random.normal(mu, sigma, 5) " 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# you could increase the number of samples to see the empirical distribution converge to the theoretical distribution\n", 226 | "mu, sigma = 0, 1\n", 227 | "num_samples = 10\n", 228 | "x = np.random.normal(mu, sigma, num_samples)\n", 229 | "print(np.mean(x))\n", 230 | "print(np.var(x))\n", 231 | "print(np.sqrt(np.var(x)))\n", 232 | "print(np.std(x))" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## 2.3.2 Graphics" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "import numpy as np # for calculation purposes, let us use np.array \n", 249 | "import random # for random number generation \n", 250 | "\n", 251 | "x = np.random.normal(0, 1, 100)\n", 252 | "y = np.random.normal(0, 1, 100)\n", 253 | "\n", 254 | "# in python, matplotlib is the most widely used library for plotting \n", 255 | "# matplotlib.pyplot is a collection of command style functions that make matplotlib work like MATLAB.\n", 256 | "import matplotlib.pyplot as plt\n", 257 | "\n", 258 | "\n", 259 | 
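"# 'bo' is a MATLAB-style format string: 'b' sets the color blue and 'o' draws circle markers\n", 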
"plt.plot(x, y, 'bo') # please use plt.plot? to look at more options \n", 260 | "plt.ylabel(\"this is the y-axis\")\n", 261 | "plt.xlabel(\"this is the x-axis\")\n", 262 | "plt.title(\"Plot of X vs Y\")\n", 263 | "plt.savefig('Figure.pdf') # use plt.savefig function to save images\n", 264 | "plt.show() \n" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# note the arange excludes right end of rande specification \n", 274 | "x = np.arange(1, 11) \n", 275 | "print(x) " 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "# note: np.arange actually can result in unexpected results; check np.arange(0.2, 0.6, 0.4) vs np.arange(0.2, 1.6, 1.4)\n", 285 | "print(np.arange(0.2, 0.6, 0.4))\n", 286 | "print(np.arange(0.2, 1.6, 1.4))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "# in order to use Pi, math module needs to loaded first\n", 296 | "import math\n", 297 | "x = np.linspace(-math.pi, math.pi, num = 50)\n", 298 | "print(x)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "collapsed": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "import matplotlib.cm as cm\n", 310 | "import matplotlib.mlab as mlab\n", 311 | "y = x\n", 312 | "X, Y = np.meshgrid(x,y)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "%whos" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# same as above, use plt.contour? to explore the options\n", 331 | "f = np.cos(Y)/(1 + np.square(X))\n", 332 | "CS = plt.contour(X, Y, f)\n", 333 | "plt.show()" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "# I think imshow looks nicer for heatmap, use 'extent =' fix the x, y axis\n", 343 | "fa = (f - f.T)/2 #f.T for transpose or tranpose(f)\n", 344 | "plt.imshow(fa, extent=(x[0], x[-1], y[0], y[-1])) \n", 345 | "plt.show()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "scrolled": true 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "from mpl_toolkits.mplot3d import axes3d\n", 357 | "fig = plt.figure()\n", 358 | "ax = fig.add_subplot(111, projection='3d')\n", 359 | "ax.plot_wireframe(X, Y, fa)\n", 360 | "plt.show()" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## 2.3.3 Indexing Data \n", 368 | "Here we use np array. 
363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## 2.3.3 Indexing Data \n", 368 | "Here we use a np.array. If the data structure is something else, the methods below may not work" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "A = np.arange(1,17,1).reshape(4, 4).transpose()\n", 378 | "print(A)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "# one thing to note here is that in Python, the index starts from 0, not 1\n", 388 | "print(A[2, 3])" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "# try the same index as the book, and we get a different number. The reason is that R starts the index from 1 (Matlab too), but Python starts the index from 0. To select the same number (10) as the book did, we reduce each index by 1\n", 398 | "print(A[1, 2])" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "# to select a submatrix, the non-singleton dimension of your indexing array needs to be aligned with the axis you're indexing into, \n", 408 | "# e.g. for an n x m 2D subarray: A[n by 1 array,1 by m array]\n", 409 | "A[[[0],[2]], [1,3]]" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# this is another way to do that\n", 419 | "A[0:3:2, 1:4:2] " 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "# select all columns in those two rows \n", 429 | "A[0:3:2,:]" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "# select all rows in those two columns \n", 439 | "A[:, 1:4:2] " 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "# the last two examples include either no index for the columns or no index for the rows. These indicate that Python should include all columns or all rows, respectively\n", 449 | "A[0,:]" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "# the '-' sign has a different meaning and a good usage in Python: it means indexing from the end, so -1 means the last element \n", 459 | "A[-1, -1] " 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "# there are other ways to let Python keep all rows except certain indices. For example, we could also use a boolean mask. 
\n", 469 | "ind = np.ones((4,), bool)\n", 470 | "ind[[0,2]] = False\n", 471 | "print(ind)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "A[ind,:]" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "# we do not specify the row or column, the default is the for the row\n", 490 | "A[ind]" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "# we use .shape to get the shape of the matrix \n", 500 | "A.shape" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "collapsed": true 507 | }, 508 | "source": [ 509 | "## 2.3.4 Loading Data\n", 510 | "\n", 511 | "In Python, Pandas is a common used module to read from file into a data frame. I downloaded the Auto.csv from the book website. First, take a look at the csv file. There are headers, missing value is marked by '?'." 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "import pandas as pd \n", 521 | "Auto = pd.read_csv('data/Auto.csv', header=0, na_values='?')" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "# we could use .head to see the first few rows (default = 5) of the data \n", 531 | "Auto.head()" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "# check one record with missing value, and make sure the missing value is correctly imported. \n", 541 | "# Here we use the i.loc to select the row which is different from the indexing method above\n", 542 | "# the reason is that Auto is a pandas dataframe, while the indexing method was for a numpy array\n", 543 | "Auto.iloc[32]" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Use the same .shape function as in ndarray to find out the dimension of the data frame \n", 553 | "Auto.shape" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "# an alternative way to select the first 4 rows. 
\n", 563 | "Auto[:4]" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "# an alternative way to select the first 4 rows and first 2 columns.\n", 573 | "Auto.iloc[:4, :2]" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "# we can use list to find the column names or use .columns\n", 583 | "print(list(Auto))\n", 584 | "print(Auto.columns)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "# Use .isnull and .sum to find out how many NaNs in each variables\n", 594 | "Auto.isnull().sum()" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": null, 600 | "metadata": { 601 | "collapsed": true 602 | }, 603 | "outputs": [], 604 | "source": [ 605 | "# after the previous steps, there are 397 obs in the data and only 5 with missing values. We can just drop the ones with missing values \n", 606 | "print(Auto.shape)\n", 607 | "Auto = Auto.dropna()\n", 608 | "print(Auto.shape)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": {}, 614 | "source": [ 615 | "## 2.3.5 Additional Graphical and Numerical Summaries" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "# refer a column of data frame by name, by using a '.'. Ref the options in plt.plot for more.\n", 625 | "plt.plot(Auto.cylinders, Auto.mpg, 'ro')\n", 626 | "plt.show()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "# use .hist to get the histogram of certain variables. column = to specify which variable\n", 636 | "Auto.hist(column = ['cylinders', 'mpg'])\n", 637 | "plt.show()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "# use the .describe() to get a summary of the data frame. Use .describe ( include = 'all' ) for mix types, use describe(include = [np.number]) for numerical columns, use describe(include = ['O']) for objects.\n", 647 | "Auto.describe()" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": { 654 | "collapsed": true 655 | }, 656 | "outputs": [], 657 | "source": [ 658 | "# we can change type of certain variable(s). 
650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": { 654 | "collapsed": true 655 | }, 656 | "outputs": [], 657 | "source": [ 658 | "# we can change the type of certain variable(s). Here we change cylinders into a categorical variable \n", 659 | "Auto['cylinders'] = Auto['cylinders'].astype('category')" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "Auto.describe()" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "Auto.describe(include= 'all')" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": {}, 684 | "outputs": [], 685 | "source": [ 686 | "# End of Chapter 2" 687 | ] 688 | } 689 | ], 690 | "metadata": { 691 | "anaconda-cloud": {}, 692 | "interpreter": { 693 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 694 | }, 695 | "kernelspec": { 696 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 697 | "name": "python3" 698 | }, 699 | "language_info": { 700 | "codemirror_mode": { 701 | "name": "ipython", 702 | "version": 3 703 | }, 704 | "file_extension": ".py", 705 | "mimetype": "text/x-python", 706 | "name": "python", 707 | "nbconvert_exporter": "python", 708 | "pygments_lexer": "ipython3", 709 | "version": "3.6.2" 710 | } 711 | }, 712 | "nbformat": 4, 713 | "nbformat_minor": 1 714 | } 715 | -------------------------------------------------------------------------------- /Chapter_3_sec_6.1_6.7.ipynb: 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 3.6 Lab: Linear Regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 3.6.1 Libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# in Python, a module can be imported by a command similar to 'import numpy as np'. \n", 24 | "# it is a good practice to maintain a section at the beginning of the notebook to import all necessary modules.\n", 25 | "# for a new module, you could use pip to install it. \n", 26 | "# for example 'pip install numpy'\n", 27 | "import numpy as np\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import pandas as pd \n", 30 | "import math\n", 31 | "import statsmodels.api as sm\n", 32 | "import statsmodels.formula.api as smf\n", 33 | "from statsmodels.stats.outliers_influence import variance_inflation_factor\n", 34 | "from statsmodels.graphics.regressionplots import *\n", 35 | "from sklearn import datasets, linear_model\n", 36 | "from patsy import dmatrices" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "# since in Python, there is no default MASS module or Boston dataset, I will read in the Boston dataset from CSV. 
The data is in the ./data folder.\n", 48 | "Boston = pd.read_csv('data/Boston.csv', header=0)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 3.6.2 Simple Linear Regression" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# use the commands we learned in the previous chapter to examine the data.\n", 65 | "list(Boston) # or Boston.columns" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "Boston.head()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "Boston.shape" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# to have similar formula notation as R, use the following import. \n", 93 | "# One thing to note is the quoted formula string in the fitting step and the '.fit()' at the end.\n", 94 | "# after import statsmodels.formula.api as smf, we use smf to call the model. Of course, there are other ways to run linear regression in Python, such as sklearn.\n", 95 | "lm = smf.ols ('medv~lstat', data = Boston).fit()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "print(lm.summary())" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# use dir() to get a list of all the attributes an object has\n", 114 | "dir(lm)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# we can try a few \n", 124 | "print(lm.params)\n", 125 | "print(lm.conf_int())" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# provide prediction for 3 observations\n", 135 | "lm.predict(pd.DataFrame({'lstat':[5, 10, 15]}))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# plot the fitted line, we only take two extreme points to make the plot\n", 145 | "X_new = pd.DataFrame({'lstat': [Boston.lstat.min(), Boston.lstat.max()]})\n", 146 | "preds = lm.predict(X_new)\n", 147 | "Boston.plot(kind='scatter', x='lstat', y='medv')\n", 148 | "plt.plot(X_new, preds, c='red', linewidth=2)\n", 149 | "plt.show()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "fig, ((ax1, ax2), (ax3, ax4))= plt.subplots(2, 2)\n", 159 | "ax1.plot(Boston.lstat, lm.predict(),'ro')\n", 160 | "ax2.plot(lm.predict(), lm.resid, 'go')\n", 161 | "ax3.plot(lm.predict(), lm.resid_pearson, 'bo')\n", 162 | "plt.show()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# the statistics of the linear regression are mostly stored in lm.get_influence(), for example, the Cook's distances and leverage.\n", 172 | "dir(lm.get_influence())\n", 173 | "# for example, the following identifies the observation with the largest leverage \n", 174 | 
"np.argmax(lm.get_influence().hat_matrix_diag)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# from statsmodels.graphics.regressionplots import * just as a reference\n", 184 | "plot_leverage_resid2(lm)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "# as mentioned above. For machine learning models, sklearn is the most common used module, but sklearn is a little bit less on statistics.\n", 194 | "x = pd.DataFrame(Boston.lstat)\n", 195 | "y = Boston.medv\n", 196 | "print(x.shape)\n", 197 | "\n", 198 | "model = linear_model.LinearRegression()\n", 199 | "model.fit(x, y)\n", 200 | "print(model.intercept_)\n", 201 | "print(model.coef_)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## 3.6.3 Multiple Linear Regression" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "# we can still use smg.ols to run multiple linear regression.\n", 220 | "lm = smf.ols ('medv~lstat+age', data = Boston).fit()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "print(lm.summary())" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# if we want to use all the variable. We can use the following trick to manually construct the list. In Python, most of time, you have to manully construct the variable list.\n", 239 | "all_columns = \"+\".join(Boston.columns.difference([\"medv\"]))\n", 240 | "my_formula = \"medv~\" + all_columns\n", 241 | "lm = smf.ols(my_formula, data=Boston).fit()\n", 242 | "print(lm.summary())" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "# unlike R, Python is not fully up speeded to all the statistics. 
245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "# unlike R, Python has not fully caught up on all the statistics. If you want the VIF of the variables in a linear model, you have to code a little bit.\n", 254 | "# from patsy import dmatrices\n", 255 | "# from statsmodels.stats.outliers_influence import variance_inflation_factor" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "y, X = dmatrices(my_formula, data=Boston, return_type='dataframe')\n", 265 | "vif_coeff = {}\n", 266 | "for i in range(X.shape[1]):\n", 267 | " vif_coeff[X.columns[i]] = variance_inflation_factor(np.array(X.dropna()),i)\n", 268 | " \n", 269 | "print(vif_coeff)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## 3.6.4 Interaction Terms" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# we use * to add interaction terms\n", 286 | "lm = smf.ols('medv~lstat * age', data=Boston).fit()\n", 287 | "print(lm.summary())" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "## 3.6.5 Non-linear Transformations of the Predictors " 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "lm_order1 = smf.ols('medv~ lstat', data=Boston).fit()\n", 304 | "lm_order2 = smf.ols('medv~ lstat+ I(lstat ** 2.0)', data=Boston).fit()\n", 305 | "print(lm_order2.summary())" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "fig, ((ax1, ax2), (ax3, ax4))= plt.subplots(2, 2)\n", 315 | "ax1.plot(Boston.lstat, lm_order1.predict(),'ro')\n", 316 | "ax3.plot(lm_order1.predict(), lm_order1.resid, 'go')\n", 317 | "ax4.plot(lm_order1.predict(), lm_order1.resid_pearson, 'bo')\n", 318 | "plt.show()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "### if we add in the second order term, we can see the residuals are more random" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "fig, ((ax1, ax2), (ax3, ax4))= plt.subplots(2, 2)\n", 335 | "ax1.plot(Boston.lstat, lm_order2.predict(),'ro')\n", 336 | "ax2.plot(Boston.lstat ** 2.0, lm_order2.predict(),'ro')\n", 337 | "ax3.plot(lm_order2.predict(), lm_order2.resid, 'go')\n", 338 | "ax4.plot(lm_order2.predict(), lm_order2.resid_pearson, 'bo')\n", 339 | "plt.show()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "# there is an anova function built in already in statsmodels. \n", 349 | "# if you know what to do, use the key words to google it and likely you will find a very good answer. \n", 350 | "# here we compare the models with one order of lstat and two orders of lstat. 
\n", 351 | "# by looking at the p value that will reject the null hypothesis that the coefficent of lstat**2 equals 0.\n", 352 | "table = sm.stats.anova_lm(lm_order1, lm_order2)\n", 353 | "print(table)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "lm_log = smf.ols('medv~ np.log(rm)', data=Boston).fit()\n", 363 | "lm_log.summary()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "## 3.6.6 Qualitative Predictors \n", 371 | "\n", 372 | "I prepared the Carseats file from .Rdata. And it is saved under the data folder. Let us load them in and explore this dataset." 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": true 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "Carseats = pd.read_csv('data/Carseats.csv', header=0)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "list(Carseats)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "Carseats.dtypes" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "Carseats.head()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "lm_carseats = smf.ols('Sales ~ Income + Advertising + Price + Age', data = Carseats).fit()" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "lm_carseats.summary()" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "# let us create dummy variables using get_dummies, then exclude the first dummy column\n", 440 | "ShelveLoc_dummies = pd.get_dummies(Carseats.ShelveLoc, prefix='ShelveLoc').iloc[:,1:]" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "Carseats_dummy = pd.concat([Carseats, ShelveLoc_dummies], axis=1)\n", 450 | "Carseats_dummy.head()" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "# then the model buliding will be the same with all numerrical variables.\n", 460 | "lm_carseats_dummy = smf.ols('Sales ~ Income + Advertising + Price + Age + ShelveLoc_Good + ShelveLoc_Medium', \n", 461 | " data = Carseats_dummy).fit()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "# the interpretation of the coefficients are holding everything fixed, Medium shelve location is associated with an average\n", 471 | "# increase of sale around 2.0046. 
\n", 472 | "lm_carseats_dummy.summary() " 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "# Compapre the summary of two models, one with explicit encoding of dummy varible, while the other used the built-in function.\n", 484 | "lm_carseats_wo_dummy = smf.ols('Sales ~ Income + Advertising + Price + Age + C(ShelveLoc)', \n", 485 | " data = Carseats).fit()\n", 486 | "lm_carseats_wo_dummy.summary()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## 3.6.7 Writing Functions" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "# let us write a simple function to print current time. \n", 503 | "# yhe key word in Python for user defined function is 'def'. \n", 504 | "# pay attention to the ':'. The difference betwwen R (others) and Python is that Python \n", 505 | "# forces you to obey its indentation rules. For example, the following function won't work because of the extra space in front of 'print'.\n", 506 | "def print_current_time_wrong():\n", 507 | " from datetime import datetime # this is very bad practice !!! \n", 508 | " print(str(datetime.now())) " 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": { 515 | "collapsed": true 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "def print_current_time():\n", 520 | " from datetime import datetime\n", 521 | " print (str(datetime.now())) " 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "print_current_time()" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "# End of Chapter 3." 
540 | ] 541 | } 542 | ], 543 | "metadata": { 544 | "anaconda-cloud": {}, 545 | "interpreter": { 546 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 547 | }, 548 | "kernelspec": { 549 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 550 | "name": "python3" 551 | }, 552 | "language_info": { 553 | "codemirror_mode": { 554 | "name": "ipython", 555 | "version": 3 556 | }, 557 | "file_extension": ".py", 558 | "mimetype": "text/x-python", 559 | "name": "python", 560 | "nbconvert_exporter": "python", 561 | "pygments_lexer": "ipython3", 562 | "version": "3.6.2" 563 | } 564 | }, 565 | "nbformat": 4, 566 | "nbformat_minor": 1 567 | } 568 | -------------------------------------------------------------------------------- /Chapter_5_sec_3.1_3.4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 5.3 Lab: Cross-Validation and the Bootstrap" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 5.3.1 The Validation Set Approach" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import scipy\n", 26 | "import pandas as pd \n", 27 | "import math\n", 28 | "import random\n", 29 | "import statsmodels.api as sm\n", 30 | "import statsmodels.formula.api as smf\n", 31 | "from statsmodels.graphics.regressionplots import *\n", 32 | "from sklearn import datasets, linear_model\n", 33 | "from sklearn.model_selection import KFold, cross_val_score\n", 34 | "from sklearn.preprocessing import PolynomialFeatures\n", 35 | "from sklearn.linear_model import LinearRegression\n", 36 | "from sklearn.pipeline import Pipeline\n", 37 | "from collections import OrderedDict" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "Auto = pd.read_csv('data/Auto.csv', header=0, na_values='?')\n", 47 | "Auto = Auto.dropna().reset_index(drop=True) # drop the observation with NA values and reindex the obs from 0\n", 48 | "print(Auto.shape)\n", 49 | "print(Auto.head())" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# split the data into training and record the index of train samples\n", 59 | "np.random.seed(1)\n", 60 | "train = np.random.choice(Auto.shape[0], 196, replace=False)\n", 61 | "select = np.in1d(range(Auto.shape[0]), train)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# start to build the model\n", 71 | "lm = smf.ols ('mpg~horsepower', data = Auto[select]).fit()\n", 72 | "print(lm.summary())" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# to follow the book, get prediction for all the observations in the dataset\n", 82 | "# here we use ~ select to exclude the result of the training samples\n", 83 | "preds = lm.predict(Auto)\n", 84 | "square_error = (Auto['mpg'] - preds)**2\n", 85 | "print('-------- Test error for 1st order model --------')\n", 86 | "print(np.mean(square_error[~select]))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | 
"outputs": [], 94 | "source": [ 95 | "# build a model with 2nd order of features \n", 96 | "lm2 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0)', data = Auto[select]).fit()\n", 97 | "preds = lm2.predict(Auto)\n", 98 | "square_error = (Auto['mpg'] - preds)**2\n", 99 | "print('--------Test error for 2nd order--------')\n", 100 | "print(square_error[~select].mean())" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# build a model with 3rd order of features \n", 110 | "lm3 = smf.ols ('mpg~horsepower + I(horsepower ** 2.0) + I(horsepower ** 3.0)', data = Auto[select]).fit()\n", 111 | "preds = lm3.predict(Auto)\n", 112 | "square_error = (Auto['mpg'] - preds)**2\n", 113 | "print('--------Test rror for 3rd order--------')\n", 114 | "print(np.mean(square_error[~select]))\n", 115 | "\n", 116 | "\"\"\" \n", 117 | "These results are consistent with our previous findings: a model that predicts mpg using a quadratic function of \n", 118 | "horsepower performs better than a model that involves only a linear function of horsepower, \n", 119 | "and there is little evidence in favor of a model that uses a cubic function of horsepower.\n", 120 | "\"\"\"" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# if we look at the summmary for 3rd order regression, \n", 130 | "# the coefficient of the 3rd order term is not statistically significant. \n", 131 | "# I will use this as Supporting evidence for the above claim. \n", 132 | "print(lm3.summary())" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## 5.3.2 Leave-One-Out Cross-Validation\n", 140 | "The LOOCV estimates only keep one sample in the validation data and use the rest of the data to train the model. This way the training model has similar dataset comparing to the model trained on entire dataset." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# OLS fit \n", 150 | "ols_fit = smf.ols ('mpg~horsepower', data = Auto).fit()\n", 151 | "print(ols_fit.params)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "# GLM fit. Compare with OLS fit, the coeffs are the same\n", 161 | "glm_fit = smf.glm('mpg~horsepower', data = Auto).fit()\n", 162 | "print(glm_fit.params)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "# trying CV in Python is not as easy as that in R. 
It will require some manual coding.\n", 174 | "# to use some of the implemented functions in Python, we use sklearn for the linear model \n", 175 | "# from sklearn.model_selection import KFold, cross_val_score\n", 176 | "# from sklearn.preprocessing import PolynomialFeatures\n", 177 | "# from sklearn.linear_model import LinearRegression\n", 178 | "# from sklearn.pipeline import Pipeline" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# let us re-train the model in sklearn\n", 188 | "x = pd.DataFrame(Auto.horsepower)\n", 189 | "y = Auto.mpg\n", 190 | "\n", 191 | "model = LinearRegression()\n", 192 | "model.fit(x, y)\n", 193 | "print(model.intercept_)\n", 194 | "print(model.coef_)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "# LOO uses a number of folds equal to the # of observations. We could also choose another number of folds.\n", 204 | "k_fold = KFold(n_splits=x.shape[0]) \n", 205 | "test = cross_val_score(model, x, y, cv=k_fold, scoring = 'neg_mean_squared_error', n_jobs=-1)\n", 206 | "print(np.mean(-test))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# for higher order polynomial fits, we use the pipeline tool. \n", 216 | "# below shows how to fit polynomials of odd orders from 1 to 19 and show the LOO results\n", 217 | "# this step may take a few mins\n", 218 | "A = OrderedDict()\n", 219 | "n_split = x.shape[0]\n", 220 | "for porder in range(1, 21, 2):\n", 221 | " model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])\n", 222 | " k_fold = KFold(n_splits=n_split) # loo use folds equal to # of observations\n", 223 | " test = cross_val_score(model, x, y, cv=k_fold, scoring = 'neg_mean_squared_error', n_jobs=-1)\n", 224 | " A[str(porder)] = np.mean(-test)\n", 225 | " \n", 226 | "print(A)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## 5.3.3 k-Fold Cross-Validation" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# K-fold validation is exactly the same as LOO with a different n_splits parameter setup. \n", 243 | "# the computation time is much shorter than that of LOOCV (see also the LeaveOneOut aside below).\n", 244 | "np.random.seed(2)\n", 245 | "A = OrderedDict()\n", 246 | "n_split = 10\n", 247 | "for porder in range(1, 21, 2):\n", 248 | " model = Pipeline([('poly', PolynomialFeatures(degree=porder)), ('linear', LinearRegression())])\n", 249 | " k_fold = KFold(n_splits=n_split) \n", 250 | " test = cross_val_score(model, x, y, cv = k_fold, scoring = 'neg_mean_squared_error', n_jobs = -1)\n", 251 | " A[str(porder)] = np.mean(-test)\n", 252 | " \n", 253 | "print(A)" 254 | ] 255 | },
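# [added aside] sklearn also ships a dedicated LeaveOneOut splitter, equivalent to the
# KFold(n_splits=x.shape[0]) used above; a sketch reusing x, y and the linear model from this section.
from sklearn.model_selection import LeaveOneOut
loo_scores = cross_val_score(LinearRegression(), x, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error', n_jobs=-1)
print(np.mean(-loo_scores))   # should match the KFold-based LOOCV number above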
256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## 5.3.4 The Bootstrap\n", 261 | "Bootstrap means sampling with replacement. To eliminate the effect of sample size, the normal practice is to sample the same size as the original dataset, with replacement.\n", 262 | "\n", 263 | "Bootstrap can be used in a lot of other places, such as estimating the accuracy of linear regression coefficients (sketched in the aside below), conducting non-parametric tests (permutation tests), or estimating some complicated probability " 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "Portfolio = pd.read_csv('data/Portfolio.csv', header=0)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# to illustrate the use of the bootstrap on this data, we must first create a function, alpha_fn(), \n", 284 | "# which takes as input the (X, Y) data as well as a vector indicating which observations should be used to estimate alpha.\n", 285 | "def alpha_fn(data, index):\n", 286 | " X = data.X.iloc[index]\n", 287 | " Y = data.Y.iloc[index]\n", 288 | " return (np.var(Y) - np.cov(X,Y)[0,1])/(np.var(X) + np.var(Y) - 2 * np.cov(X, Y)[0,1])" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "alpha_fn(Portfolio, range(0,100))" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "# generate one set of random indices with 100 elements. The array has been sorted to show there are repeated elements.\n", 307 | "np.sort(np.random.choice(range(0, 100), size=100, replace=True))" 308 | ] 309 | },
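# [added aside] as noted in the markdown above, the bootstrap can also estimate the variability of
# regression coefficients; a minimal sketch on the Auto data (boot_fn is a hypothetical helper
# mirroring the book's boot.fn; Auto, np and smf are already loaded in this notebook).
def boot_fn(data, index):
    # refit mpg~horsepower on the bootstrap sample given by 'index'
    return smf.ols('mpg~horsepower', data = data.iloc[index]).fit().params.values

n_obs = Auto.shape[0]
boot_coefs = np.array([boot_fn(Auto, np.random.randint(0, n_obs, n_obs)) for _ in range(1000)])
print(boot_coefs.mean(axis=0))   # bootstrap means of (intercept, slope)
print(boot_coefs.std(axis=0))    # bootstrap standard errors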
\n", 317 | "alpha_fn(Portfolio, np.random.choice(range(0, 100), size=100, replace=True))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "# since I am not aware of boot like function in python, I just defined an ad-hoc function called boot_python()\n", 329 | "def boot_python(data, input_fun, iteration):\n", 330 | " n = Portfolio.shape[0]\n", 331 | " idx = np.random.randint(0, n, (iteration, n))\n", 332 | " stat = np.zeros(iteration)\n", 333 | " for i in range(len(idx)):\n", 334 | " stat[i] = input_fun(data, idx[i])\n", 335 | " \n", 336 | " return {'Mean': np.mean(stat), 'STD': np.std(stat)}\n", 337 | " " 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "boot_python(Portfolio, alpha_fn, 1000)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "# End of Chapter 5" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "anaconda-cloud": {}, 361 | "interpreter": { 362 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 363 | }, 364 | "kernelspec": { 365 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.6.2" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 1 383 | } 384 | -------------------------------------------------------------------------------- /Chapter_7_sec_7.8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 7.8 Lab: Non-linear Modeling " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import scipy\n", 19 | "import pandas as pd \n", 20 | "from sklearn.linear_model import LinearRegression\n", 21 | "from sklearn.metrics import mean_squared_error, r2_score\n", 22 | "from sklearn.preprocessing import PolynomialFeatures\n", 23 | "import statsmodels.api as sm\n", 24 | "from patsy import dmatrix\n", 25 | "\n", 26 | "%matplotlib inline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# in this lab, we will use Wage data. Let us read in the CSV data ans look at a sample of this data.\n", 36 | "Wage = pd.read_csv('data/Wage.csv', header=0, na_values='NA')\n", 37 | "print(Wage.shape)\n", 38 | "print(Wage.head())" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## 7.8.1 Polynomial Regression and Step Functions" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "\"\"\"\n", 55 | "We will examine how to fit a polynomial regression model on the wage dataset. As all the techniques, \n", 56 | "we have multiple ways to do this. 
Here I will use sklearn, as we already used statsmodels.api before in Chapter 3. \n", 57 | "If you are looking for more built-in functions around p-values, significance, confidence intervals, etc., \n", 58 | "I would recommend using statsmodels.api. \n", 59 | "\n", 60 | "scikit-learn does not have built-in error estimates for doing inference, but this problem forces us to \n", 61 | "think about a more general method to find confidence intervals (key word: bootstrap) \n", 62 | "\n", 63 | "Numpy also has a nice function to do polynomial regression: https://www.ritchieng.com/machine-learning-polynomial-regression/\n", 64 | "\"\"\"\n", 65 | "\n", 66 | "n_deg = 4\n", 67 | "X = Wage.age\n", 68 | "y = Wage.wage\n", 69 | "X = X.values.reshape(X.shape[0], 1)\n", 70 | "y = y.values.reshape(y.shape[0], 1)\n", 71 | "\n", 72 | "polynomial_features= PolynomialFeatures(degree=n_deg)\n", 73 | "X_poly = polynomial_features.fit_transform(X)\n", 74 | "\n", 75 | "reg = LinearRegression()\n", 76 | "reg.fit(X_poly, y)\n", 77 | "\n", 78 | "# get coefficients and compare with the numbers \n", 79 | "print(reg.intercept_)\n", 80 | "print(reg.coef_)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# we now create a grid of values for age at which we want predictions, and then call the generic predict() function \n", 90 | "# generate a sequence of age values spanning the range\n", 91 | "age_grid = np.arange(Wage.age.min(), Wage.age.max()).reshape(-1,1)\n", 92 | "\n", 93 | "# generate test data using PolynomialFeatures and fit_transform\n", 94 | "X_test = PolynomialFeatures(degree=n_deg).fit_transform(age_grid)\n", 95 | "\n", 96 | "# predict the value of the generated ages\n", 97 | "y_pred = reg.predict(X_test)\n", 98 | "\n", 99 | "# creating plots\n", 100 | "plt.plot(age_grid, y_pred, color='red')\n", 101 | "plt.show()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "\"\"\"\n", 111 | "Next we need to decide the order of the polynomial.\n", 112 | "In the book, the authors did this by using hypothesis testing. ANOVA using the F-test was explained. \n", 113 | "In order to use the ANOVA function, two models $M_1$ and $M_2$ must be nested models: \n", 114 | "the predictors in $M_1$ must be a subset of the predictors in $M_2$. \n", 115 | "statsmodels.api has a nice built-in function to do that. \n", 116 | "\n", 117 | "As an alternative, we could choose the polynomial degree using cross-validation, as discussed before. \n", 118 | "Actually, the cross-validation approach is more commonly used in practice; a sketch follows after this docstring. \n", 119 | "\"\"\"\n",
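# [added aside] the cross-validation route mentioned just above, sketched with sklearn
# (10-fold CV over polynomial degrees; X and y are the reshaped age/wage arrays defined earlier).
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
for degree in range(1, 6):
    pipe = Pipeline([('poly', PolynomialFeatures(degree=degree)), ('ols', LinearRegression())])
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    mse = -cross_val_score(pipe, X, y.ravel(), cv=cv, scoring='neg_mean_squared_error').mean()
    print(degree, mse)   # pick the degree where the CV error stops improving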
\n", 119 | "\"\"\"\n", 120 | "\n", 121 | "X1 = PolynomialFeatures(1).fit_transform(X)\n", 122 | "X2 = PolynomialFeatures(2).fit_transform(X)\n", 123 | "X3 = PolynomialFeatures(3).fit_transform(X)\n", 124 | "X4 = PolynomialFeatures(4).fit_transform(X)\n", 125 | "X5 = PolynomialFeatures(5).fit_transform(X)\n", 126 | "fit1 = sm.GLS(y, X1).fit()\n", 127 | "fit2 = sm.GLS(y, X2).fit()\n", 128 | "fit3 = sm.GLS(y, X3).fit()\n", 129 | "fit4 = sm.GLS(y, X4).fit()\n", 130 | "fit5 = sm.GLS(y, X5).fit()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "print(sm.stats.anova_lm(fit1, fit2, fit3, fit4, fit5, type=1))\n", 140 | "\n", 141 | "\"\"\"\n", 142 | "The above table, we fit five different models and sequentially compare the simpler model to the more complex model.\n", 143 | "The summary above shows the quadratic model fit2 is significantly better than fit1 at p value of $2.36*10^{-32}$.\n", 144 | "Similarly, the cubic model is significnatly better than the quadratic model ($p = 1.68 * 10^{-3}$).\n", 145 | "The p-value comparing the cubic and degree-4 polynomials, fit3 and fit4, is approximately 0.05 \n", 146 | "while the degree-5 polynomial fit5 seems unnecessary because its p-value is 0.37. \n", 147 | "Hence, either a cubic or a quartic polynomial appear to provide a reasonable fit to the data, \n", 148 | "but lower- or higher-order models are not justified.\n", 149 | "\"\"\"" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# in the book, the authors also discussed logistic regression and the polynomial terms. \n", 159 | "# in python, sm.GLM function provided some functions similar to glm() in R.\n", 160 | "logistic_model = sm.GLM ((y>250), X4, family=sm.families.Binomial())\n", 161 | "logistic_fit = logistic_model.fit()\n", 162 | "print(logistic_fit.summary())" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "# in python, we could use the pd.cut() function to fit a step function.\n", 172 | "age_cut, bins = pd.cut(Wage.age, bins=4, retbins=True, right=True)\n", 173 | "age_cut.value_counts(sort=False)\n", 174 | "\n", 175 | "\"\"\" \n", 176 | "Here cut() automatically picked the cutpoints at 33.5, 49, and 64.5 years of age. \n", 177 | "We could also have specified our own cutpoints directly using the breaks option (set bins into a sequence of scalars, e.g. [0, 10, 20, 40, 100]). \n", 178 | "Note in the following code, I manually added a constant column and dropped the lowest value bin (17.938, 33.5] dummy variable.\n", 179 | "\"\"\"" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "age_cut_dummies = pd.get_dummies(age_cut)\n", 189 | "age_cut_dummies = sm.add_constant(age_cut_dummies)\n", 190 | "fit_age_cut = sm.GLM(Wage.wage, age_cut_dummies.drop(age_cut_dummies.columns[1], axis=1)).fit()\n", 191 | "print(fit_age_cut.summary())" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## 7.8.2 Splines" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# in order to fit regression splines in python, we use the spatsy library. 
\n", 208 | "# from patsy import dmatrix\n", 209 | "\n", 210 | "\"\"\" \n", 211 | "In the content of section 7.4, we saw that regression splines can be fit by constructing an appropriate matrix of basis functions. \n", 212 | "The bs() function generates the entire matrix of bs() basis functions for splines with the specified set of knots. \n", 213 | "By default, cubic splines are produced. Here we have prespecified knots at ages 25, 40, and 60. \n", 214 | "This produces a spline with six basis functions. \n", 215 | "\"\"\"\n", 216 | "age_grid = np.arange(Wage.age.min(), Wage.age.max()).reshape(-1,1)\n", 217 | "spline_basis1 = dmatrix(\"bs(Wage.age, knots=(25,40,60), degree=3, include_intercept=False)\", {\"Wage.age\": Wage.age}, return_type='dataframe')" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# now we can fit the model using the spline basis functions\n", 227 | "spline_fit1 = sm.GLM(Wage.wage, spline_basis1).fit()\n", 228 | "spline_fit1.summary()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# another approach is to fix the degree of freedom and let the code to automatically choose the knots.\n", 238 | "spline_basis2 = dmatrix(\"bs(Wage.age, df=6, include_intercept=False)\",\n", 239 | " {\"Wage.age\": Wage.age}, return_type='dataframe')\n", 240 | "spline_fit2 = sm.GLM(Wage.wage, spline_basis2).fit()\n", 241 | "spline_fit2.summary()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "# package patsy also has a nice function to do natural spline using cr()\n", 251 | "spline_basis3 = dmatrix(\"cr(Wage.age, df=4)\", {\"Wage.age\": Wage.age}, return_type='dataframe')\n", 252 | "spline_fit3 = sm.GLM(Wage.wage, spline_basis3).fit()\n", 253 | "spline_fit3.summary()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "# finally, let us make some predictions\n", 265 | "pred1 = spline_fit1.predict(dmatrix(\"bs(age_grid, knots=(25,40,60), include_intercept=False)\",{\"age_grid\": age_grid}, return_type='dataframe'))\n", 266 | "pred2 = spline_fit2.predict(dmatrix(\"bs(age_grid, df=6, include_intercept=False)\",{\"age_grid\": age_grid}, return_type='dataframe'))\n", 267 | "pred3 = spline_fit3.predict(dmatrix(\"cr(age_grid, df=4)\", {\"age_grid\": age_grid}, return_type='dataframe'))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "# plot the splines and error bands\n", 277 | "plt.scatter(Wage.age, Wage.wage, facecolor='None', edgecolor='k', alpha=0.1)\n", 278 | "plt.plot(age_grid, pred1, color='r', label='Cubic spine with knots at [25, 40, 60]')\n", 279 | "plt.plot(age_grid, pred2, color='g', label='Cubic spine with df=6')\n", 280 | "plt.plot(age_grid, pred3, color='b', label='Natural spline df=4')\n", 281 | "plt.legend()\n", 282 | "plt.xlim(15,85)\n", 283 | "plt.ylim(0,350)\n", 284 | "plt.xlabel('age')\n", 285 | "plt.ylabel('wage')\n", 286 | "plt.show()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## 7.8.3 GAMs" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | 
"metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# we now fit a GAM to predict wage using natural spline functions of year and age, treating education as a qualitative (i.e. categorical) predictor.\n", 303 | "age_basis = dmatrix(\"cr(Wage.age, df=5)\", {\"Wage.age\": Wage.age}, return_type='dataframe')\n", 304 | "year_basis = dmatrix(\"cr(Wage.year, df=4)\", {\"Wage.year\": Wage.year}, return_type='dataframe').drop (['Intercept'], axis = 1)\n", 305 | "education_dummies = pd.get_dummies(Wage.education)\n", 306 | "education_dummies = education_dummies.drop([education_dummies.columns[0]], axis = 1)\n", 307 | "\n", 308 | "# we concatenate all the predictors\n", 309 | "x_all = pd.concat([age_basis, year_basis, education_dummies], axis=1)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# fit a model and print the summary\n", 321 | "gam1_fit = sm.OLS(Wage.wage, x_all).fit()\n", 322 | "print(gam1_fit.summary())\n", 323 | "\n", 324 | "\"\"\" \n", 325 | "We could apply similar analysis procedure to this analysis, \n", 326 | "such as ANOVA, construction of a classification model and visually inspecting the model performance.\n", 327 | "\"\"\"" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# End of Chapter 7" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "interpreter": { 342 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 343 | }, 344 | "kernelspec": { 345 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 346 | "name": "python3" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.6.2" 359 | } 360 | }, 361 | "nbformat": 4, 362 | "nbformat_minor": 1 363 | } 364 | -------------------------------------------------------------------------------- /Chapter_8_sec_8.3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 8.3 Lab: Decision Trees" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "from sklearn.model_selection import train_test_split\n", 19 | "from sklearn.tree import DecisionTreeClassifier, export_graphviz, DecisionTreeRegressor, plot_tree\n", 20 | "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n", 21 | "from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error\n", 22 | "from sklearn import tree\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "\n", 25 | "%matplotlib inline" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## 8.3.1 Fitting Classification Trees" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "\"\"\" \n", 42 | "The sklearn library has a lot of useful tools for tress. 
We first use classification trees to analyze the Carseats data set.\n", 43 | "In these data, Sales is a continuous variable, and so we begin by recoding it as a binary variable (by thresholding). \n", 44 | "We use the map() function to create a variable, called High, which takes on a value of 'Y' if the Sales variable exceeds 8, \n", 45 | "and takes on a value of 'N' otherwise. In Python, we need to encode categorical variables into dummy/numeric variables.\n", 46 | "\"\"\"\n", 47 | "\n", 48 | "carseats = pd.read_csv('./data/Carseats.csv')\n", 49 | "carseats['High'] = carseats.Sales.map(lambda x: 'Y' if x>8 else 'N')\n", 50 | "carseats.ShelveLoc = pd.factorize(carseats.ShelveLoc)[0]\n", 51 | "carseats.Urban = carseats.Urban.map({'No':0, 'Yes':1})\n", 52 | "carseats.US = carseats.US.map({'No':0, 'Yes':1})\n", 53 | "print(carseats.describe())\n", 54 | "print(carseats.info())" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "# we first split the dataset into training (200 samples) and test sets.\n", 66 | "X = carseats.drop(['Sales', 'High'], axis=1)\n", 67 | "y = carseats.High\n", 68 | "train_size = 200\n", 69 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, \n", 70 | " test_size=X.shape[0]-train_size, random_state=0)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# to build a tree, we could use 'gini' or 'entropy' as the split criterion at each node. \n", 80 | "# here we use an example with 'gini' and a few other hyperparameters.\n", 81 | "criteria = \"gini\" \n", 82 | "max_depth = 6 \n", 83 | "min_sample_leaf = 4\n", 84 | "clf_gini = DecisionTreeClassifier(criterion=criteria, random_state=100,\n", 85 | " max_depth=max_depth, min_samples_leaf=min_sample_leaf)\n", 86 | "clf_gini.fit(X_train, y_train)\n", 87 | "print(clf_gini.score(X_train, y_train))\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# one attractive feature of a tree is visualization. \n", 97 | "plt.figure(figsize=(40,20)) # customize according to the size of your tree\n", 98 | "plot_tree(clf_gini, feature_names = X_train.columns)\n", 99 | "plt.show()\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# let us build the confusion matrix to evaluate the model in accuracy for both training and test datasets.\n", 109 | "# we could also compute more metrics such as precision, recall, f1-score, etc.\n", 110 | "y_pred_train = clf_gini.predict(X_train)\n", 111 | "cm = pd.DataFrame(confusion_matrix(y_train, y_pred_train).T, index=['No', 'Yes'], columns=['No', 'Yes'])\n", 112 | "print(cm)\n", 113 | "print(\"Train Accuracy is \", accuracy_score(y_train,y_pred_train)*100)\n", 114 | "\n", 115 | "\n", 116 | "y_pred = clf_gini.predict(X_test)\n", 117 | "cm = pd.DataFrame(confusion_matrix(y_test, y_pred).T, index=['No', 'Yes'], columns=['No', 'Yes'])\n", 118 | "print(cm)\n", 119 | "print(\"Test Accuracy is \", accuracy_score(y_test,y_pred)*100)\n", 120 | "\n", 121 | "\"\"\"\n", 122 | "The test accuracy of our model is significantly lower than our training result; this may indicate overfitting. 
 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## 8.3.2 Fitting Regression Trees" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "\"\"\" \n", 141 | "Another attractive feature of a tree is the ability to solve both classification and regression problems.\n", 142 | "Here we fit a regression tree to the Boston data set. First, we create a training set, and fit the tree to the training data. \n", 143 | "Since scikit-learn does not support backward pruning, let us cap max_depth at 2.\n", 144 | "\"\"\"\n", 145 | "\n", 146 | "# as we move forward, it is good to keep the hyperparameters together for future iterations.\n", 147 | "boston = pd.read_csv('./data/Boston.csv')\n", 148 | "X = boston.drop('medv', axis=1)\n", 149 | "y = boston.medv\n", 150 | "train_size = 0.5 # we used a specific train size before; we can also use a percentage. \n", 151 | "random_state = 0 \n", 152 | "max_depth = 2\n", 153 | "\n", 154 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=random_state)\n", 155 | "regr_tree = DecisionTreeRegressor(max_depth=max_depth)\n", 156 | "regr_tree.fit(X_train, y_train)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "plt.figure(figsize=(40,20)) # customize according to the size of your tree\n", 166 | "plot_tree(regr_tree, feature_names = X_train.columns)\n", 167 | "plt.show()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "y_pred = regr_tree.predict(X_test)\n", 177 | "print(mean_squared_error(y_test, y_pred))\n", 178 | "\"\"\" \n", 179 | "We could look more into the train and test MSE to see whether the current model is overfitting or underfitting, as in the next cell.\n", 180 | "\"\"\"" 181 | ] 182 | },
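{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# as suggested above, a quick check of my own (not from the book's lab): compare the train and test MSE.\n", "# a train MSE that is much lower than the test MSE would point to overfitting.\n", "print('train MSE:', mean_squared_error(y_train, regr_tree.predict(X_train)))\n", "print('test MSE:', mean_squared_error(y_test, regr_tree.predict(X_test)))" ] },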
 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "## 8.3.3 Bagging and Random Forests" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "\"\"\"\n", 197 | "Here we apply bagging and random forests (RF) to the Boston data.\n", 198 | "RF is an ensemble method, which means it combines the results of multiple decision trees. \n", 199 | "As a result, RF could help to reduce the variance of the model. \n", 200 | "Similar to decision trees, RF can be used to solve both classification and regression problems.\n", 201 | "\n", 202 | "\n", 203 | "In this exercise, we will use scikit-learn's RandomForestRegressor (the Python counterpart of R's randomForest package). \n", 204 | "The exact results obtained in this section may depend on the version of Python and the version of scikit-learn installed on your computer. \n", 205 | "\"\"\"" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "# we reused the previous train and test sets.\n", 215 | "all_features = X_train.shape[1]\n", 216 | "regr_bagging = RandomForestRegressor(max_features=all_features, random_state=1)\n", 217 | "regr_bagging.fit(X_train, y_train)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "y_pred = regr_bagging.predict(X_test)\n", 227 | "print(mean_squared_error(y_test, y_pred))\n", 228 | "\n", 229 | "\"\"\"\n", 230 | "We can compare the test MSE of the bagging model with the test MSE of the single regression tree above.\n", 231 | "Normally, the bagging model is better than the single tree model; the next cell shows a related out-of-bag check.\n", 232 | "\"\"\"" 233 | ] 234 | },
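{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "A small addition of mine (not in the book's lab): because each bagged tree is fit on a bootstrap \n", "sample, the held-out (out-of-bag, OOB) observations give a free estimate of test performance \n", "without touching the test set. For a regressor, oob_score_ is an R^2 value.\n", "\"\"\"\n", "regr_oob = RandomForestRegressor(max_features=all_features, n_estimators=500, oob_score=True, random_state=1)\n", "regr_oob.fit(X_train, y_train)\n", "print('OOB R^2:', regr_oob.oob_score_)" ] },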
 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "\"\"\"\n", 242 | "We can grow a random forest in exactly the same way, except that we'll use a smaller value of the max_features argument. \n", 243 | "Theoretically, randomly selecting a subset of features reduces the correlation between the trees and thus can reduce the variance of the model.\n", 244 | "\"\"\"\n", 245 | "# here we'll use max_features = 3 (close to square root of all features as a rule of thumb)\n", 246 | "regr_rf = RandomForestRegressor(max_features=3, random_state=1)\n", 247 | "regr_rf.fit(X_train, y_train)\n", 248 | "\n", 249 | "y_pred = regr_rf.predict(X_test)\n", 250 | "print(mean_squared_error(y_test, y_pred))\n", 251 | "\n", 252 | "\"\"\" \n", 253 | "The test set MSE is even lower; this indicates that random forests yielded an improvement over bagging in this case.\n", 254 | "\"\"\"" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# RF provides multiple ways to interpret the model. One way is to see the importance of each feature.\n", 264 | "Importance = pd.DataFrame({'Importance':regr_rf.feature_importances_*100}, index=X_train.columns)\n", 265 | "Importance.sort_values(by='Importance', axis=0, ascending=True).plot(kind='barh', color='r', )\n", 266 | "plt.xlabel('Variable Importance')\n", 267 | "plt.gca().legend_ = None" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## 8.3.4 Boosting" 275 | ] 276 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "\"\"\"\n", 291 | "Boosting is another ensemble method. Gradient Boosting is a popular method, \n", 292 | "and other well-known methods such as AdaBoost, XGBoost, and LightGBM are built on the same idea.\n", 293 | "\n", 294 | "Here we use the GradientBoostingRegressor package. The argument n_estimators=500 indicates that we want 500 trees, \n", 295 | "and the option max_depth=4 limits the depth of each tree. See the manual for more details.\n", 296 | "\"\"\"\n", 297 | "\n", 298 | "regr_boost = GradientBoostingRegressor(n_estimators=500, learning_rate=0.02, max_depth=4, random_state=1)\n", 299 | "regr_boost.fit(X_train, y_train)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "# let us check the feature importance and MSE.\n", 309 | "feature_importance = regr_boost.feature_importances_*100\n", 310 | "rel_imp = pd.Series(feature_importance, index=X_train.columns).sort_values(inplace=False)\n", 311 | "rel_imp.T.plot(kind='barh', color='r', )\n", 312 | "plt.xlabel('Variable Importance')\n", 313 | "plt.gca().legend_ = None" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "y_pred = regr_boost.predict(X_test)\n", 323 | "print(mean_squared_error(y_test,y_pred))" 324 | ] 325 | },
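{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Another check of my own (not in the book's lab): staged_predict() yields the prediction after \n", "each boosting iteration, so we can watch the test MSE as trees are added and spot overfitting.\n", "\"\"\"\n", "test_mse = [mean_squared_error(y_test, y_hat) for y_hat in regr_boost.staged_predict(X_test)]\n", "plt.plot(np.arange(1, len(test_mse) + 1), test_mse)\n", "plt.xlabel('Number of trees')\n", "plt.ylabel('Test MSE')\n", "plt.show()" ] },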
 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "## 8.3.5 Bayesian Additive Regression Trees\n", 331 | "As of now (2021), I was not able to find a good package for BART in Python. Please reach out if you have a package that works.\n", 332 | "**[To do: find a package for BART in Python.]**" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "\"\"\"\n", 342 | "Here we discuss Bayesian additive regression trees (BART), another ensemble method that uses decision trees as its building blocks.\n", 343 | "BART is related to both random forests and boosting: each tree is constructed in a random manner as in bagging and random forests, \n", 344 | "and each tree tries to capture signal not yet accounted for by the current model, as in boosting. \n", 345 | "The main novelty in BART is the way in which new trees are generated.\n", 346 | "\"\"\"" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "# End of Chapter 8" 356 | ] 357 | } 358 | ], 359 | "metadata": { 360 | "interpreter": { 361 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 362 | }, 363 | "kernelspec": { 364 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 365 | "name": "python3" 366 | }, 367 | "language_info": { 368 | "codemirror_mode": { 369 | "name": "ipython", 370 | "version": 3 371 | }, 372 | "file_extension": ".py", 373 | "mimetype": "text/x-python", 374 | "name": "python", 375 | "nbconvert_exporter": "python", 376 | "pygments_lexer": "ipython3", 377 | "version": "3.6.2" 378 | } 379 | }, 380 | "nbformat": 4, 381 | "nbformat_minor": 2 382 | } 383 | -------------------------------------------------------------------------------- /Chapter_9_sec_9.6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 9.6 Lab: Support Vector Machines" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "\n", 21 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 22 | "from sklearn.svm import SVC, LinearSVC\n", 23 | "from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, classification_report\n", 24 | "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", 25 | "\n", 26 | "import json\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# support function to plot the decision boundary of svc and highlight the support vectors\n", 37 | "def plot_decision_boundary(svc, X, y, h=0.021, pad=0.21):\n", 38 | "    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad\n", 39 | "    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad\n", 40 | "    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n", 41 | "    Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])\n", 42 | "    Z = Z.reshape(xx.shape)\n", 43 | "    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.2)\n", 44 | "    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)\n", 45 | "\n", 46 | "    # highlight the support vectors\n", 47 | "    sv = svc.support_vectors_\n", 48 | "    plt.scatter(sv[:,0], sv[:,1], c='k', marker='*', s=21, linewidths=1)\n", 49 | "    plt.xlim(x_min, x_max)\n", 50 | "    plt.ylim(y_min, y_max)\n", 51 | "    plt.xlabel('X1')\n", 52 | "    plt.ylabel('X2')\n", 53 | "    plt.show()\n", 54 | "    print('Number of support vectors: ', svc.support_.size)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## 9.6.1 Support Vector Classifier" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# we start by generating a random dataset: following the book, we generate a dataset with 20 observations,\n", 71 | "# 2 features. 
And we divide these into two classes.\n", 72 | "# set seed \n", 73 | "np.random.seed(21)\n", 74 | "X = np.random.randn(20, 2)\n", 75 | "y = np.repeat([-1,1], 10)\n", 76 | "X[y==1] = X[y==1] + 1\n", 77 | "\n", 78 | "plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)\n", 79 | "plt.xlabel('X1')\n", 80 | "plt.ylabel('X2')\n", 81 | "plt.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Support Vector Classifier (i.e. support vector machine with linear kernel)\n", 91 | "svc1 = SVC(C=10, kernel='linear')\n", 92 | "svc1.fit(X, y)\n", 93 | "\n", 94 | "plot_decision_boundary(svc1, X, y)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# as mentioned before, we could use dir() to see the methods of the class\n", 104 | "# I did not find a good way to print out the summary of the SVC model; the next cell shows a partial workaround for the linear kernel.\n", 105 | "dir(svc1)" 106 | ] 107 | },
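{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a partial workaround of my own: with kernel='linear' the fitted hyperplane can be inspected \n", "# directly (the coef_ and intercept_ attributes only exist for the linear kernel).\n", "print('coefficients:', svc1.coef_)\n", "print('intercept:', svc1.intercept_)" ] },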
 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# we could take a look at the default parameters of the SVC model\n", 115 | "svc1.get_params()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# we now use a small cost (C = 0.1). When a smaller value of the cost parameter is used, \n", 125 | "# we obtain a larger number of support vectors, because the margin is now wider. \n", 126 | "svc2 = SVC(C=0.1, kernel='linear')\n", 127 | "svc2.fit(X, y)\n", 128 | "\n", 129 | "plot_decision_boundary(svc2, X, y)\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# we could also try to tune the cost parameter (C) of the SVC model using GridSearchCV\n", 139 | "# in this function, we need to specify cross validation folds and the metric to use for evaluation\n", 140 | "tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 5, 10, 100]}]\n", 141 | "clf = GridSearchCV(SVC(kernel='linear'), tuned_parameters, cv=10, scoring='accuracy', return_train_score=True)\n", 142 | "clf.fit(X, y)\n", 143 | "clf.cv_results_" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# let us see the best parameters. \n", 153 | "# This is different from the results in the book; it is very likely due to the random generation of the data\n", 154 | "clf.best_params_" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# we use the same generation process to generate test data\n", 164 | "X_test = np.random.randn(20, 2)\n", 165 | "y_test = np.repeat([-1,1], 10)\n", 166 | "X_test[y_test==1] = X_test[y_test==1] + 1\n", 167 | "\n", 168 | "plt.scatter(X_test[:,0], X_test[:,1], s=70, c=y_test, cmap=plt.cm.Paired)\n", 169 | "plt.xlabel('X1')\n", 170 | "plt.ylabel('X2')\n", 171 | "plt.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# train a model with the optimal parameters found above\n", 181 | "svc3 = SVC(C=1, kernel='linear')\n", 182 | "svc3.fit(X, y)\n", 183 | "\n", 184 | "y_pred = svc3.predict(X_test)\n", 185 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svc3.classes_, columns=svc3.classes_)" 186 | ] 187 | },
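{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# in addition to the confusion matrix, we can report the test accuracy \n", "# (accuracy_score was imported above; this cell is my own addition).\n", "print('test accuracy', accuracy_score(y_test, y_pred))" ] },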
 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "# now we make our data linearly separable. In the book, they add another 0.5 to separate the data. \n", 195 | "# here we start from the data generation process to avoid confusion.\n", 196 | "np.random.seed(21)\n", 197 | "X = np.random.randn(20, 2)\n", 198 | "y = np.repeat([-1,1], 10)\n", 199 | "X[y==1] = X[y==1] + 2.5\n", 200 | "\n", 201 | "plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)\n", 202 | "plt.xlabel('X1')\n", 203 | "plt.ylabel('X2')\n", 204 | "plt.show()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "X_test = np.random.randn(20, 2)\n", 214 | "y_test = np.repeat([-1,1], 10)\n", 215 | "X_test[y_test==1] = X_test[y_test==1] + 2.5\n", 216 | "\n", 217 | "plt.scatter(X_test[:,0], X_test[:,1], s=70, c=y_test, cmap=plt.cm.Paired)\n", 218 | "plt.xlabel('X1')\n", 219 | "plt.ylabel('X2')\n", 220 | "plt.show()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# here the data seems linearly separable. We could use a bigger cost parameter (C = 100) to train the model.\n", 230 | "svc4 = SVC(C=100, kernel='linear')\n", 231 | "svc4.fit(X, y)\n", 232 | "\n", 233 | "plot_decision_boundary(svc4, X, y)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "y_pred = svc4.predict(X_test)\n", 243 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svc4.classes_, columns=svc4.classes_)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## 9.6.2 Support Vector Machine" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "# generating random dataset\n", 260 | "np.random.seed(21)\n", 261 | "X = np.random.randn(200,2)\n", 262 | "X[:100] = X[:100] + 2\n", 263 | "X[100:150] = X[100:150] - 2\n", 264 | "y = np.concatenate([np.repeat(-1, 150), np.repeat(1,50)])\n", 265 | "\n", 266 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)\n", 267 | "\n", 268 | "plt.scatter(X[:,0], X[:,1], s=70, c=y, cmap=plt.cm.Paired)\n", 269 | "plt.xlabel('X1')\n", 270 | "plt.ylabel('X2')\n", 271 | "plt.show()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "# in python, we can use the same SVC model and the kernel argument to specify the kernel\n", 281 | "# for the rbf kernel, we need to specify the gamma parameter\n", 282 | "svm = SVC(C=1.0, kernel='rbf', gamma=1)\n", 283 | "svm.fit(X_train, y_train)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "plot_decision_boundary(svm, X_train, y_train)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "y_pred = svm.predict(X_test)\n", 302 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svm.classes_, columns=svm.classes_)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# increasing the C parameter gives the model more flexibility\n", 312 | "svm2 = SVC(C=100, kernel='rbf', gamma=1.0)\n", 313 | "svm2.fit(X_train, y_train)\n", 314 | "plot_decision_boundary(svm2, X_train, y_train)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "\"\"\"\n", 324 | "The decision boundary above seems to overfit. We can compute the test accuracy of the model to\n", 325 | "see whether that is the case. 
\n", 326 | "\n", 327 | "The model (C = 1) yields a test accuracy of 0.85; the model (C = 100) yields a test accuracy of 0.77.\n", 328 | "\"\"\"\n", 329 | "y_pred = svm2.predict(X_test)\n", 330 | "pd.DataFrame(confusion_matrix(y_test, y_pred),index=svm2.classes_, columns=svm2.classes_)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# set the parameters by cross-validation\n", 340 | "tuned_parameters = [{'C': [0.01, 0.1, 1, 10, 100],\n", 341 | "                     'gamma': [0.5, 1, 2, 3, 4]}]\n", 342 | "clf = GridSearchCV(SVC(kernel='rbf'), tuned_parameters, cv=10, scoring='accuracy', return_train_score=True)\n", 343 | "clf.fit(X_train, y_train)\n", 344 | "clf.cv_results_" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "# let us see the best parameters.\n", 354 | "clf.best_params_" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "# confusion matrix for the best model\n", 364 | "confusion_matrix(y_test, clf.best_estimator_.predict(X_test))" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "# calculate the test accuracy\n", 374 | "clf.best_estimator_.score(X_test, y_test)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## 9.6.3 ROC Curves" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "svm3 = SVC(C=1, kernel='rbf', gamma=2)\n", 391 | "svm3.fit(X_train, y_train)\n", 392 | "\n", 393 | "# we train another, more flexible model\n", 394 | "svm4 = SVC(C=1, kernel='rbf', gamma=50)\n", 395 | "svm4.fit(X_train, y_train)\n", 396 | "\n", 397 | "y_train_score3 = svm3.decision_function(X_train)\n", 398 | "y_train_score4 = svm4.decision_function(X_train)\n", 399 | "\n", 400 | "false_pos_rate3, true_pos_rate3, _ = roc_curve(y_train, y_train_score3)\n", 401 | "roc_auc3 = auc(false_pos_rate3, true_pos_rate3)\n", 402 | "\n", 403 | "false_pos_rate4, true_pos_rate4, _ = roc_curve(y_train, y_train_score4)\n", 404 | "roc_auc4 = auc(false_pos_rate4, true_pos_rate4)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(14,6))\n", 414 | "ax1.plot(false_pos_rate3, true_pos_rate3, label='SVM $\gamma = 2$ ROC curve (area = %0.2f)' % roc_auc3, color='b')\n", 415 | "ax1.plot(false_pos_rate4, true_pos_rate4, label='SVM $\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')\n", 416 | "ax1.set_title('Training Data')\n", 417 | "\n", 418 | "y_test_score3 = svm3.decision_function(X_test)\n", 419 | "y_test_score4 = svm4.decision_function(X_test)\n", 420 | "\n", 421 | "false_pos_rate3, true_pos_rate3, _ = roc_curve(y_test, y_test_score3)\n", 422 | "roc_auc3 = auc(false_pos_rate3, true_pos_rate3)\n", 423 | "\n", 424 | "false_pos_rate4, true_pos_rate4, _ = roc_curve(y_test, y_test_score4)\n", 425 | "roc_auc4 = auc(false_pos_rate4, true_pos_rate4)\n", 426 | "\n", 427 | "ax2.plot(false_pos_rate3, true_pos_rate3, label='SVM $\gamma = 2$ ROC curve (area = %0.2f)' % roc_auc3, color='b')\n", 428 | "ax2.plot(false_pos_rate4, true_pos_rate4, label='SVM 
$\gamma = 50$ ROC curve (area = %0.2f)' % roc_auc4, color='r')\n", 429 | "ax2.set_title('Test Data')\n", 430 | "\n", 431 | "for ax in fig.axes:\n", 432 | "    ax.plot([0, 1], [0, 1], 'k--')\n", 433 | "    ax.set_xlim([-0.05, 1.0])\n", 434 | "    ax.set_ylim([0.0, 1.05])\n", 435 | "    ax.set_xlabel('False Positive Rate')\n", 436 | "    ax.set_ylabel('True Positive Rate')\n", 437 | "    ax.legend(loc=\"lower right\")\n", 438 | "\n", 439 | "\"\"\" \n", 440 | "From the plots below, we can see that the model with gamma = 50 is overfitting the training data \n", 441 | "(i.e. the training metric is much better than the test metric).\n", 442 | "\"\"\"" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "## 9.6.4 SVM with Multiple Classes" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "# generate the previously used random dataset\n", 459 | "np.random.seed(21)\n", 460 | "X = np.random.randn(200,2)\n", 461 | "X[:100] = X[:100] + 2\n", 462 | "X[100:150] = X[100:150] - 2\n", 463 | "y = np.concatenate([np.repeat(-1, 150), np.repeat(1,50)])\n", 464 | "\n", 465 | "# adding another class to the dataset, I used a different offset to separate the classes better\n", 466 | "XX = np.vstack([X, np.random.randn(50,2)])\n", 467 | "yy = np.hstack([y, np.repeat(0,50)])\n", 468 | "XX[yy==0, 1] = XX[yy==0, 1] + 6\n", 469 | "\n", 470 | "plt.scatter(XX[:,0], XX[:,1], s=70, c=yy, cmap=plt.cm.prism)\n", 471 | "plt.xlabel('XX1')\n", 472 | "plt.ylabel('XX2')" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "# fit the svm model \n", 482 | "svm5 = SVC(C=10, kernel='rbf', gamma=1)\n", 483 | "svm5.fit(XX, yy)\n", 484 | "plot_decision_boundary(svm5, XX, yy)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "## 9.6.5 Application to Gene Expression Data" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "# I saved the gene expression data as a json file; in python we can load the json file using the json library\n", 501 | "# after reading it in, we can treat the data the same as a dictionary and use the keys to access it\n", 502 | "# import json\n", 503 | "f = open('./data/Khan.json')\n", 504 | "Khan = json.load(f)\n", 505 | "print(Khan.keys())" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "X_train = np.array(Khan['xtrain'])\n", 515 | "y_train = np.array(Khan['ytrain'])\n", 516 | "X_test = np.array(Khan['xtest'])\n", 517 | "y_test = np.array(Khan['ytest'])" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "# take a look at the data; we will notice there are 4 classes\n", 527 | "np.unique(y_train)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "svm6 = SVC(C=10, kernel='linear')\n", 537 | "svm6.fit(X_train, y_train)" 538 | ] 539 | },
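{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# a quick sanity check of my own (not in the book's lab): the Khan data has far more features \n", "# (gene expression measurements) than observations, which foreshadows the result discussed below.\n", "print('train:', X_train.shape, 'test:', X_test.shape)" ] },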
 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "\"\"\" \n", 547 | "We see below that the model is perfect on training data. In fact, this is not surprising, \n", 548 | "because the large number of variables relative to the number of observations implies that \n", 549 | "it is easy to find hyperplanes that fully separate the classes. We are most interested not \n", 550 | "in the support vector classifier’s performance on the training observations, but rather its \n", 551 | "performance on the test observations.\n", 552 | "\"\"\"\n", 553 | "print('train accuracy', svm6.score(X_train, y_train))\n", 554 | "y_pred = svm6.predict(X_test)\n", 555 | "print('test accuracy', accuracy_score(y_test, y_pred))" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "# End of Chapter 9" 565 | ] 566 | } 567 | ], 568 | "metadata": { 569 | "interpreter": { 570 | "hash": "4548a0e672c5b3a287feee7b2962606840aa548749d1830ef724408652b0c250" 571 | }, 572 | "kernelspec": { 573 | "display_name": "Python 2.7.16 64-bit ('base': conda)", 574 | "name": "python3" 575 | }, 576 | "language_info": { 577 | "codemirror_mode": { 578 | "name": "ipython", 579 | "version": 3 580 | }, 581 | "file_extension": ".py", 582 | "mimetype": "text/x-python", 583 | "name": "python", 584 | "nbconvert_exporter": "python", 585 | "pygments_lexer": "ipython3", 586 | "version": "3.6.2" 587 | } 588 | }, 589 | "nbformat": 4, 590 | "nbformat_minor": 2 591 | } 592 | -------------------------------------------------------------------------------- /ISLR_v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/ISLR_v1.pdf -------------------------------------------------------------------------------- /ISLR_v2_2021_Nov.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/ISLR_v2_2021_Nov.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ISL_python 2 | ### An Introduction to Statistical Learning with Applications in PYTHON 3 |  4 | I love the book *An Introduction to Statistical Learning with Applications in R* by Gareth James • Daniela Witten • Trevor Hastie and Robert Tibshirani. This book has been super helpful for me.  5 |  6 | In this repository, I have implemented the same/similar functionality in Python. The code is in a script format to show the thought process. Hope this could help the book reach a broader audience. *Don't let language barriers stop you from exploring something fun and useful.*  7 |  8 | Please refer to https://www.statlearning.com/ for more details. In the 2nd edition, the authors introduced an R library, ISLR2, with all the datasets used in the book. 
9 | 10 | ### Setup for this repository: 11 | * Python==3.6.2 12 | * ipykernal==4.10.0 13 | * numpy==1.19.2 14 | * matplotlib==3.3.4 15 | * pandas==1.1.5 16 | * statsmodels==0.12.2 17 | * scikit-learn==0.21.1 18 | * patsy==0.5.1 19 | * scipy==1.5.2 20 | * seaborn==0.11.2 21 | * json==2.0.9 22 | * tensorflow==2.0.0 23 | * keras==2.3.1 24 | * lifelines==0.26.3 25 | * math 26 | * random 27 | * collections 28 | * itertools 29 | 30 | 31 | ### Special thanks to Bommy 32 | 33 | __ 34 | (___()'`; 35 | /, /` 36 | \\"--\\ 37 | 38 | Reference: https://www.asciiart.eu/animals/dogs 39 | -------------------------------------------------------------------------------- /data/Auto.csv: -------------------------------------------------------------------------------- 1 | mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name 2 | 18,8,307,130,3504,12,70,1,chevrolet chevelle malibu 3 | 15,8,350,165,3693,11.5,70,1,buick skylark 320 4 | 18,8,318,150,3436,11,70,1,plymouth satellite 5 | 16,8,304,150,3433,12,70,1,amc rebel sst 6 | 17,8,302,140,3449,10.5,70,1,ford torino 7 | 15,8,429,198,4341,10,70,1,ford galaxie 500 8 | 14,8,454,220,4354,9,70,1,chevrolet impala 9 | 14,8,440,215,4312,8.5,70,1,plymouth fury iii 10 | 14,8,455,225,4425,10,70,1,pontiac catalina 11 | 15,8,390,190,3850,8.5,70,1,amc ambassador dpl 12 | 15,8,383,170,3563,10,70,1,dodge challenger se 13 | 14,8,340,160,3609,8,70,1,plymouth 'cuda 340 14 | 15,8,400,150,3761,9.5,70,1,chevrolet monte carlo 15 | 14,8,455,225,3086,10,70,1,buick estate wagon (sw) 16 | 24,4,113,95,2372,15,70,3,toyota corona mark ii 17 | 22,6,198,95,2833,15.5,70,1,plymouth duster 18 | 18,6,199,97,2774,15.5,70,1,amc hornet 19 | 21,6,200,85,2587,16,70,1,ford maverick 20 | 27,4,97,88,2130,14.5,70,3,datsun pl510 21 | 26,4,97,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan 22 | 25,4,110,87,2672,17.5,70,2,peugeot 504 23 | 24,4,107,90,2430,14.5,70,2,audi 100 ls 24 | 25,4,104,95,2375,17.5,70,2,saab 99e 25 | 26,4,121,113,2234,12.5,70,2,bmw 2002 26 | 21,6,199,90,2648,15,70,1,amc gremlin 27 | 10,8,360,215,4615,14,70,1,ford f250 28 | 10,8,307,200,4376,15,70,1,chevy c20 29 | 11,8,318,210,4382,13.5,70,1,dodge d200 30 | 9,8,304,193,4732,18.5,70,1,hi 1200d 31 | 27,4,97,88,2130,14.5,71,3,datsun pl510 32 | 28,4,140,90,2264,15.5,71,1,chevrolet vega 2300 33 | 25,4,113,95,2228,14,71,3,toyota corona 34 | 25,4,98,?,2046,19,71,1,ford pinto 35 | 19,6,232,100,2634,13,71,1,amc gremlin 36 | 16,6,225,105,3439,15.5,71,1,plymouth satellite custom 37 | 17,6,250,100,3329,15.5,71,1,chevrolet chevelle malibu 38 | 19,6,250,88,3302,15.5,71,1,ford torino 500 39 | 18,6,232,100,3288,15.5,71,1,amc matador 40 | 14,8,350,165,4209,12,71,1,chevrolet impala 41 | 14,8,400,175,4464,11.5,71,1,pontiac catalina brougham 42 | 14,8,351,153,4154,13.5,71,1,ford galaxie 500 43 | 14,8,318,150,4096,13,71,1,plymouth fury iii 44 | 12,8,383,180,4955,11.5,71,1,dodge monaco (sw) 45 | 13,8,400,170,4746,12,71,1,ford country squire (sw) 46 | 13,8,400,175,5140,12,71,1,pontiac safari (sw) 47 | 18,6,258,110,2962,13.5,71,1,amc hornet sportabout (sw) 48 | 22,4,140,72,2408,19,71,1,chevrolet vega (sw) 49 | 19,6,250,100,3282,15,71,1,pontiac firebird 50 | 18,6,250,88,3139,14.5,71,1,ford mustang 51 | 23,4,122,86,2220,14,71,1,mercury capri 2000 52 | 28,4,116,90,2123,14,71,2,opel 1900 53 | 30,4,79,70,2074,19.5,71,2,peugeot 304 54 | 30,4,88,76,2065,14.5,71,2,fiat 124b 55 | 31,4,71,65,1773,19,71,3,toyota corolla 1200 56 | 35,4,72,69,1613,18,71,3,datsun 1200 57 | 27,4,97,60,1834,19,71,2,volkswagen model 111 58 | 26,4,91,70,1955,20.5,71,1,plymouth 
cricket 59 | 24,4,113,95,2278,15.5,72,3,toyota corona hardtop 60 | 25,4,97.5,80,2126,17,72,1,dodge colt hardtop 61 | 23,4,97,54,2254,23.5,72,2,volkswagen type 3 62 | 20,4,140,90,2408,19.5,72,1,chevrolet vega 63 | 21,4,122,86,2226,16.5,72,1,ford pinto runabout 64 | 13,8,350,165,4274,12,72,1,chevrolet impala 65 | 14,8,400,175,4385,12,72,1,pontiac catalina 66 | 15,8,318,150,4135,13.5,72,1,plymouth fury iii 67 | 14,8,351,153,4129,13,72,1,ford galaxie 500 68 | 17,8,304,150,3672,11.5,72,1,amc ambassador sst 69 | 11,8,429,208,4633,11,72,1,mercury marquis 70 | 13,8,350,155,4502,13.5,72,1,buick lesabre custom 71 | 12,8,350,160,4456,13.5,72,1,oldsmobile delta 88 royale 72 | 13,8,400,190,4422,12.5,72,1,chrysler newport royal 73 | 19,3,70,97,2330,13.5,72,3,mazda rx2 coupe 74 | 15,8,304,150,3892,12.5,72,1,amc matador (sw) 75 | 13,8,307,130,4098,14,72,1,chevrolet chevelle concours (sw) 76 | 13,8,302,140,4294,16,72,1,ford gran torino (sw) 77 | 14,8,318,150,4077,14,72,1,plymouth satellite custom (sw) 78 | 18,4,121,112,2933,14.5,72,2,volvo 145e (sw) 79 | 22,4,121,76,2511,18,72,2,volkswagen 411 (sw) 80 | 21,4,120,87,2979,19.5,72,2,peugeot 504 (sw) 81 | 26,4,96,69,2189,18,72,2,renault 12 (sw) 82 | 22,4,122,86,2395,16,72,1,ford pinto (sw) 83 | 28,4,97,92,2288,17,72,3,datsun 510 (sw) 84 | 23,4,120,97,2506,14.5,72,3,toyouta corona mark ii (sw) 85 | 28,4,98,80,2164,15,72,1,dodge colt (sw) 86 | 27,4,97,88,2100,16.5,72,3,toyota corolla 1600 (sw) 87 | 13,8,350,175,4100,13,73,1,buick century 350 88 | 14,8,304,150,3672,11.5,73,1,amc matador 89 | 13,8,350,145,3988,13,73,1,chevrolet malibu 90 | 14,8,302,137,4042,14.5,73,1,ford gran torino 91 | 15,8,318,150,3777,12.5,73,1,dodge coronet custom 92 | 12,8,429,198,4952,11.5,73,1,mercury marquis brougham 93 | 13,8,400,150,4464,12,73,1,chevrolet caprice classic 94 | 13,8,351,158,4363,13,73,1,ford ltd 95 | 14,8,318,150,4237,14.5,73,1,plymouth fury gran sedan 96 | 13,8,440,215,4735,11,73,1,chrysler new yorker brougham 97 | 12,8,455,225,4951,11,73,1,buick electra 225 custom 98 | 13,8,360,175,3821,11,73,1,amc ambassador brougham 99 | 18,6,225,105,3121,16.5,73,1,plymouth valiant 100 | 16,6,250,100,3278,18,73,1,chevrolet nova custom 101 | 18,6,232,100,2945,16,73,1,amc hornet 102 | 18,6,250,88,3021,16.5,73,1,ford maverick 103 | 23,6,198,95,2904,16,73,1,plymouth duster 104 | 26,4,97,46,1950,21,73,2,volkswagen super beetle 105 | 11,8,400,150,4997,14,73,1,chevrolet impala 106 | 12,8,400,167,4906,12.5,73,1,ford country 107 | 13,8,360,170,4654,13,73,1,plymouth custom suburb 108 | 12,8,350,180,4499,12.5,73,1,oldsmobile vista cruiser 109 | 18,6,232,100,2789,15,73,1,amc gremlin 110 | 20,4,97,88,2279,19,73,3,toyota carina 111 | 21,4,140,72,2401,19.5,73,1,chevrolet vega 112 | 22,4,108,94,2379,16.5,73,3,datsun 610 113 | 18,3,70,90,2124,13.5,73,3,maxda rx3 114 | 19,4,122,85,2310,18.5,73,1,ford pinto 115 | 21,6,155,107,2472,14,73,1,mercury capri v6 116 | 26,4,98,90,2265,15.5,73,2,fiat 124 sport coupe 117 | 15,8,350,145,4082,13,73,1,chevrolet monte carlo s 118 | 16,8,400,230,4278,9.5,73,1,pontiac grand prix 119 | 29,4,68,49,1867,19.5,73,2,fiat 128 120 | 24,4,116,75,2158,15.5,73,2,opel manta 121 | 20,4,114,91,2582,14,73,2,audi 100ls 122 | 19,4,121,112,2868,15.5,73,2,volvo 144ea 123 | 15,8,318,150,3399,11,73,1,dodge dart custom 124 | 24,4,121,110,2660,14,73,2,saab 99le 125 | 20,6,156,122,2807,13.5,73,3,toyota mark ii 126 | 11,8,350,180,3664,11,73,1,oldsmobile omega 127 | 20,6,198,95,3102,16.5,74,1,plymouth duster 128 | 21,6,200,?,2875,17,74,1,ford maverick 129 | 19,6,232,100,2901,16,74,1,amc hornet 
130 | 15,6,250,100,3336,17,74,1,chevrolet nova 131 | 31,4,79,67,1950,19,74,3,datsun b210 132 | 26,4,122,80,2451,16.5,74,1,ford pinto 133 | 32,4,71,65,1836,21,74,3,toyota corolla 1200 134 | 25,4,140,75,2542,17,74,1,chevrolet vega 135 | 16,6,250,100,3781,17,74,1,chevrolet chevelle malibu classic 136 | 16,6,258,110,3632,18,74,1,amc matador 137 | 18,6,225,105,3613,16.5,74,1,plymouth satellite sebring 138 | 16,8,302,140,4141,14,74,1,ford gran torino 139 | 13,8,350,150,4699,14.5,74,1,buick century luxus (sw) 140 | 14,8,318,150,4457,13.5,74,1,dodge coronet custom (sw) 141 | 14,8,302,140,4638,16,74,1,ford gran torino (sw) 142 | 14,8,304,150,4257,15.5,74,1,amc matador (sw) 143 | 29,4,98,83,2219,16.5,74,2,audi fox 144 | 26,4,79,67,1963,15.5,74,2,volkswagen dasher 145 | 26,4,97,78,2300,14.5,74,2,opel manta 146 | 31,4,76,52,1649,16.5,74,3,toyota corona 147 | 32,4,83,61,2003,19,74,3,datsun 710 148 | 28,4,90,75,2125,14.5,74,1,dodge colt 149 | 24,4,90,75,2108,15.5,74,2,fiat 128 150 | 26,4,116,75,2246,14,74,2,fiat 124 tc 151 | 24,4,120,97,2489,15,74,3,honda civic 152 | 26,4,108,93,2391,15.5,74,3,subaru 153 | 31,4,79,67,2000,16,74,2,fiat x1.9 154 | 19,6,225,95,3264,16,75,1,plymouth valiant custom 155 | 18,6,250,105,3459,16,75,1,chevrolet nova 156 | 15,6,250,72,3432,21,75,1,mercury monarch 157 | 15,6,250,72,3158,19.5,75,1,ford maverick 158 | 16,8,400,170,4668,11.5,75,1,pontiac catalina 159 | 15,8,350,145,4440,14,75,1,chevrolet bel air 160 | 16,8,318,150,4498,14.5,75,1,plymouth grand fury 161 | 14,8,351,148,4657,13.5,75,1,ford ltd 162 | 17,6,231,110,3907,21,75,1,buick century 163 | 16,6,250,105,3897,18.5,75,1,chevroelt chevelle malibu 164 | 15,6,258,110,3730,19,75,1,amc matador 165 | 18,6,225,95,3785,19,75,1,plymouth fury 166 | 21,6,231,110,3039,15,75,1,buick skyhawk 167 | 20,8,262,110,3221,13.5,75,1,chevrolet monza 2+2 168 | 13,8,302,129,3169,12,75,1,ford mustang ii 169 | 29,4,97,75,2171,16,75,3,toyota corolla 170 | 23,4,140,83,2639,17,75,1,ford pinto 171 | 20,6,232,100,2914,16,75,1,amc gremlin 172 | 23,4,140,78,2592,18.5,75,1,pontiac astro 173 | 24,4,134,96,2702,13.5,75,3,toyota corona 174 | 25,4,90,71,2223,16.5,75,2,volkswagen dasher 175 | 24,4,119,97,2545,17,75,3,datsun 710 176 | 18,6,171,97,2984,14.5,75,1,ford pinto 177 | 29,4,90,70,1937,14,75,2,volkswagen rabbit 178 | 19,6,232,90,3211,17,75,1,amc pacer 179 | 23,4,115,95,2694,15,75,2,audi 100ls 180 | 23,4,120,88,2957,17,75,2,peugeot 504 181 | 22,4,121,98,2945,14.5,75,2,volvo 244dl 182 | 25,4,121,115,2671,13.5,75,2,saab 99le 183 | 33,4,91,53,1795,17.5,75,3,honda civic cvcc 184 | 28,4,107,86,2464,15.5,76,2,fiat 131 185 | 25,4,116,81,2220,16.9,76,2,opel 1900 186 | 25,4,140,92,2572,14.9,76,1,capri ii 187 | 26,4,98,79,2255,17.7,76,1,dodge colt 188 | 27,4,101,83,2202,15.3,76,2,renault 12tl 189 | 17.5,8,305,140,4215,13,76,1,chevrolet chevelle malibu classic 190 | 16,8,318,150,4190,13,76,1,dodge coronet brougham 191 | 15.5,8,304,120,3962,13.9,76,1,amc matador 192 | 14.5,8,351,152,4215,12.8,76,1,ford gran torino 193 | 22,6,225,100,3233,15.4,76,1,plymouth valiant 194 | 22,6,250,105,3353,14.5,76,1,chevrolet nova 195 | 24,6,200,81,3012,17.6,76,1,ford maverick 196 | 22.5,6,232,90,3085,17.6,76,1,amc hornet 197 | 29,4,85,52,2035,22.2,76,1,chevrolet chevette 198 | 24.5,4,98,60,2164,22.1,76,1,chevrolet woody 199 | 29,4,90,70,1937,14.2,76,2,vw rabbit 200 | 33,4,91,53,1795,17.4,76,3,honda civic 201 | 20,6,225,100,3651,17.7,76,1,dodge aspen se 202 | 18,6,250,78,3574,21,76,1,ford granada ghia 203 | 18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj 204 | 
17.5,6,258,95,3193,17.8,76,1,amc pacer d/l 205 | 29.5,4,97,71,1825,12.2,76,2,volkswagen rabbit 206 | 32,4,85,70,1990,17,76,3,datsun b-210 207 | 28,4,97,75,2155,16.4,76,3,toyota corolla 208 | 26.5,4,140,72,2565,13.6,76,1,ford pinto 209 | 20,4,130,102,3150,15.7,76,2,volvo 245 210 | 13,8,318,150,3940,13.2,76,1,plymouth volare premier v8 211 | 19,4,120,88,3270,21.9,76,2,peugeot 504 212 | 19,6,156,108,2930,15.5,76,3,toyota mark ii 213 | 16.5,6,168,120,3820,16.7,76,2,mercedes-benz 280s 214 | 16.5,8,350,180,4380,12.1,76,1,cadillac seville 215 | 13,8,350,145,4055,12,76,1,chevy c10 216 | 13,8,302,130,3870,15,76,1,ford f108 217 | 13,8,318,150,3755,14,76,1,dodge d100 218 | 31.5,4,98,68,2045,18.5,77,3,honda accord cvcc 219 | 30,4,111,80,2155,14.8,77,1,buick opel isuzu deluxe 220 | 36,4,79,58,1825,18.6,77,2,renault 5 gtl 221 | 25.5,4,122,96,2300,15.5,77,1,plymouth arrow gs 222 | 33.5,4,85,70,1945,16.8,77,3,datsun f-10 hatchback 223 | 17.5,8,305,145,3880,12.5,77,1,chevrolet caprice classic 224 | 17,8,260,110,4060,19,77,1,oldsmobile cutlass supreme 225 | 15.5,8,318,145,4140,13.7,77,1,dodge monaco brougham 226 | 15,8,302,130,4295,14.9,77,1,mercury cougar brougham 227 | 17.5,6,250,110,3520,16.4,77,1,chevrolet concours 228 | 20.5,6,231,105,3425,16.9,77,1,buick skylark 229 | 19,6,225,100,3630,17.7,77,1,plymouth volare custom 230 | 18.5,6,250,98,3525,19,77,1,ford granada 231 | 16,8,400,180,4220,11.1,77,1,pontiac grand prix lj 232 | 15.5,8,350,170,4165,11.4,77,1,chevrolet monte carlo landau 233 | 15.5,8,400,190,4325,12.2,77,1,chrysler cordoba 234 | 16,8,351,149,4335,14.5,77,1,ford thunderbird 235 | 29,4,97,78,1940,14.5,77,2,volkswagen rabbit custom 236 | 24.5,4,151,88,2740,16,77,1,pontiac sunbird coupe 237 | 26,4,97,75,2265,18.2,77,3,toyota corolla liftback 238 | 25.5,4,140,89,2755,15.8,77,1,ford mustang ii 2+2 239 | 30.5,4,98,63,2051,17,77,1,chevrolet chevette 240 | 33.5,4,98,83,2075,15.9,77,1,dodge colt m/m 241 | 30,4,97,67,1985,16.4,77,3,subaru dl 242 | 30.5,4,97,78,2190,14.1,77,2,volkswagen dasher 243 | 22,6,146,97,2815,14.5,77,3,datsun 810 244 | 21.5,4,121,110,2600,12.8,77,2,bmw 320i 245 | 21.5,3,80,110,2720,13.5,77,3,mazda rx-4 246 | 43.1,4,90,48,1985,21.5,78,2,volkswagen rabbit custom diesel 247 | 36.1,4,98,66,1800,14.4,78,1,ford fiesta 248 | 32.8,4,78,52,1985,19.4,78,3,mazda glc deluxe 249 | 39.4,4,85,70,2070,18.6,78,3,datsun b210 gx 250 | 36.1,4,91,60,1800,16.4,78,3,honda civic cvcc 251 | 19.9,8,260,110,3365,15.5,78,1,oldsmobile cutlass salon brougham 252 | 19.4,8,318,140,3735,13.2,78,1,dodge diplomat 253 | 20.2,8,302,139,3570,12.8,78,1,mercury monarch ghia 254 | 19.2,6,231,105,3535,19.2,78,1,pontiac phoenix lj 255 | 20.5,6,200,95,3155,18.2,78,1,chevrolet malibu 256 | 20.2,6,200,85,2965,15.8,78,1,ford fairmont (auto) 257 | 25.1,4,140,88,2720,15.4,78,1,ford fairmont (man) 258 | 20.5,6,225,100,3430,17.2,78,1,plymouth volare 259 | 19.4,6,232,90,3210,17.2,78,1,amc concord 260 | 20.6,6,231,105,3380,15.8,78,1,buick century special 261 | 20.8,6,200,85,3070,16.7,78,1,mercury zephyr 262 | 18.6,6,225,110,3620,18.7,78,1,dodge aspen 263 | 18.1,6,258,120,3410,15.1,78,1,amc concord d/l 264 | 19.2,8,305,145,3425,13.2,78,1,chevrolet monte carlo landau 265 | 17.7,6,231,165,3445,13.4,78,1,buick regal sport coupe (turbo) 266 | 18.1,8,302,139,3205,11.2,78,1,ford futura 267 | 17.5,8,318,140,4080,13.7,78,1,dodge magnum xe 268 | 30,4,98,68,2155,16.5,78,1,chevrolet chevette 269 | 27.5,4,134,95,2560,14.2,78,3,toyota corona 270 | 27.2,4,119,97,2300,14.7,78,3,datsun 510 271 | 30.9,4,105,75,2230,14.5,78,1,dodge omni 272 | 
21.1,4,134,95,2515,14.8,78,3,toyota celica gt liftback 273 | 23.2,4,156,105,2745,16.7,78,1,plymouth sapporo 274 | 23.8,4,151,85,2855,17.6,78,1,oldsmobile starfire sx 275 | 23.9,4,119,97,2405,14.9,78,3,datsun 200-sx 276 | 20.3,5,131,103,2830,15.9,78,2,audi 5000 277 | 17,6,163,125,3140,13.6,78,2,volvo 264gl 278 | 21.6,4,121,115,2795,15.7,78,2,saab 99gle 279 | 16.2,6,163,133,3410,15.8,78,2,peugeot 604sl 280 | 31.5,4,89,71,1990,14.9,78,2,volkswagen scirocco 281 | 29.5,4,98,68,2135,16.6,78,3,honda accord lx 282 | 21.5,6,231,115,3245,15.4,79,1,pontiac lemans v6 283 | 19.8,6,200,85,2990,18.2,79,1,mercury zephyr 6 284 | 22.3,4,140,88,2890,17.3,79,1,ford fairmont 4 285 | 20.2,6,232,90,3265,18.2,79,1,amc concord dl 6 286 | 20.6,6,225,110,3360,16.6,79,1,dodge aspen 6 287 | 17,8,305,130,3840,15.4,79,1,chevrolet caprice classic 288 | 17.6,8,302,129,3725,13.4,79,1,ford ltd landau 289 | 16.5,8,351,138,3955,13.2,79,1,mercury grand marquis 290 | 18.2,8,318,135,3830,15.2,79,1,dodge st. regis 291 | 16.9,8,350,155,4360,14.9,79,1,buick estate wagon (sw) 292 | 15.5,8,351,142,4054,14.3,79,1,ford country squire (sw) 293 | 19.2,8,267,125,3605,15,79,1,chevrolet malibu classic (sw) 294 | 18.5,8,360,150,3940,13,79,1,chrysler lebaron town @ country (sw) 295 | 31.9,4,89,71,1925,14,79,2,vw rabbit custom 296 | 34.1,4,86,65,1975,15.2,79,3,maxda glc deluxe 297 | 35.7,4,98,80,1915,14.4,79,1,dodge colt hatchback custom 298 | 27.4,4,121,80,2670,15,79,1,amc spirit dl 299 | 25.4,5,183,77,3530,20.1,79,2,mercedes benz 300d 300 | 23,8,350,125,3900,17.4,79,1,cadillac eldorado 301 | 27.2,4,141,71,3190,24.8,79,2,peugeot 504 302 | 23.9,8,260,90,3420,22.2,79,1,oldsmobile cutlass salon brougham 303 | 34.2,4,105,70,2200,13.2,79,1,plymouth horizon 304 | 34.5,4,105,70,2150,14.9,79,1,plymouth horizon tc3 305 | 31.8,4,85,65,2020,19.2,79,3,datsun 210 306 | 37.3,4,91,69,2130,14.7,79,2,fiat strada custom 307 | 28.4,4,151,90,2670,16,79,1,buick skylark limited 308 | 28.8,6,173,115,2595,11.3,79,1,chevrolet citation 309 | 26.8,6,173,115,2700,12.9,79,1,oldsmobile omega brougham 310 | 33.5,4,151,90,2556,13.2,79,1,pontiac phoenix 311 | 41.5,4,98,76,2144,14.7,80,2,vw rabbit 312 | 38.1,4,89,60,1968,18.8,80,3,toyota corolla tercel 313 | 32.1,4,98,70,2120,15.5,80,1,chevrolet chevette 314 | 37.2,4,86,65,2019,16.4,80,3,datsun 310 315 | 28,4,151,90,2678,16.5,80,1,chevrolet citation 316 | 26.4,4,140,88,2870,18.1,80,1,ford fairmont 317 | 24.3,4,151,90,3003,20.1,80,1,amc concord 318 | 19.1,6,225,90,3381,18.7,80,1,dodge aspen 319 | 34.3,4,97,78,2188,15.8,80,2,audi 4000 320 | 29.8,4,134,90,2711,15.5,80,3,toyota corona liftback 321 | 31.3,4,120,75,2542,17.5,80,3,mazda 626 322 | 37,4,119,92,2434,15,80,3,datsun 510 hatchback 323 | 32.2,4,108,75,2265,15.2,80,3,toyota corolla 324 | 46.6,4,86,65,2110,17.9,80,3,mazda glc 325 | 27.9,4,156,105,2800,14.4,80,1,dodge colt 326 | 40.8,4,85,65,2110,19.2,80,3,datsun 210 327 | 44.3,4,90,48,2085,21.7,80,2,vw rabbit c (diesel) 328 | 43.4,4,90,48,2335,23.7,80,2,vw dasher (diesel) 329 | 36.4,5,121,67,2950,19.9,80,2,audi 5000s (diesel) 330 | 30,4,146,67,3250,21.8,80,2,mercedes-benz 240d 331 | 44.6,4,91,67,1850,13.8,80,3,honda civic 1500 gl 332 | 40.9,4,85,?,1835,17.3,80,2,renault lecar deluxe 333 | 33.8,4,97,67,2145,18,80,3,subaru dl 334 | 29.8,4,89,62,1845,15.3,80,2,vokswagen rabbit 335 | 32.7,6,168,132,2910,11.4,80,3,datsun 280-zx 336 | 23.7,3,70,100,2420,12.5,80,3,mazda rx-7 gs 337 | 35,4,122,88,2500,15.1,80,2,triumph tr7 coupe 338 | 23.6,4,140,?,2905,14.3,80,1,ford mustang cobra 339 | 32.4,4,107,72,2290,17,80,3,honda accord 340 | 
27.2,4,135,84,2490,15.7,81,1,plymouth reliant 341 | 26.6,4,151,84,2635,16.4,81,1,buick skylark 342 | 25.8,4,156,92,2620,14.4,81,1,dodge aries wagon (sw) 343 | 23.5,6,173,110,2725,12.6,81,1,chevrolet citation 344 | 30,4,135,84,2385,12.9,81,1,plymouth reliant 345 | 39.1,4,79,58,1755,16.9,81,3,toyota starlet 346 | 39,4,86,64,1875,16.4,81,1,plymouth champ 347 | 35.1,4,81,60,1760,16.1,81,3,honda civic 1300 348 | 32.3,4,97,67,2065,17.8,81,3,subaru 349 | 37,4,85,65,1975,19.4,81,3,datsun 210 mpg 350 | 37.7,4,89,62,2050,17.3,81,3,toyota tercel 351 | 34.1,4,91,68,1985,16,81,3,mazda glc 4 352 | 34.7,4,105,63,2215,14.9,81,1,plymouth horizon 4 353 | 34.4,4,98,65,2045,16.2,81,1,ford escort 4w 354 | 29.9,4,98,65,2380,20.7,81,1,ford escort 2h 355 | 33,4,105,74,2190,14.2,81,2,volkswagen jetta 356 | 34.5,4,100,?,2320,15.8,81,2,renault 18i 357 | 33.7,4,107,75,2210,14.4,81,3,honda prelude 358 | 32.4,4,108,75,2350,16.8,81,3,toyota corolla 359 | 32.9,4,119,100,2615,14.8,81,3,datsun 200sx 360 | 31.6,4,120,74,2635,18.3,81,3,mazda 626 361 | 28.1,4,141,80,3230,20.4,81,2,peugeot 505s turbo diesel 362 | 30.7,6,145,76,3160,19.6,81,2,volvo diesel 363 | 25.4,6,168,116,2900,12.6,81,3,toyota cressida 364 | 24.2,6,146,120,2930,13.8,81,3,datsun 810 maxima 365 | 22.4,6,231,110,3415,15.8,81,1,buick century 366 | 26.6,8,350,105,3725,19,81,1,oldsmobile cutlass ls 367 | 20.2,6,200,88,3060,17.1,81,1,ford granada gl 368 | 17.6,6,225,85,3465,16.6,81,1,chrysler lebaron salon 369 | 28,4,112,88,2605,19.6,82,1,chevrolet cavalier 370 | 27,4,112,88,2640,18.6,82,1,chevrolet cavalier wagon 371 | 34,4,112,88,2395,18,82,1,chevrolet cavalier 2-door 372 | 31,4,112,85,2575,16.2,82,1,pontiac j2000 se hatchback 373 | 29,4,135,84,2525,16,82,1,dodge aries se 374 | 27,4,151,90,2735,18,82,1,pontiac phoenix 375 | 24,4,140,92,2865,16.4,82,1,ford fairmont futura 376 | 36,4,105,74,1980,15.3,82,2,volkswagen rabbit l 377 | 37,4,91,68,2025,18.2,82,3,mazda glc custom l 378 | 31,4,91,68,1970,17.6,82,3,mazda glc custom 379 | 38,4,105,63,2125,14.7,82,1,plymouth horizon miser 380 | 36,4,98,70,2125,17.3,82,1,mercury lynx l 381 | 36,4,120,88,2160,14.5,82,3,nissan stanza xe 382 | 36,4,107,75,2205,14.5,82,3,honda accord 383 | 34,4,108,70,2245,16.9,82,3,toyota corolla 384 | 38,4,91,67,1965,15,82,3,honda civic 385 | 32,4,91,67,1965,15.7,82,3,honda civic (auto) 386 | 38,4,91,67,1995,16.2,82,3,datsun 310 gx 387 | 25,6,181,110,2945,16.4,82,1,buick century limited 388 | 38,6,262,85,3015,17,82,1,oldsmobile cutlass ciera (diesel) 389 | 26,4,156,92,2585,14.5,82,1,chrysler lebaron medallion 390 | 22,6,232,112,2835,14.7,82,1,ford granada l 391 | 32,4,144,96,2665,13.9,82,3,toyota celica gt 392 | 36,4,135,84,2370,13,82,1,dodge charger 2.2 393 | 27,4,151,90,2950,17.3,82,1,chevrolet camaro 394 | 27,4,140,86,2790,15.6,82,1,ford mustang gl 395 | 44,4,97,52,2130,24.6,82,2,vw pickup 396 | 32,4,135,84,2295,11.6,82,1,dodge rampage 397 | 28,4,120,79,2625,18.6,82,1,ford ranger 398 | 31,4,119,82,2720,19.4,82,1,chevy s-10 399 | -------------------------------------------------------------------------------- /data/BrainCancer.csv: -------------------------------------------------------------------------------- 1 | "sex","diagnosis","loc","ki","gtv","stereo","status","time" 2 | "Female","Meningioma","Infratentorial",90,6.11,"SRS",0,57.64 3 | "Male","HG glioma","Supratentorial",90,19.35,"SRT",1,8.98 4 | "Female","Meningioma","Infratentorial",70,7.95,"SRS",0,26.46 5 | "Female","LG glioma","Supratentorial",80,7.61,"SRT",1,47.8 6 | "Male","HG glioma","Supratentorial",90,5.06,"SRT",1,6.3 7 | 
"Female","Meningioma","Supratentorial",80,4.82,"SRS",0,52.75 8 | "Male","Meningioma","Supratentorial",80,3.19,"SRT",0,55.8 9 | "Male","LG glioma","Supratentorial",80,12.37,"SRT",0,42.1 10 | "Female","Meningioma","Supratentorial",70,12.16,"SRT",0,34.66 11 | "Male","HG glioma","Supratentorial",100,2.53,"SRT",0,11.48 12 | "Male","LG glioma","Supratentorial",80,0.14,"SRT",1,35.93 13 | "Female","Meningioma","Infratentorial",90,6.54,"SRS",0,34.26 14 | "Female","Meningioma","Infratentorial",90,0.63,"SRS",0,32.98 15 | "Male",NA,"Supratentorial",90,6.38,"SRT",0,50.85 16 | "Female","Meningioma","Supratentorial",60,9.18,"SRT",0,41.44 17 | "Female","HG glioma","Supratentorial",70,11.38,"SRS",1,7.05 18 | "Female","Other","Infratentorial",60,24,"SRT",1,6.82 19 | "Male","HG glioma","Supratentorial",90,10.8,"SRT",0,82.56 20 | "Male","Meningioma","Supratentorial",80,13.49,"SRS",1,6.92 21 | "Female","Meningioma","Supratentorial",90,2.5,"SRT",0,30.16 22 | "Female","Meningioma","Supratentorial",80,2.82,"SRS",0,24.39 23 | "Male","HG glioma","Supratentorial",70,14.44,"SRT",1,14 24 | "Female","Other","Infratentorial",80,2.11,"SRS",0,10.49 25 | "Female","Meningioma","Infratentorial",100,2.13,"SRS",1,51.02 26 | "Female","Meningioma","Supratentorial",70,6.48,"SRT",1,33.41 27 | "Male","LG glioma","Supratentorial",90,4.23,"SRT",1,25.02 28 | "Male","Other","Supratentorial",60,34.64,"SRT",1,11.57 29 | "Male","HG glioma","Supratentorial",70,33.69,"SRT",1,0.07 30 | "Male","Meningioma","Supratentorial",60,3.81,"SRT",0,36.1 31 | "Female","Meningioma","Supratentorial",90,4.72,"SRS",0,65.02 32 | "Female","LG glioma","Supratentorial",80,0.85,"SRS",1,6.1 33 | "Male","Meningioma","Supratentorial",90,2.56,"SRS",0,44.39 34 | "Female","Other","Infratentorial",70,13.45,"SRT",1,10.82 35 | "Male","Other","Infratentorial",80,6.81,"SRS",0,57.11 36 | "Female","Meningioma","Supratentorial",90,7.3,"SRT",0,5.51 37 | "Female","Other","Supratentorial",70,14.26,"SRT",0,7.18 38 | "Female","Meningioma","Supratentorial",80,6.6,"SRT",0,14.75 39 | "Male","HG glioma","Supratentorial",90,9.95,"SRT",1,6.23 40 | "Male","Other","Infratentorial",80,12.51,"SRT",1,29.7 41 | "Female","Meningioma","Supratentorial",90,2.54,"SRT",0,45.74 42 | "Female","Meningioma","Supratentorial",80,1.57,"SRT",0,2.03 43 | "Male","HG glioma","Supratentorial",90,0.28,"SRT",1,16.43 44 | "Female","Meningioma","Supratentorial",70,6.7,"SRT",0,14.56 45 | "Male","Meningioma","Supratentorial",80,12.63,"SRT",1,4.16 46 | "Male","Other","Infratentorial",90,3.12,"SRT",0,18.95 47 | "Male","Meningioma","Supratentorial",60,7.09,"SRS",1,31.25 48 | "Male","HG glioma","Supratentorial",80,29.27,"SRT",0,5.15 49 | "Female","Meningioma","Supratentorial",80,26.31,"SRT",1,39.54 50 | "Male","Meningioma","Supratentorial",70,0.97,"SRT",1,1.41 51 | "Female","LG glioma","Supratentorial",80,0.19,"SRS",0,11.51 52 | "Female","HG glioma","Supratentorial",90,0.04,"SRT",0,31.67 53 | "Female","Meningioma","Infratentorial",90,9.24,"SRT",0,26.85 54 | "Male","HG glioma","Supratentorial",90,2.5,"SRT",1,9.77 55 | "Male","Meningioma","Infratentorial",80,24.41,"SRT",0,39.54 56 | "Female","HG glioma","Supratentorial",80,0.63,"SRT",1,16.92 57 | "Male","Other","Infratentorial",90,0.48,"SRS",0,54.43 58 | "Male","HG glioma","Infratentorial",80,0.22,"SRS",0,33.67 59 | "Male","HG glioma","Supratentorial",80,3.75,"SRT",1,19.9 60 | "Female","Other","Supratentorial",80,11.83,"SRT",1,22.03 61 | "Female","Meningioma","Supratentorial",90,2.47,"SRT",0,17.57 62 | "Female","HG glioma","Supratentorial",80,12.08,"SRT",1,7.25 63 | 
"Male","Meningioma","Supratentorial",80,11.51,"SRT",1,14.62 64 | "Female","HG glioma","Supratentorial",40,22.87,"SRT",1,3.38 65 | "Male","Meningioma","Supratentorial",80,4.77,"SRT",0,67.38 66 | "Male","LG glioma","Supratentorial",80,9.58,"SRT",0,78.75 67 | "Female","Meningioma","Supratentorial",100,4,"SRT",0,52.23 68 | "Female","HG glioma","Supratentorial",80,7.59,"SRT",1,4.56 69 | "Male","Other","Infratentorial",70,0.01,"SRS",0,23.67 70 | "Female","Meningioma","Supratentorial",80,6.93,"SRS",0,10.1 71 | "Female","Meningioma","Supratentorial",70,3.63,"SRT",0,32.82 72 | "Male","Meningioma","Supratentorial",70,8.45,"SRT",0,19.41 73 | "Male","Meningioma","Supratentorial",80,20.93,"SRT",1,31.15 74 | "Male","LG glioma","Supratentorial",90,2.64,"SRT",0,20.13 75 | "Female","HG glioma","Supratentorial",80,0.19,"SRT",1,11.02 76 | "Male","Other","Supratentorial",100,24.91,"SRT",0,19.74 77 | "Female","Meningioma","Supratentorial",80,31.74,"SRT",0,57.25 78 | "Female","Meningioma","Supratentorial",80,2.39,"SRS",0,73.74 79 | "Female","Meningioma","Supratentorial",90,7.26,"SRT",0,49.05 80 | "Female","Meningioma","Supratentorial",100,9.66,"SRT",0,39.25 81 | "Female","Meningioma","Infratentorial",70,2.94,"SRS",0,1.54 82 | "Female","HG glioma","Supratentorial",80,15.45,"SRT",1,46.16 83 | "Female","Other","Supratentorial",90,1.82,"SRT",0,47.11 84 | "Male","LG glioma","Infratentorial",90,30.41,"SRT",0,1.18 85 | "Male","HG glioma","Supratentorial",80,0.16,"SRT",1,20.69 86 | "Male","HG glioma","Supratentorial",80,19.81,"SRT",1,6.39 87 | "Male","Meningioma","Supratentorial",90,2.5,"SRT",0,32.82 88 | "Male","Meningioma","Supratentorial",90,2.02,"SRS",0,42.07 89 | "Male","Other","Infratentorial",80,0.11,"SRT",0,13.9 90 | -------------------------------------------------------------------------------- /data/Carseats.csv: -------------------------------------------------------------------------------- 1 | Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US 9.5,138,73,11,276,120,Bad,42,17,Yes,Yes 11.22,111,48,16,260,83,Good,65,10,Yes,Yes 10.06,113,35,10,269,80,Medium,59,12,Yes,Yes 7.4,117,100,4,466,97,Medium,55,14,Yes,Yes 4.15,141,64,3,340,128,Bad,38,13,Yes,No 10.81,124,113,13,501,72,Bad,78,16,No,Yes 6.63,115,105,0,45,108,Medium,71,15,Yes,No 11.85,136,81,15,425,120,Good,67,10,Yes,Yes 6.54,132,110,0,108,124,Medium,76,10,No,No 4.69,132,113,0,131,124,Medium,76,17,No,Yes 9.01,121,78,9,150,100,Bad,26,10,No,Yes 11.96,117,94,4,503,94,Good,50,13,Yes,Yes 3.98,122,35,2,393,136,Medium,62,18,Yes,No 10.96,115,28,11,29,86,Good,53,18,Yes,Yes 11.17,107,117,11,148,118,Good,52,18,Yes,Yes 8.71,149,95,5,400,144,Medium,76,18,No,No 7.58,118,32,0,284,110,Good,63,13,Yes,No 12.29,147,74,13,251,131,Good,52,10,Yes,Yes 13.91,110,110,0,408,68,Good,46,17,No,Yes 8.73,129,76,16,58,121,Medium,69,12,Yes,Yes 6.41,125,90,2,367,131,Medium,35,18,Yes,Yes 12.13,134,29,12,239,109,Good,62,18,No,Yes 5.08,128,46,6,497,138,Medium,42,13,Yes,No 5.87,121,31,0,292,109,Medium,79,10,Yes,No 10.14,145,119,16,294,113,Bad,42,12,Yes,Yes 14.9,139,32,0,176,82,Good,54,11,No,No 8.33,107,115,11,496,131,Good,50,11,No,Yes 5.27,98,118,0,19,107,Medium,64,17,Yes,No 2.99,103,74,0,359,97,Bad,55,11,Yes,Yes 7.81,104,99,15,226,102,Bad,58,17,Yes,Yes 13.55,125,94,0,447,89,Good,30,12,Yes,No 8.25,136,58,16,241,131,Medium,44,18,Yes,Yes 6.2,107,32,12,236,137,Good,64,10,No,Yes 8.77,114,38,13,317,128,Good,50,16,Yes,Yes 2.67,115,54,0,406,128,Medium,42,17,Yes,Yes 11.07,131,84,11,29,96,Medium,44,17,No,Yes 8.89,122,76,0,270,100,Good,60,18,No,No 
4.95,121,41,5,412,110,Medium,54,10,Yes,Yes 6.59,109,73,0,454,102,Medium,65,15,Yes,No 3.24,130,60,0,144,138,Bad,38,10,No,No 2.07,119,98,0,18,126,Bad,73,17,No,No 7.96,157,53,0,403,124,Bad,58,16,Yes,No 10.43,77,69,0,25,24,Medium,50,18,Yes,No 4.12,123,42,11,16,134,Medium,59,13,Yes,Yes 4.16,85,79,6,325,95,Medium,69,13,Yes,Yes 4.56,141,63,0,168,135,Bad,44,12,Yes,Yes 12.44,127,90,14,16,70,Medium,48,15,No,Yes 4.38,126,98,0,173,108,Bad,55,16,Yes,No 3.91,116,52,0,349,98,Bad,69,18,Yes,No 10.61,157,93,0,51,149,Good,32,17,Yes,No 1.42,99,32,18,341,108,Bad,80,16,Yes,Yes 4.42,121,90,0,150,108,Bad,75,16,Yes,No 7.91,153,40,3,112,129,Bad,39,18,Yes,Yes 6.92,109,64,13,39,119,Medium,61,17,Yes,Yes 4.9,134,103,13,25,144,Medium,76,17,No,Yes 6.85,143,81,5,60,154,Medium,61,18,Yes,Yes 11.91,133,82,0,54,84,Medium,50,17,Yes,No 0.91,93,91,0,22,117,Bad,75,11,Yes,No 5.42,103,93,15,188,103,Bad,74,16,Yes,Yes 5.21,118,71,4,148,114,Medium,80,13,Yes,No 8.32,122,102,19,469,123,Bad,29,13,Yes,Yes 7.32,105,32,0,358,107,Medium,26,13,No,No 1.82,139,45,0,146,133,Bad,77,17,Yes,Yes 8.47,119,88,10,170,101,Medium,61,13,Yes,Yes 7.8,100,67,12,184,104,Medium,32,16,No,Yes 4.9,122,26,0,197,128,Medium,55,13,No,No 8.85,127,92,0,508,91,Medium,56,18,Yes,No 9.01,126,61,14,152,115,Medium,47,16,Yes,Yes 13.39,149,69,20,366,134,Good,60,13,Yes,Yes 7.99,127,59,0,339,99,Medium,65,12,Yes,No 9.46,89,81,15,237,99,Good,74,12,Yes,Yes 6.5,148,51,16,148,150,Medium,58,17,No,Yes 5.52,115,45,0,432,116,Medium,25,15,Yes,No 12.61,118,90,10,54,104,Good,31,11,No,Yes 6.2,150,68,5,125,136,Medium,64,13,No,Yes 8.55,88,111,23,480,92,Bad,36,16,No,Yes 10.64,102,87,10,346,70,Medium,64,15,Yes,Yes 7.7,118,71,12,44,89,Medium,67,18,No,Yes 4.43,134,48,1,139,145,Medium,65,12,Yes,Yes 9.14,134,67,0,286,90,Bad,41,13,Yes,No 8.01,113,100,16,353,79,Bad,68,11,Yes,Yes 7.52,116,72,0,237,128,Good,70,13,Yes,No 11.62,151,83,4,325,139,Good,28,17,Yes,Yes 4.42,109,36,7,468,94,Bad,56,11,Yes,Yes 2.23,111,25,0,52,121,Bad,43,18,No,No 8.47,125,103,0,304,112,Medium,49,13,No,No 8.7,150,84,9,432,134,Medium,64,15,Yes,No 11.7,131,67,7,272,126,Good,54,16,No,Yes 6.56,117,42,7,144,111,Medium,62,10,Yes,Yes 7.95,128,66,3,493,119,Medium,45,16,No,No 5.33,115,22,0,491,103,Medium,64,11,No,No 4.81,97,46,11,267,107,Medium,80,15,Yes,Yes 4.53,114,113,0,97,125,Medium,29,12,Yes,No 8.86,145,30,0,67,104,Medium,55,17,Yes,No 8.39,115,97,5,134,84,Bad,55,11,Yes,Yes 5.58,134,25,10,237,148,Medium,59,13,Yes,Yes 9.48,147,42,10,407,132,Good,73,16,No,Yes 7.45,161,82,5,287,129,Bad,33,16,Yes,Yes 12.49,122,77,24,382,127,Good,36,16,No,Yes 4.88,121,47,3,220,107,Bad,56,16,No,Yes 4.11,113,69,11,94,106,Medium,76,12,No,Yes 6.2,128,93,0,89,118,Medium,34,18,Yes,No 5.3,113,22,0,57,97,Medium,65,16,No,No 5.07,123,91,0,334,96,Bad,78,17,Yes,Yes 4.62,121,96,0,472,138,Medium,51,12,Yes,No 5.55,104,100,8,398,97,Medium,61,11,Yes,Yes 0.16,102,33,0,217,139,Medium,70,18,No,No 8.55,134,107,0,104,108,Medium,60,12,Yes,No 3.47,107,79,2,488,103,Bad,65,16,Yes,No 8.98,115,65,0,217,90,Medium,60,17,No,No 9,128,62,7,125,116,Medium,43,14,Yes,Yes 6.62,132,118,12,272,151,Medium,43,14,Yes,Yes 6.67,116,99,5,298,125,Good,62,12,Yes,Yes 6.01,131,29,11,335,127,Bad,33,12,Yes,Yes 9.31,122,87,9,17,106,Medium,65,13,Yes,Yes 8.54,139,35,0,95,129,Medium,42,13,Yes,No 5.08,135,75,0,202,128,Medium,80,10,No,No 8.8,145,53,0,507,119,Medium,41,12,Yes,No 7.57,112,88,2,243,99,Medium,62,11,Yes,Yes 7.37,130,94,8,137,128,Medium,64,12,Yes,Yes 6.87,128,105,11,249,131,Medium,63,13,Yes,Yes 11.67,125,89,10,380,87,Bad,28,10,Yes,Yes 6.88,119,100,5,45,108,Medium,75,10,Yes,Yes 
8.19,127,103,0,125,155,Good,29,15,No,Yes 8.87,131,113,0,181,120,Good,63,14,Yes,No 9.34,89,78,0,181,49,Medium,43,15,No,No 11.27,153,68,2,60,133,Good,59,16,Yes,Yes 6.52,125,48,3,192,116,Medium,51,14,Yes,Yes 4.96,133,100,3,350,126,Bad,55,13,Yes,Yes 4.47,143,120,7,279,147,Bad,40,10,No,Yes 8.41,94,84,13,497,77,Medium,51,12,Yes,Yes 6.5,108,69,3,208,94,Medium,77,16,Yes,No 9.54,125,87,9,232,136,Good,72,10,Yes,Yes 7.62,132,98,2,265,97,Bad,62,12,Yes,Yes 3.67,132,31,0,327,131,Medium,76,16,Yes,No 6.44,96,94,14,384,120,Medium,36,18,No,Yes 5.17,131,75,0,10,120,Bad,31,18,No,No 6.52,128,42,0,436,118,Medium,80,11,Yes,No 10.27,125,103,12,371,109,Medium,44,10,Yes,Yes 12.3,146,62,10,310,94,Medium,30,13,No,Yes 6.03,133,60,10,277,129,Medium,45,18,Yes,Yes 6.53,140,42,0,331,131,Bad,28,15,Yes,No 7.44,124,84,0,300,104,Medium,77,15,Yes,No 0.53,122,88,7,36,159,Bad,28,17,Yes,Yes 9.09,132,68,0,264,123,Good,34,11,No,No 8.77,144,63,11,27,117,Medium,47,17,Yes,Yes 3.9,114,83,0,412,131,Bad,39,14,Yes,No 10.51,140,54,9,402,119,Good,41,16,No,Yes 7.56,110,119,0,384,97,Medium,72,14,No,Yes 11.48,121,120,13,140,87,Medium,56,11,Yes,Yes 10.49,122,84,8,176,114,Good,57,10,No,Yes 10.77,111,58,17,407,103,Good,75,17,No,Yes 7.64,128,78,0,341,128,Good,45,13,No,No 5.93,150,36,7,488,150,Medium,25,17,No,Yes 6.89,129,69,10,289,110,Medium,50,16,No,Yes 7.71,98,72,0,59,69,Medium,65,16,Yes,No 7.49,146,34,0,220,157,Good,51,16,Yes,No 10.21,121,58,8,249,90,Medium,48,13,No,Yes 12.53,142,90,1,189,112,Good,39,10,No,Yes 9.32,119,60,0,372,70,Bad,30,18,No,No 4.67,111,28,0,486,111,Medium,29,12,No,No 2.93,143,21,5,81,160,Medium,67,12,No,Yes 3.63,122,74,0,424,149,Medium,51,13,Yes,No 5.68,130,64,0,40,106,Bad,39,17,No,No 8.22,148,64,0,58,141,Medium,27,13,No,Yes 0.37,147,58,7,100,191,Bad,27,15,Yes,Yes 6.71,119,67,17,151,137,Medium,55,11,Yes,Yes 6.71,106,73,0,216,93,Medium,60,13,Yes,No 7.3,129,89,0,425,117,Medium,45,10,Yes,No 11.48,104,41,15,492,77,Good,73,18,Yes,Yes 8.01,128,39,12,356,118,Medium,71,10,Yes,Yes 12.49,93,106,12,416,55,Medium,75,15,Yes,Yes 9.03,104,102,13,123,110,Good,35,16,Yes,Yes 6.38,135,91,5,207,128,Medium,66,18,Yes,Yes 0,139,24,0,358,185,Medium,79,15,No,No 7.54,115,89,0,38,122,Medium,25,12,Yes,No 5.61,138,107,9,480,154,Medium,47,11,No,Yes 10.48,138,72,0,148,94,Medium,27,17,Yes,Yes 10.66,104,71,14,89,81,Medium,25,14,No,Yes 7.78,144,25,3,70,116,Medium,77,18,Yes,Yes 4.94,137,112,15,434,149,Bad,66,13,Yes,Yes 7.43,121,83,0,79,91,Medium,68,11,Yes,No 4.74,137,60,4,230,140,Bad,25,13,Yes,No 5.32,118,74,6,426,102,Medium,80,18,Yes,Yes 9.95,132,33,7,35,97,Medium,60,11,No,Yes 10.07,130,100,11,449,107,Medium,64,10,Yes,Yes 8.68,120,51,0,93,86,Medium,46,17,No,No 6.03,117,32,0,142,96,Bad,62,17,Yes,No 8.07,116,37,0,426,90,Medium,76,15,Yes,No 12.11,118,117,18,509,104,Medium,26,15,No,Yes 8.79,130,37,13,297,101,Medium,37,13,No,Yes 6.67,156,42,13,170,173,Good,74,14,Yes,Yes 7.56,108,26,0,408,93,Medium,56,14,No,No 13.28,139,70,7,71,96,Good,61,10,Yes,Yes 7.23,112,98,18,481,128,Medium,45,11,Yes,Yes 4.19,117,93,4,420,112,Bad,66,11,Yes,Yes 4.1,130,28,6,410,133,Bad,72,16,Yes,Yes 2.52,124,61,0,333,138,Medium,76,16,Yes,No 3.62,112,80,5,500,128,Medium,69,10,Yes,Yes 6.42,122,88,5,335,126,Medium,64,14,Yes,Yes 5.56,144,92,0,349,146,Medium,62,12,No,No 5.94,138,83,0,139,134,Medium,54,18,Yes,No 4.1,121,78,4,413,130,Bad,46,10,No,Yes 2.05,131,82,0,132,157,Bad,25,14,Yes,No 8.74,155,80,0,237,124,Medium,37,14,Yes,No 5.68,113,22,1,317,132,Medium,28,12,Yes,No 4.97,162,67,0,27,160,Medium,77,17,Yes,Yes 8.19,111,105,0,466,97,Bad,61,10,No,No 7.78,86,54,0,497,64,Bad,33,12,Yes,No 
3.02,98,21,11,326,90,Bad,76,11,No,Yes 4.36,125,41,2,357,123,Bad,47,14,No,Yes 9.39,117,118,14,445,120,Medium,32,15,Yes,Yes 12.04,145,69,19,501,105,Medium,45,11,Yes,Yes 8.23,149,84,5,220,139,Medium,33,10,Yes,Yes 4.83,115,115,3,48,107,Medium,73,18,Yes,Yes 2.34,116,83,15,170,144,Bad,71,11,Yes,Yes 5.73,141,33,0,243,144,Medium,34,17,Yes,No 4.34,106,44,0,481,111,Medium,70,14,No,No 9.7,138,61,12,156,120,Medium,25,14,Yes,Yes 10.62,116,79,19,359,116,Good,58,17,Yes,Yes 10.59,131,120,15,262,124,Medium,30,10,Yes,Yes 6.43,124,44,0,125,107,Medium,80,11,Yes,No 7.49,136,119,6,178,145,Medium,35,13,Yes,Yes 3.45,110,45,9,276,125,Medium,62,14,Yes,Yes 4.1,134,82,0,464,141,Medium,48,13,No,No 6.68,107,25,0,412,82,Bad,36,14,Yes,No 7.8,119,33,0,245,122,Good,56,14,Yes,No 8.69,113,64,10,68,101,Medium,57,16,Yes,Yes 5.4,149,73,13,381,163,Bad,26,11,No,Yes 11.19,98,104,0,404,72,Medium,27,18,No,No 5.16,115,60,0,119,114,Bad,38,14,No,No 8.09,132,69,0,123,122,Medium,27,11,No,No 13.14,137,80,10,24,105,Good,61,15,Yes,Yes 8.65,123,76,18,218,120,Medium,29,14,No,Yes 9.43,115,62,11,289,129,Good,56,16,No,Yes 5.53,126,32,8,95,132,Medium,50,17,Yes,Yes 9.32,141,34,16,361,108,Medium,69,10,Yes,Yes 9.62,151,28,8,499,135,Medium,48,10,Yes,Yes 7.36,121,24,0,200,133,Good,73,13,Yes,No 3.89,123,105,0,149,118,Bad,62,16,Yes,Yes 10.31,159,80,0,362,121,Medium,26,18,Yes,No 12.01,136,63,0,160,94,Medium,38,12,Yes,No 4.68,124,46,0,199,135,Medium,52,14,No,No 7.82,124,25,13,87,110,Medium,57,10,Yes,Yes 8.78,130,30,0,391,100,Medium,26,18,Yes,No 10,114,43,0,199,88,Good,57,10,No,Yes 6.9,120,56,20,266,90,Bad,78,18,Yes,Yes 5.04,123,114,0,298,151,Bad,34,16,Yes,No 5.36,111,52,0,12,101,Medium,61,11,Yes,Yes 5.05,125,67,0,86,117,Bad,65,11,Yes,No 9.16,137,105,10,435,156,Good,72,14,Yes,Yes 3.72,139,111,5,310,132,Bad,62,13,Yes,Yes 8.31,133,97,0,70,117,Medium,32,16,Yes,No 5.64,124,24,5,288,122,Medium,57,12,No,Yes 9.58,108,104,23,353,129,Good,37,17,Yes,Yes 7.71,123,81,8,198,81,Bad,80,15,Yes,Yes 4.2,147,40,0,277,144,Medium,73,10,Yes,No 8.67,125,62,14,477,112,Medium,80,13,Yes,Yes 3.47,108,38,0,251,81,Bad,72,14,No,No 5.12,123,36,10,467,100,Bad,74,11,No,Yes 7.67,129,117,8,400,101,Bad,36,10,Yes,Yes 5.71,121,42,4,188,118,Medium,54,15,Yes,Yes 6.37,120,77,15,86,132,Medium,48,18,Yes,Yes 7.77,116,26,6,434,115,Medium,25,17,Yes,Yes 6.95,128,29,5,324,159,Good,31,15,Yes,Yes 5.31,130,35,10,402,129,Bad,39,17,Yes,Yes 9.1,128,93,12,343,112,Good,73,17,No,Yes 5.83,134,82,7,473,112,Bad,51,12,No,Yes 6.53,123,57,0,66,105,Medium,39,11,Yes,No 5.01,159,69,0,438,166,Medium,46,17,Yes,No 11.99,119,26,0,284,89,Good,26,10,Yes,No 4.55,111,56,0,504,110,Medium,62,16,Yes,No 12.98,113,33,0,14,63,Good,38,12,Yes,No 10.04,116,106,8,244,86,Medium,58,12,Yes,Yes 7.22,135,93,2,67,119,Medium,34,11,Yes,Yes 6.67,107,119,11,210,132,Medium,53,11,Yes,Yes 6.93,135,69,14,296,130,Medium,73,15,Yes,Yes 7.8,136,48,12,326,125,Medium,36,16,Yes,Yes 7.22,114,113,2,129,151,Good,40,15,No,Yes 3.42,141,57,13,376,158,Medium,64,18,Yes,Yes 2.86,121,86,10,496,145,Bad,51,10,Yes,Yes 11.19,122,69,7,303,105,Good,45,16,No,Yes 7.74,150,96,0,80,154,Good,61,11,Yes,No 5.36,135,110,0,112,117,Medium,80,16,No,No 6.97,106,46,11,414,96,Bad,79,17,No,No 7.6,146,26,11,261,131,Medium,39,10,Yes,Yes 7.53,117,118,11,429,113,Medium,67,18,No,Yes 6.88,95,44,4,208,72,Bad,44,17,Yes,Yes 6.98,116,40,0,74,97,Medium,76,15,No,No 8.75,143,77,25,448,156,Medium,43,17,Yes,Yes 9.49,107,111,14,400,103,Medium,41,11,No,Yes 6.64,118,70,0,106,89,Bad,39,17,Yes,No 11.82,113,66,16,322,74,Good,76,15,Yes,Yes 11.28,123,84,0,74,89,Good,59,10,Yes,No 
12.66,148,76,3,126,99,Good,60,11,Yes,Yes 4.21,118,35,14,502,137,Medium,79,10,No,Yes 8.21,127,44,13,160,123,Good,63,18,Yes,Yes 3.07,118,83,13,276,104,Bad,75,10,Yes,Yes 10.98,148,63,0,312,130,Good,63,15,Yes,No 9.4,135,40,17,497,96,Medium,54,17,No,Yes 8.57,116,78,1,158,99,Medium,45,11,Yes,Yes 7.41,99,93,0,198,87,Medium,57,16,Yes,Yes 5.28,108,77,13,388,110,Bad,74,14,Yes,Yes 10.01,133,52,16,290,99,Medium,43,11,Yes,Yes 11.93,123,98,12,408,134,Good,29,10,Yes,Yes 8.03,115,29,26,394,132,Medium,33,13,Yes,Yes 4.78,131,32,1,85,133,Medium,48,12,Yes,Yes 5.9,138,92,0,13,120,Bad,61,12,Yes,No 9.24,126,80,19,436,126,Medium,52,10,Yes,Yes 11.18,131,111,13,33,80,Bad,68,18,Yes,Yes 9.53,175,65,29,419,166,Medium,53,12,Yes,Yes 6.15,146,68,12,328,132,Bad,51,14,Yes,Yes 6.8,137,117,5,337,135,Bad,38,10,Yes,Yes 9.33,103,81,3,491,54,Medium,66,13,Yes,No 7.72,133,33,10,333,129,Good,71,14,Yes,Yes 6.39,131,21,8,220,171,Good,29,14,Yes,Yes 15.63,122,36,5,369,72,Good,35,10,Yes,Yes 6.41,142,30,0,472,136,Good,80,15,No,No 10.08,116,72,10,456,130,Good,41,14,No,Yes 6.97,127,45,19,459,129,Medium,57,11,No,Yes 5.86,136,70,12,171,152,Medium,44,18,Yes,Yes 7.52,123,39,5,499,98,Medium,34,15,Yes,No 9.16,140,50,10,300,139,Good,60,15,Yes,Yes 10.36,107,105,18,428,103,Medium,34,12,Yes,Yes 2.66,136,65,4,133,150,Bad,53,13,Yes,Yes 11.7,144,69,11,131,104,Medium,47,11,Yes,Yes 4.69,133,30,0,152,122,Medium,53,17,Yes,No 6.23,112,38,17,316,104,Medium,80,16,Yes,Yes 3.15,117,66,1,65,111,Bad,55,11,Yes,Yes 11.27,100,54,9,433,89,Good,45,12,Yes,Yes 4.99,122,59,0,501,112,Bad,32,14,No,No 10.1,135,63,15,213,134,Medium,32,10,Yes,Yes 5.74,106,33,20,354,104,Medium,61,12,Yes,Yes 5.87,136,60,7,303,147,Medium,41,10,Yes,Yes 7.63,93,117,9,489,83,Bad,42,13,Yes,Yes 6.18,120,70,15,464,110,Medium,72,15,Yes,Yes 5.17,138,35,6,60,143,Bad,28,18,Yes,No 8.61,130,38,0,283,102,Medium,80,15,Yes,No 5.97,112,24,0,164,101,Medium,45,11,Yes,No 11.54,134,44,4,219,126,Good,44,15,Yes,Yes 7.5,140,29,0,105,91,Bad,43,16,Yes,No 7.38,98,120,0,268,93,Medium,72,10,No,No 7.81,137,102,13,422,118,Medium,71,10,No,Yes 5.99,117,42,10,371,121,Bad,26,14,Yes,Yes 8.43,138,80,0,108,126,Good,70,13,No,Yes 4.81,121,68,0,279,149,Good,79,12,Yes,No 8.97,132,107,0,144,125,Medium,33,13,No,No 6.88,96,39,0,161,112,Good,27,14,No,No 12.57,132,102,20,459,107,Good,49,11,Yes,Yes 9.32,134,27,18,467,96,Medium,49,14,No,Yes 8.64,111,101,17,266,91,Medium,63,17,No,Yes 10.44,124,115,16,458,105,Medium,62,16,No,Yes 13.44,133,103,14,288,122,Good,61,17,Yes,Yes 9.45,107,67,12,430,92,Medium,35,12,No,Yes 5.3,133,31,1,80,145,Medium,42,18,Yes,Yes 7.02,130,100,0,306,146,Good,42,11,Yes,No 3.58,142,109,0,111,164,Good,72,12,Yes,No 13.36,103,73,3,276,72,Medium,34,15,Yes,Yes 4.17,123,96,10,71,118,Bad,69,11,Yes,Yes 3.13,130,62,11,396,130,Bad,66,14,Yes,Yes 8.77,118,86,7,265,114,Good,52,15,No,Yes 8.68,131,25,10,183,104,Medium,56,15,No,Yes 5.25,131,55,0,26,110,Bad,79,12,Yes,Yes 10.26,111,75,1,377,108,Good,25,12,Yes,No 10.5,122,21,16,488,131,Good,30,14,Yes,Yes 6.53,154,30,0,122,162,Medium,57,17,No,No 5.98,124,56,11,447,134,Medium,53,12,No,Yes 14.37,95,106,0,256,53,Good,52,17,Yes,No 10.71,109,22,10,348,79,Good,74,14,No,Yes 10.26,135,100,22,463,122,Medium,36,14,Yes,Yes 7.68,126,41,22,403,119,Bad,42,12,Yes,Yes 9.08,152,81,0,191,126,Medium,54,16,Yes,No 7.8,121,50,0,508,98,Medium,65,11,No,No 5.58,137,71,0,402,116,Medium,78,17,Yes,No 9.44,131,47,7,90,118,Medium,47,12,Yes,Yes 7.9,132,46,4,206,124,Medium,73,11,Yes,No 16.27,141,60,19,319,92,Good,44,11,Yes,Yes 6.81,132,61,0,263,125,Medium,41,12,No,No 6.11,133,88,3,105,119,Medium,79,12,Yes,Yes 
5.81,125,111,0,404,107,Bad,54,15,Yes,No 9.64,106,64,10,17,89,Medium,68,17,Yes,Yes 3.9,124,65,21,496,151,Bad,77,13,Yes,Yes 4.95,121,28,19,315,121,Medium,66,14,Yes,Yes 9.35,98,117,0,76,68,Medium,63,10,Yes,No 12.85,123,37,15,348,112,Good,28,12,Yes,Yes 5.87,131,73,13,455,132,Medium,62,17,Yes,Yes 5.32,152,116,0,170,160,Medium,39,16,Yes,No 8.67,142,73,14,238,115,Medium,73,14,No,Yes 8.14,135,89,11,245,78,Bad,79,16,Yes,Yes 8.44,128,42,8,328,107,Medium,35,12,Yes,Yes 5.47,108,75,9,61,111,Medium,67,12,Yes,Yes 6.1,153,63,0,49,124,Bad,56,16,Yes,No 4.53,129,42,13,315,130,Bad,34,13,Yes,Yes 5.57,109,51,10,26,120,Medium,30,17,No,Yes 5.35,130,58,19,366,139,Bad,33,16,Yes,Yes 12.57,138,108,17,203,128,Good,33,14,Yes,Yes 6.14,139,23,3,37,120,Medium,55,11,No,Yes 7.41,162,26,12,368,159,Medium,40,18,Yes,Yes 5.94,100,79,7,284,95,Bad,50,12,Yes,Yes 9.71,134,37,0,27,120,Good,49,16,Yes,Yes -------------------------------------------------------------------------------- /data/Khan.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/data/Khan.rda -------------------------------------------------------------------------------- /data/NCI60.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/data/NCI60.rda -------------------------------------------------------------------------------- /data/NCI60_labs.csv: -------------------------------------------------------------------------------- 1 | "x" 2 | "CNS" 3 | "CNS" 4 | "CNS" 5 | "RENAL" 6 | "BREAST" 7 | "CNS" 8 | "CNS" 9 | "BREAST" 10 | "NSCLC" 11 | "NSCLC" 12 | "RENAL" 13 | "RENAL" 14 | "RENAL" 15 | "RENAL" 16 | "RENAL" 17 | "RENAL" 18 | "RENAL" 19 | "BREAST" 20 | "NSCLC" 21 | "RENAL" 22 | "UNKNOWN" 23 | "OVARIAN" 24 | "MELANOMA" 25 | "PROSTATE" 26 | "OVARIAN" 27 | "OVARIAN" 28 | "OVARIAN" 29 | "OVARIAN" 30 | "OVARIAN" 31 | "PROSTATE" 32 | "NSCLC" 33 | "NSCLC" 34 | "NSCLC" 35 | "LEUKEMIA" 36 | "K562B-repro" 37 | "K562A-repro" 38 | "LEUKEMIA" 39 | "LEUKEMIA" 40 | "LEUKEMIA" 41 | "LEUKEMIA" 42 | "LEUKEMIA" 43 | "COLON" 44 | "COLON" 45 | "COLON" 46 | "COLON" 47 | "COLON" 48 | "COLON" 49 | "COLON" 50 | "MCF7A-repro" 51 | "BREAST" 52 | "MCF7D-repro" 53 | "BREAST" 54 | "NSCLC" 55 | "NSCLC" 56 | "NSCLC" 57 | "MELANOMA" 58 | "BREAST" 59 | "BREAST" 60 | "MELANOMA" 61 | "MELANOMA" 62 | "MELANOMA" 63 | "MELANOMA" 64 | "MELANOMA" 65 | "MELANOMA" 66 | -------------------------------------------------------------------------------- /data/Portfolio.csv: -------------------------------------------------------------------------------- 1 | "X","Y" 2 | -0.895250889141557,-0.234923525765402 3 | -1.5624543274753,-0.885175993044695 4 | -0.417089883126492,0.271888018049829 5 | 1.04435572526951,-0.734197504067649 6 | -0.315568406681027,0.841983429961188 7 | -1.73712384902476,-2.03719104074984 8 | 1.96641315717111,1.45295666192369 9 | 2.1528678980109,-0.434138628179502 10 | -0.0812080267602958,1.45080850218963 11 | -0.891781794029037,0.821016234539977 12 | -0.293201702010266,-1.04239112183501 13 | 0.50577917106943,0.608477825846609 14 | 0.52675125409276,-0.222493343282789 15 | 1.06646932095091,1.2313566752569 16 | 0.294015895063748,0.628589480036184 17 | 0.0425492997633765,-1.26757361755317 18 | 1.83096958062302,-0.572751605498511 19 | -0.32693749887808,-0.487472465045569 20 | 0.521480415807099,2.56598528732423 21 | 
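Portfolio.csv above holds the simulated two-asset returns that ISLR uses to illustrate the bootstrap. As a hedged sketch (not code from the notebooks; the `alpha` helper name is mine), the statistic of interest is the variance-minimizing allocation alpha = (var(Y) - cov(X, Y)) / (var(X) + var(Y) - 2 * cov(X, Y)):

```python
import numpy as np
import pandas as pd

portfolio = pd.read_csv('data/Portfolio.csv')  # columns "X" and "Y"

def alpha(X, Y):
    # Weight a that minimizes Var(a*X + (1 - a)*Y) for two risky assets.
    cov = np.cov(X, Y)  # 2x2 sample covariance matrix
    return (cov[1, 1] - cov[0, 1]) / (cov[0, 0] + cov[1, 1] - 2 * cov[0, 1])

print(alpha(portfolio.X, portfolio.Y))
```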
--------------------------------------------------------------------------------
/data/Publication.csv:
"posres","multi","clinend","mech","sampsize","budget","impact","time","status"
0,0,1,"R01",39876,8.0169405,44.016,11.20328542,1
[... 243 further rows; 244 clinical trials with time-to-publication outcomes, used in the Chapter 11 survival-analysis lab ...]
0,0,0,"R01",501,3.539633,4.84,4.468172485,1 215 | 1,0,0,"R44",300,0.84868,0,22.24229979,0 216 | 1,0,0,"R01",415,5.702727,53.298,9.429158111,1 217 | 1,0,0,"R44",308,1.367797,0,22.86652977,0 218 | 0,0,0,"R01",26,0.638626,0,4.993839836,0 219 | 0,1,0,"U01",92,9.422223,30.011,4.303901437,1 220 | 1,0,0,"U01",390,5.381624,53.298,9.067761807,1 221 | 1,0,0,"R21",51,0.4125,0,22.99794661,0 222 | 0,0,0,"R21",22,0.390914,0,8.180698152,0 223 | 1,0,0,"R01",18,1.208598,6.606,38.11088296,1 224 | 1,0,0,"R01",266,2.26775,4.844,41.0349076,1 225 | 1,0,0,"R01",21,1.576176,14.739,21.51950719,1 226 | 1,0,0,"R21",120,0.362247,0,18.4312115,0 227 | 0,0,0,"K23",24,0.67217,0,28.94455852,0 228 | 1,0,1,"U54",31,5.649036,38.278,7.983572895,1 229 | 0,0,0,"R01",132,2.481322,0,2.98973306,0 230 | 0,0,0,"R01",55,1.41009,0,4.993839836,0 231 | 1,0,0,"R21",30,0.407,0,19.97535934,0 232 | 0,0,0,"R01",60,1.591641,0,6.012320329,0 233 | 1,0,0,"RC2",150,4.136888,0,30.39014374,0 234 | 1,0,0,"R44",1800,0.988677,3.299,35.02258727,1 235 | 1,0,0,"R21",181,0.37509,0,4.008213552,0 236 | 0,0,0,"R01",66,1.15875,0,5.979466119,0 237 | 0,0,0,"P50",100,2.531065,0,11.86036961,0 238 | 1,0,0,"K23",99,0.601928,0,13.04312115,0 239 | 1,0,0,"R01",247,1.3054805,0,9.691991786,1 240 | 1,0,0,"R01",247,1.3054805,0,16.45995893,1 241 | 0,0,0,"R01",4105,2.703653,5.355,65.01848049,1 242 | 1,0,0,"R44",181,1.117084,0,66.98973306,0 243 | 0,0,0,"K23",104,0.472321,0,9.987679671,0 244 | 0,0,0,"R21",69,0.40471,0,21.97946612,0 245 | 1,0,0,"R01",1699,2.957751,0,4.632443532,0 246 | -------------------------------------------------------------------------------- /data/Readme_datalist: -------------------------------------------------------------------------------- 1 | Auto 2 | Bikeshare 3 | Caravan 4 | Carseats 5 | College 6 | Default 7 | Hitters 8 | Khan (json file) 9 | NCI60 (json file) 10 | OJ 11 | Portfolio 12 | Smarket 13 | Wage 14 | Weekly 15 | -------------------------------------------------------------------------------- /data/USArrests.csv: -------------------------------------------------------------------------------- 1 | "","Murder","Assault","UrbanPop","Rape" 2 | "Alabama",13.2,236,58,21.2 3 | "Alaska",10,263,48,44.5 4 | "Arizona",8.1,294,80,31 5 | "Arkansas",8.8,190,50,19.5 6 | "California",9,276,91,40.6 7 | "Colorado",7.9,204,78,38.7 8 | "Connecticut",3.3,110,77,11.1 9 | "Delaware",5.9,238,72,15.8 10 | "Florida",15.4,335,80,31.9 11 | "Georgia",17.4,211,60,25.8 12 | "Hawaii",5.3,46,83,20.2 13 | "Idaho",2.6,120,54,14.2 14 | "Illinois",10.4,249,83,24 15 | "Indiana",7.2,113,65,21 16 | "Iowa",2.2,56,57,11.3 17 | "Kansas",6,115,66,18 18 | "Kentucky",9.7,109,52,16.3 19 | "Louisiana",15.4,249,66,22.2 20 | "Maine",2.1,83,51,7.8 21 | "Maryland",11.3,300,67,27.8 22 | "Massachusetts",4.4,149,85,16.3 23 | "Michigan",12.1,255,74,35.1 24 | "Minnesota",2.7,72,66,14.9 25 | "Mississippi",16.1,259,44,17.1 26 | "Missouri",9,178,70,28.2 27 | "Montana",6,109,53,16.4 28 | "Nebraska",4.3,102,62,16.5 29 | "Nevada",12.2,252,81,46 30 | "New Hampshire",2.1,57,56,9.5 31 | "New Jersey",7.4,159,89,18.8 32 | "New Mexico",11.4,285,70,32.1 33 | "New York",11.1,254,86,26.1 34 | "North Carolina",13,337,45,16.1 35 | "North Dakota",0.8,45,44,7.3 36 | "Ohio",7.3,120,75,21.4 37 | "Oklahoma",6.6,151,68,20 38 | "Oregon",4.9,159,67,29.3 39 | "Pennsylvania",6.3,106,72,14.9 40 | "Rhode Island",3.4,174,87,8.3 41 | "South Carolina",14.4,279,48,22.5 42 | "South Dakota",3.8,86,45,12.8 43 | "Tennessee",13.2,188,59,26.9 44 | "Texas",12.7,201,80,25.5 45 | "Utah",3.2,120,80,22.9 46 | "Vermont",2.2,48,32,11.2 
47 | "Virginia",8.5,156,63,20.7 48 | "Washington",4,145,73,26.2 49 | "West Virginia",5.7,81,39,9.3 50 | "Wisconsin",2.6,53,66,10.8 51 | "Wyoming",6.8,161,60,15.6 52 | -------------------------------------------------------------------------------- /data/dog_test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qx0731/Sharing_ISL_python/6b588e773be9984d0a4b91172fcce803c39fe0d6/data/dog_test.jpg --------------------------------------------------------------------------------