├── README.md └── Session1 └── Tuesday ├── data └── IntroStat_demo.csv └── IntroStat.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # LSST-DSFP-Resources 2 | For computing/programming/statistical resources compiled by LSSTC Data Science Fellows 3 | -------------------------------------------------------------------------------- /Session1/Tuesday/data/IntroStat_demo.csv: -------------------------------------------------------------------------------- 1 | time,mag.hom,mag.outlier,mag.t,mag.het,mag.het.error,mag5,mag5.error 2 | 0,18.5589442683513,18.5306774114789,18.5605273552103,18.5674472421206,0.0673447662033141,18.5460560136707,0.00976609719927976 3 | 0.17615778844447,18.5311784194935,18.5515604297523,18.5499055533718,18.5473152136787,0.0171485664555803,18.5426048480591,0.00770946746896302 4 | 0.250164970666731,18.5536692551417,18.4680954415675,18.5446177769689,18.5641906767379,0.0206004055216908,18.5362693395877,0.00977726355227015 5 | 1.42698258962969,18.5394751614305,18.5373682418528,18.5608179688024,18.5541623997199,0.0743387433001772,18.5457818773318,0.0087049168517297 6 | 1.67714756266668,18.5754423277417,18.7905210141074,18.5561264918157,18.5147653876404,0.0274689674843103,18.5536441886995,0.0100835735266972 7 | 1.92731250014822,18.5688096791856,18.5484733020366,18.5307572973929,18.5511344995731,0.00128030302003026,18.5535616090684,0.00862268498232961 8 | 2.00131967288894,18.5306839699509,18.5738948687948,18.5923607846999,18.5463770669773,0.0385863671079278,18.5435323687151,0.00353901734083689 9 | 2.17747751822225,18.5404357191549,18.5838537499684,18.5550741257684,18.5096817311331,0.0311729908455163,18.547784233619,0.00730273392960781 10 | 2.25148467911117,18.5336487465924,18.5505136689393,18.5487974225727,18.6204548264426,0.030169404274784,18.5423081349491,0.00586801461150437 11 | 
2.42764244385194,18.5559246236964,18.5562794305921,18.5379523235324,18.5621240466992,0.00825049160048366,18.5412702481465,0.0121236431161133 12 | 2.50164955970376,18.5501084203234,18.5557037579903,18.5529945336716,18.547843601537,0.0507529947673902,18.5452619740364,0.00735818068071379 13 | 2.67780726755558,18.5645197288917,18.5549543912184,18.5618490369254,18.5460361520644,0.0809831742430106,18.5600811582278,0.00613799327651888 14 | 2.75181445214821,18.5635795082143,18.548374238575,18.5531600560735,18.5210358782911,0.0934805805096403,18.5447121835436,0.00681950987245638 15 | 2.92797224059268,18.5589321160663,18.5415504934681,18.5500181376146,18.5564302249123,0.0121413861168548,18.5482663380215,0.00402238791936502 16 | 3.00197940148155,18.554400284808,18.5433372839744,18.5531433811103,18.5458148129442,0.0128612750209868,18.5464045389296,0.00496515945617352 17 | 3.42830205866676,18.546831141818,18.5392489410442,18.5445689926193,18.4163400443553,0.0893393303267658,18.5465421911513,0.00825006995218595 18 | 3.50230923140748,18.5557632083347,18.7706070325729,18.5419814301352,18.5956025884245,0.0608045848319307,18.5438209692729,0.0087205281991559 19 | 3.678466962963,18.544265974006,18.5626371757059,18.4650401520377,18.5828734311861,0.0547708112280816,18.552398885318,0.0064182670261062 20 | 3.75247411200007,18.5351829265835,18.522141743808,18.5717250651362,18.4845672560639,0.0829137690132484,18.5552325488267,0.00493270036606317 21 | 3.92863187674078,18.5516688603311,18.567852081761,18.5400049667626,18.5240258816669,0.0423200894379988,18.5487543275974,0.0065567976860378 22 | 4.00263900444452,18.5285122655126,19.0505896825117,18.5576683009049,18.5594464913454,0.0772306305123493,18.5586718452476,0.00875526584287117 23 | 4.17879676918523,18.5557929338106,18.5569551618032,18.5493963939202,18.5326367831398,0.0280512649798766,18.5607002282244,0.00557509450438661 24 | 
4.25280389451854,18.5529326279236,18.5586263953885,18.5647347420499,18.5722450389334,0.0787957694847137,18.5468448441239,0.00775270203974073 25 | 4.42896166162967,18.5466721253703,18.5662731977358,18.5318617352833,18.5691142289703,0.0125670915469527,18.5433680008302,0.00947502961077787 26 | 4.50296879881489,18.5455843284892,18.5457739476833,18.543658236517,18.5550040797419,0.0425309023354203,18.5554296288972,0.00626346095601517 27 | 4.67912653037035,18.5409110697306,18.5496056417444,18.5510661058619,18.561286360916,0.0702062232652679,18.5594793550286,0.00723944009740642 28 | 4.75313367940748,18.5519659351588,18.5381468484592,18.5608269244419,18.5280668947586,0.0233837144449353,18.5515065815533,0.0066153948037125 29 | 4.92929143466671,18.5706305083806,18.537476364426,18.5156572585304,18.6086035990201,0.0518757134443149,18.5434085528152,0.00928274239492134 30 | 5.00329857185187,18.5461062937116,18.5067553864728,18.551052788274,18.4834255226187,0.0612845422700048,18.5504014218671,0.00756243632938453 31 | 5.17945630340745,18.5590888928021,18.5700617572974,18.5435521208434,18.5562747021189,0.0972205097321421,18.5539690640214,0.00450351228300857 32 | 5.25346344296304,18.5565274652536,18.5785041773708,18.5370502723415,18.4631089748217,0.0927823832491413,18.5480576832403,0.0100016004981465 33 | 5.42962116266671,18.5388473006871,18.5187224700964,18.5514549479656,18.4588274189247,0.0710915053263307,18.54330290055,0.00707806461260917 34 | 5.50362828800007,18.5375868547063,18.5524568672701,18.5483980877705,18.5418905128437,0.0468997861025855,18.5547896497908,0.00774302896781335 35 | 5.67978603140745,18.5536257470751,18.5593359659178,18.553944265792,18.4274408901564,0.0653245693072677,18.5717981769911,0.00722942938671263 36 | 5.75379315911113,18.5427818423028,18.5531113998726,18.5775096647854,18.5840574022885,0.0320707945851609,18.5588027590485,0.0050738080157706 37 | 
5.92995090014819,18.5555780126327,18.5742190074774,18.5409538297753,18.5486975066292,0.0774393061175943,18.5605463263609,0.00969025293870255 38 | 6.00395802785187,18.5565287734809,18.5775463590886,18.5617849366835,18.5708663214191,0.0920553196454421,18.5599068939509,0.0109402676459909 39 | 6.25412282785192,18.5559035686907,18.5630972859709,18.5308566700525,18.5157921009237,0.0548952181823552,18.5604036521072,0.00453180586065513 40 | 6.43028060681485,18.5661313710837,18.5591448931475,18.5589808141151,18.5110368266975,0.0858319909777492,18.5535933735478,0.00978519182482998 41 | 6.50428772029636,18.5477366775425,18.523965847213,18.5372351265929,18.545297330534,0.0821733459597453,18.5631851541897,0.00812575237508486 42 | 6.68044546370379,18.5708228054324,18.5590228343367,18.542158981161,18.5888687475159,0.0922145419754088,18.5644907342443,0.0097660092222279 43 | 6.75445256770377,18.5311585305758,18.5601414916841,18.5885692358754,18.5677957938898,0.0548793095164001,18.5533484567386,0.0097874431665338 44 | 6.93061029925934,18.5362721280303,18.5565164042252,18.5362535880405,18.5333127716303,0.0767854744335636,18.5636281678286,0.00520761918669793 45 | 7.00461737955561,18.5543355018981,18.5199741555534,18.5348857719401,18.5118432333631,0.0414669126272202,18.5608657030309,0.00839216899655168 46 | 7.18077512296298,18.5602723554936,18.5575914312487,18.5513116331508,18.551788246578,0.00213281528558582,18.5643997597013,0.00331454630322723 47 | 7.25478221511116,18.5257816398252,18.5656521894532,18.5227292410936,18.5714566674585,0.0457111209398136,18.5544774937076,0.0120271840375375 48 | 7.43093994666668,18.5547447326351,18.5195334326354,18.5349179931093,18.5820713647261,0.0336836270755157,18.5563353490799,0.00750787351010734 49 | 7.50494705066671,18.5604662283269,18.5671684167028,18.5564133877834,18.5696808855744,0.0616545808268711,18.5583136913205,0.00948953421018131 50 | 
7.68110477274081,18.5694418236949,18.5112242984846,18.5487852701645,18.5810835591695,0.0684731747955084,18.5548631599234,0.00623661545157984 51 | 7.75511186488893,18.5481387618937,18.5263559304141,18.5388436463712,18.4966061176472,0.0361196914687753,18.5650441165026,0.0100793221832902 52 | 7.93126960592599,18.5344942778896,18.5553861340442,18.5466361455307,18.6213120273994,0.0985294625163078,18.5630625332219,0.0092009796337932 53 | 8.00527670044448,18.5562077086716,18.5472065305838,18.5433572258355,18.5970840277694,0.0458260116400197,18.5618095388714,0.00751794259404281 54 | 8.18143443200006,18.5472361837207,18.5449905470038,18.5722239125806,18.5903195577598,0.0678766368655488,18.5698626438528,0.00741529829594704 55 | 8.25544151229633,18.5456263186827,18.5470748575154,18.5679674152509,18.5550136968236,0.00645137266255915,18.5690106783907,0.00692850201038873 56 | 8.43159921066672,18.5428732685636,18.553915380597,18.5084077653486,18.5386932314377,0.0213110986864194,18.5652629695566,0.00566129593090012 57 | 8.50560631229632,18.5489438676115,18.5282370595513,18.551243556647,18.5552308890188,0.0135498776100576,18.5692212643741,0.0191769416240384 58 | 8.68176403437042,18.5649901953502,18.530235760623,18.5379110186313,18.4941281286038,0.0885351747972891,18.5689534312037,0.00544968844438369 59 | 8.75577110281483,18.554510866556,18.5727386126227,18.558164488833,18.6862964660561,0.081282608024776,18.5611947263895,0.00856172176535711 60 | 9.18209362488892,18.5432245935312,18.5550370293737,18.5421551370975,18.5540778138426,0.0527859937399626,18.5511194223028,0.0102602548596579 61 | 9.25610069333334,18.5475828322227,18.534449768215,18.5690581822654,18.5209402927344,0.0233320454601198,18.5694870799201,0.0102262135819955 62 | 9.43225841540743,18.548173408636,18.563840119393,18.5480218995715,18.5350612382339,0.0703313875943422,18.5580685195013,0.0152410020202681 63 | 
9.50626550755561,18.5345295794331,18.5296641514354,18.5806037457966,18.5491152327246,0.00295040390919894,18.5769683123307,0.00471203349230513 64 | 9.68242319407409,18.5807764882747,18.5641730508058,18.5490765705939,18.5354269120785,0.0227896849624813,18.5620501633528,0.0148169371188335 65 | 9.75643026251856,18.5292425959111,18.5306067627064,18.540530926466,18.5800183002258,0.0738576146308333,18.5644408520759,0.00867916245522732 66 | 9.93258798222223,18.5508722992717,18.5545087175195,18.5469753426537,18.5352170998336,0.0250213630497456,18.573991727579,0.00533889049371997 67 | 10.006595053037,18.5408493639313,18.5289990734953,18.5624253252434,18.6270040754068,0.0639115944504738,18.5690024417712,0.00517361114636936 68 | 10.1827527608889,18.5501236358738,18.5432547797487,18.5331005198468,18.5524171191361,0.0406069706194103,18.5716193889192,0.00465990579290407 69 | 10.5069245985185,18.5409586616097,18.5319886869186,18.5438105928918,18.5628621968699,0.0116179539356381,18.5885800765758,0.011534692324484 70 | 10.6830822850371,18.5487867667802,18.5659105309089,18.6033156238295,18.5676869873518,0.043744198884815,18.570469858646,0.0104915347826452 71 | 10.7570893416297,18.5376891141176,18.5716775125042,18.5430999648969,18.4668218661464,0.0453147494699806,18.5840090914808,0.0137403462217677 72 | 10.9332470518519,18.5536042249926,18.5546561015614,18.5349260282157,18.6301409249524,0.0721810968825594,18.5641233076457,0.00587263297435276 73 | 11.0072541084445,18.5479652432903,18.5535084329554,18.5429766128583,18.5682853060466,0.0879812199622393,18.5663568587749,0.00984477441662157 74 | 11.5075835994075,18.5581601224785,18.5583863069206,18.5489521111866,18.462254659705,0.0873479148373008,18.5815633683939,0.00385140113468823 75 | 11.7577483425186,18.5468841458741,18.5184626994895,18.5391532518418,18.585988151904,0.0436564211267978,18.5763595374087,0.00325572038390436 76 | -------------------------------------------------------------------------------- 
/Session1/Tuesday/IntroStat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import scipy\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "import math\n", 15 | "import statsmodels.api as sm\n", 16 | "from matplotlib import pyplot as plt" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# 1. Distributions: \n", 24 | "how to get to know better the distribution of the data, identify various issues, and check fits?\n", 25 | "\n", 26 | "## 1.1. Probability distributions\n", 27 | "\n", 28 | "Ex 1. QQ-PLOT BASICS. \n", 29 | "\n", 30 | "First, simulate 9 times 200 standard random normal variables, and inspect the variations in the qq-plots.\n", 31 | "Take a look at the help of qqplots in the statsmodels.api module of Python using sm.qqplot? if necessary." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "fig = plt.figure(1)\n", 43 | "for ii in range(1, 10):\n", 44 | " rv = scipy.stats.norm.rvs(0, 1, size = 200)\n", 45 | " ax = fig.add_subplot(3,3,ii)\n", 46 | " sm.qqplot(rv, line = 's', ax = ax)\n", 47 | "plt.show()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Ex 2. LINEARITY OF THE NORMAL DISTRIBUTION. \n", 55 | "\n", 56 | "Repeat the exercise using random normal variables with mean 3 and standard deviation 0.4, using now 3 times each of 4,20 and 200.\n", 57 | "\n", 58 | "The automatically added line on the qq-plot is estimated by taking the empirical mean and square-root empirical variance of the sample; these are the simplest estimators of the parameters of a normal sample. 
For a normal distribution, if X ~ N(0,1), then the variable Y = m + sX ~ N(m, s^2). Therefore, if you plotted your sample Y against standard normal quantiles, a line using the true mean m as intercept and the true standard deviation s as slope should represent the truth, and probably should fit the sample well. \n", 59 | "\n", 60 | "Add a line with intercept 3 and slope 0.4 (representing the n = Infty perfect sample). Are the empirical mean and variance good estimators of the population mean and variance? " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "x = np.arange(-3.,3.,0.01)\n", 72 | "y = 3. + 0.4*x\n", 73 | "nsample = [4,4,4,20,20,20,200,200,200]\n", 74 | "fig = plt.figure(1)\n", 75 | "for ii in range(0,9):\n", 76 | " rv = scipy.stats.norm.rvs(3., 0.4, size = nsample[ii])\n", 77 | " ax = fig.add_subplot(3,3,ii+1)\n", 78 | " sm.qqplot(rv, line = 's', ax = ax)\n", 79 | " ax.plot(x, y, color='c')\n", 80 | "plt.show()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Ex 3. RECOGNIZE DISTRIBUTIONAL FEATURES ON THE QQ-PLOTS.\n", 88 | "\n", 89 | "The qq-plot provides a powerful check of distributional assumptions. Use the template below to see the following distributions: Cauchy (heavy-tailed), chi-squared (much used), beta(0.5,2) (restricted to the interval [0,1]; contains the uniform distribution as beta(1,1)) and two Poisson distributions, with mean 3 and 250. \n", 90 | "\n", 91 | "This time, we use the option fit = True in sm.qqplot, so that the sample is standardized by its mean and standard error before plotting. \n", 92 | "\n", 93 | "Replace the random distribution in the codes, both for the comparison with the normal distribution and for the qq-plot (check the parameters at http://docs.scipy.org/doc/scipy/reference/stats.html). 
Compare the tail behavior on the plot of the density and on the qq-plot. " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "x = np.linspace(scipy.stats.cauchy.ppf(0.005), scipy.stats.cauchy.ppf(0.995), 200)\n", 105 | "#x = np.arange(scipy.stats.poisson.ppf(0.005, mu=250), scipy.stats.poisson.ppf(0.995, mu=250), 1)\n", 106 | "\n", 107 | "rv = scipy.stats.cauchy.rvs(size = 400)\n", 108 | "\n", 109 | "fig = plt.figure(3)\n", 110 | "ax1 = fig.add_subplot(121)\n", 111 | "plt.hist(rv, 20, normed=1, facecolor='y', alpha=0.75)\n", 112 | "ax1.plot(x, scipy.stats.cauchy.pdf(x),'b-', lw=2)\n", 113 | "#ax1.vlines(x, 0, scipy.stats.poisson.pmf(x, mu=250), colors='b', lw=5, alpha=0.2)\n", 114 | "ax2 = fig.add_subplot(122)\n", 115 | "sm.qqplot(rv, line = 's', ax = ax2)\n", 116 | "\n", 117 | "plt.show()\n", 118 | "\n", 119 | "# To see the extremes of the random variates:\n", 120 | "#rv.min()\n", 121 | "#rv.max()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 1.2. Estimating the parameters and an (incomplete) collection of frequent problems\n", 129 | "\n", 130 | "We will walk through some of the simplest complications that can affect real data, and see how to recognize them by the means of the quantile-quantile plot.\n", 131 | "\n", 132 | "Ex. 4. NUISANCES IN THE DATA\n", 133 | "\n", 134 | "(starter's guide for dataframes in pandas: http://pandas.pydata.org/pandas-docs/stable/dsintro.html, as well as http://www.scipy-lectures.org/packages/statistics/index.html)\n", 135 | "\n", 136 | "4.1. 
Outliers" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "dfr = pd.read_csv(\"./data/IntroStat_demo.csv\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "fig = plt.figure(1)\n", 159 | "\n", 160 | "plt.subplot(121)\n", 161 | "n, bins, patches = plt.hist(dfr['mag.outlier'], 12, normed=1, facecolor='y', alpha=0.75)\n", 162 | "ax = fig.add_subplot(122)\n", 163 | "sm.qqplot(dfr['mag.outlier'], line = 's', ax = ax)\n", 164 | "\n", 165 | "plt.show()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Replace the line by one defined using the quantile estimators (a line passing through the first and third quartiles)." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "fig = plt.figure(1)\n", 184 | "\n", 185 | "plt.subplot(121)\n", 186 | "plt.hist(dfr['mag.outlier'], 12, normed=1, facecolor='y', alpha=0.75)\n", 187 | "ax = fig.add_subplot(122)\n", 188 | "sm.qqplot(dfr['mag.outlier'], line = 'q', ax = ax)\n", 189 | "\n", 190 | "plt.show()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "Some robust (regression) models, M-estimators: http://statsmodels.sourceforge.net/stable/rlm.html.\n", 198 | "\n", 199 | "4.2. Heteroscedasticity\n", 200 | "\n", 201 | "Use now the 'mag.het' column in place of 'mag.outlier'. Does the quantile-based or the moment-estimated line work?" 
202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "fig = plt.figure(1)\n", 213 | "\n", 214 | "plt.subplot(121)\n", 215 | "plt.hist(dfr['mag.het'], 12, normed=1, facecolor='y', alpha=0.75)\n", 216 | "ax = fig.add_subplot(122)\n", 217 | "sm.qqplot(dfr['mag.het'], line = 'q', ax = ax)\n", 218 | "\n", 219 | "plt.show()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "The sample is, in fact, heteroscedastic, and the (estimated) standard errors are given in the column 'mag.het.error'.\n", 227 | "What can you do to check about the errors if you are in doubt about them? How can you check the normality of the sample?" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "def std_fn(x, mean, std):\n", 239 | " res = (x-mean) / std\n", 240 | " return res" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "The above standardization should lead to a homoscedastic standard normal sample (by the linearity of the normal distribution). Its QQ-plot should be now close to a line with slope 1." 
248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "\n", 259 | "w = dfr['mag.het.error']**(-2)\n", 260 | "m = np.average(dfr['mag.het'], weights = w)\n", 261 | "std_het = std_fn(x = dfr['mag.het'], mean = m, std = dfr['mag.het.error'])\n", 262 | "\n", 263 | "fig = plt.figure(1)\n", 264 | "plt.subplot(121)\n", 265 | "plt.hist(std_het, 12, normed=1, facecolor='grey', alpha=0.75)\n", 266 | "ax = fig.add_subplot(122)\n", 267 | "# The option line = '45' means a line with intercept 0 and slope 1.\n", 268 | "sm.qqplot(std_het, line = '45', ax = ax)\n", 269 | "\n", 270 | "plt.show()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "4.3. Any other unexpected effect\n", 278 | "\n", 279 | "Use now the 'mag5' column in place of 'mag.outlier'. How does the standardization work in this case?" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": false 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "w = dfr['mag5.error']**(-2)\n", 291 | "m = np.average(dfr['mag5'], weights = w)\n", 292 | "std5 = std_fn(x = dfr['mag5'], mean = m, std = dfr['mag5.error'])\n", 293 | "\n", 294 | "fig = plt.figure(1)\n", 295 | "plt.subplot(121)\n", 296 | "plt.hist(std5, 12, normed=1, facecolor='grey', alpha=0.75)\n", 297 | "ax = fig.add_subplot(122)\n", 298 | "sm.qqplot(std5, line = '45', ax = ax)\n", 299 | "\n", 300 | "plt.show()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Use the 'time' column in the dataframe to plot the data." 
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "fig = plt.figure(1)\n", 319 | "\n", 320 | "plt.plot(dfr['time'], dfr['mag5'], 'ro')\n", 321 | "\n", 322 | "plt.xlabel('Time')\n", 323 | "plt.ylabel('Magnitude')\n", 324 | "# this is just to extend a bit the plotting area, so that no points fall exactly on the border:\n", 325 | "mn, mx = sorted(dfr['time'])[::len(dfr['time'])-1]\n", 326 | "plt.xlim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 327 | "\n", 328 | "plt.show()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "# 2. Classical estimation and hypothesis testing\n", 336 | "\n", 337 | "The demo data set for this part is the Wesenheit index of the OGLE-III fundamental-mode and first overtone classical Cepheids. We'll try to estimate their period-luminosity relationship.\n", 338 | "The Wesenheit index is defined as W = I - 1.55(V - I), and its main advantage over using simply the I or V photometry is that it is insensitive to extinction. It is denoted by 'W' among the data columns. Other columns are \n", 339 | "'name', the identifier of the star; \n", 340 | "'RA0' (in decimal hours) and 'Decl0' (in decimal degrees), celestial coordinates; \n", 341 | "'Mode', the mode of the Cepheid ('F' indicates fundamental-mode, '1' indicates first overtone star); \n", 342 | "'Cloud', indicating which Magellanic Cloud the star belongs to; \n", 343 | "'logP1', the base-10 logarithm of the period in days; \n", 344 | "'VI', the colour V-I. \n", 345 | "\n", 346 | "Ex. 5. ORDINARY LEAST SQUARES REGRESSION (= GAUSSIAN MAXIMUM LIKELIHOOD WITH A MEAN DEPENDING ON A COVARIATE)\n", 347 | "\n", 348 | "5.1 MODEL FIT\n", 349 | "\n", 350 | "There are fundamental-mode (FU) and first overtone (FO) Cepheids both from the SMC and the LMC. 
Represent the fundamental and first overtone Cepheids' P-L relationship (W versus logP1) in two separate scatterplots, the LMC and SMC stars with different colours. What do you see? Fit a separate linear regression model to each of the distinct groups (to check the content of the resulting objects 'lmfit_lmc_fu' etc., see with dir(lmfit_lmc_fu) ). How would you decide whether the slopes are the same for stars of the same mode in the two Clouds? " 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "import statsmodels.formula.api as smf" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "cep = pd.read_csv(\"./data/Cepheids.csv\")\n", 373 | "cep[0:10]" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "i_lmc = cep['Cloud'] == \"LMC\"\n", 385 | "i_fu = cep['Mode'] == \"F\"\n", 386 | "\n", 387 | "fig = plt.figure(4)\n", 388 | "ax1 = fig.add_subplot(211)\n", 389 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['W'], 'b*', alpha=0.3, label = 'LMC')\n", 390 | "plt.plot(cep[-i_lmc & i_fu]['logP1'], cep[-i_lmc & i_fu]['W'], 'r*', alpha=0.3, label = 'SMC')\n", 391 | "plt.legend(loc = 'best', numpoints = 1)\n", 392 | "ax2 = fig.add_subplot(212)\n", 393 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['W'], 'b*', alpha=0.3, label = 'LMC')\n", 394 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['W'], 'r*', alpha=0.3, label = 'SMC')\n", 395 | "plt.legend(loc = 'best', numpoints = 1)\n", 396 | "\n", 397 | "plt.show()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": true 405 | }, 406 | 
"outputs": [], 407 | "source": [ 408 | "lmfit_lmc_fu = smf.ols(formula = 'W ~ logP1', data = cep, subset = i_lmc & i_fu).fit()\n", 409 | "lmfit_lmc_fo = smf.ols(formula = 'W ~ logP1', data = cep, subset = i_lmc & -i_fu).fit()\n", 410 | "\n", 411 | "lmfit_smc_fu = smf.ols(formula = 'W ~ logP1', data = cep, subset = -i_lmc & i_fu).fit()\n", 412 | "lmfit_smc_fo = smf.ols(formula = 'W ~ logP1', data = cep, subset = -i_lmc & -i_fu).fit()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": { 419 | "collapsed": false 420 | }, 421 | "outputs": [], 422 | "source": [ 423 | "print lmfit_smc_fo.summary()" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "collapsed": false 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "cep['resid0'] = np.zeros(cep.shape[0])\n", 435 | "cep['fitted0'] = np.zeros(cep.shape[0])\n", 436 | "cep.loc[(i_lmc & i_fu),'resid0'] = lmfit_lmc_fu.resid\n", 437 | "cep.loc[(-i_lmc & i_fu),'resid0'] = lmfit_smc_fu.resid\n", 438 | "cep.loc[(i_lmc & -i_fu),'resid0'] = lmfit_lmc_fo.resid\n", 439 | "cep.loc[(-i_lmc & -i_fu),'resid0'] = lmfit_smc_fo.resid\n", 440 | "cep.loc[(i_lmc & i_fu),'fitted0'] = lmfit_lmc_fu.fittedvalues\n", 441 | "cep.loc[(-i_lmc & i_fu),'fitted0'] = lmfit_smc_fu.fittedvalues\n", 442 | "cep.loc[(i_lmc & -i_fu),'fitted0'] = lmfit_lmc_fo.fittedvalues\n", 443 | "cep.loc[(-i_lmc & -i_fu),'fitted0'] = lmfit_smc_fo.fittedvalues\n", 444 | "\n", 445 | "cep.iloc[0:10]" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "collapsed": true 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "logp_tmp = np.linspace(cep['logP1'].min(), cep['logP1'].max(), 500)\n", 457 | "\n", 458 | "fig = plt.figure(4)\n", 459 | "ax1 = fig.add_subplot(211)\n", 460 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['W'], 'c*', alpha=0.2, label = 'LMC')\n", 461 | "plt.plot(cep[-i_lmc & 
i_fu]['logP1'], cep[-i_lmc & i_fu]['W'], 'r*', alpha=0.2, label = 'SMC')\n", 462 | "plt.plot(logp_tmp, lmfit_lmc_fu.params['Intercept'] + logp_tmp * lmfit_lmc_fu.params['logP1'], 'blue', lw = 1)\n", 463 | "plt.plot(logp_tmp, lmfit_smc_fu.params['Intercept'] + logp_tmp * lmfit_smc_fu.params['logP1'], 'brown', lw = 1)\n", 464 | "plt.legend(loc = 'best', numpoints = 1)\n", 465 | "ax2 = fig.add_subplot(212)\n", 466 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['W'], 'c*', alpha=0.2, label = 'LMC')\n", 467 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['W'], 'r*', alpha=0.2, label = 'SMC')\n", 468 | "plt.legend(loc = 'best', numpoints = 1)\n", 469 | "plt.plot(logp_tmp, lmfit_lmc_fo.params['Intercept'] + logp_tmp * lmfit_lmc_fo.params['logP1'], 'blue', lw = 1)\n", 470 | "plt.plot(logp_tmp, lmfit_smc_fo.params['Intercept'] + logp_tmp * lmfit_smc_fo.params['logP1'], 'brown', lw = 1)\n", 471 | "\n", 472 | "plt.show()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "5.2 MODEL DIAGNOSTICS: QQ-PLOT OF RESIDUALS\n", 480 | "\n", 481 | "Let's start with checking the distributional assumptions of the model: do the residuals admit a normal distribution? Take a look at the output of the four linear models ( lmobject.summary() ). Can you see indications there of non-normality? Make and inspect the QQ-plot of the residuals, separately for the four groups of Cepheids. What do you find? What can be the reason of what you observe?" 
482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "collapsed": true 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "mn, mx = sorted(cep['resid0'])[::len(cep['resid0'])-1]\n", 493 | "\n", 494 | "fig = plt.figure(1)\n", 495 | "plt.subplots_adjust(left=0.07, bottom=0.08, right=0.95, top=0.95, wspace=None, hspace=0.35)\n", 496 | "\n", 497 | "ax1 = fig.add_subplot(221)\n", 498 | "sm.qqplot(cep[i_lmc & i_fu]['resid0'], line = 's', ax = ax1)\n", 499 | "plt.title(\"LMC FU\")\n", 500 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 501 | "ax2 = fig.add_subplot(222)\n", 502 | "sm.qqplot(cep[-i_lmc & i_fu]['resid0'], line = 's', ax = ax2)\n", 503 | "plt.title(\"SMC FU\")\n", 504 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 505 | "ax3 = fig.add_subplot(223)\n", 506 | "sm.qqplot(cep[i_lmc & -i_fu]['resid0'], line = 's', ax = ax3)\n", 507 | "plt.title(\"LMC FO\")\n", 508 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 509 | "ax4 = fig.add_subplot(224)\n", 510 | "sm.qqplot(cep[-i_lmc & -i_fu]['resid0'], line = 's', ax = ax4)\n", 511 | "plt.title(\"SMC FO\")\n", 512 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 513 | "\n", 514 | "plt.show()" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "There are a few possible explanations.\n", 522 | "\n", 523 | "1. The Magellanic Clouds are extended in the line of sight. It is possible that we see an effect of the slightly different distances of stars towards the foreground and of those towards background.\n", 524 | "2. The literature suggests that the P-L relationship can contain colour (V-I) terms, and can have dependence on metallicity. \n", 525 | "3. There are also suggestions of either a break in the P-L relationships (at log(P) = 1 for FU and at at log(P) = 0.5 for FO) or the inclusion of a quadratic term. \n", 526 | "4. 
Unidentified effect or naturally non-normally distributed errors in period and the Wesenheit index.\n", 527 | "\n", 528 | "First we check the first point up there. Create a map of the stars on the sky (plot of RA0 and Decl0), coloured according to the sign of the residuals; if there is an effect of distance, then negative residuals and positive residuals will be differently grouped, and hinting at the geometry of the Cloud. Do this separately for the four fits." 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "collapsed": true 536 | }, 537 | "outputs": [], 538 | "source": [ 539 | "i_posresid = (cep['resid0'] > 0)\n", 540 | " \n", 541 | "fig = plt.figure(1)\n", 542 | "\n", 543 | "fig.add_subplot(221)\n", 544 | "plt.plot(cep[-i_lmc & i_fu & i_posresid]['RA0'], cep[-i_lmc & i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 545 | "plt.plot(cep[-i_lmc & i_fu & -i_posresid]['RA0'], cep[-i_lmc & i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 546 | "plt.title('SMC FU')\n", 547 | "\n", 548 | "fig.add_subplot(222)\n", 549 | "plt.plot(cep[i_lmc & i_fu & i_posresid]['RA0'], cep[i_lmc & i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 550 | "plt.plot(cep[i_lmc & i_fu & -i_posresid]['RA0'], cep[i_lmc & i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 551 | "plt.title('LMC FU')\n", 552 | "\n", 553 | "fig.add_subplot(223)\n", 554 | "plt.plot(cep[-i_lmc & -i_fu & i_posresid]['RA0'], cep[-i_lmc & -i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 555 | "plt.plot(cep[-i_lmc & -i_fu & -i_posresid]['RA0'], cep[-i_lmc & -i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 556 | "plt.title('SMC FU')\n", 557 | "\n", 558 | "fig.add_subplot(224)\n", 559 | "plt.plot(cep[i_lmc & -i_fu & i_posresid]['RA0'], cep[i_lmc & -i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 560 | "plt.plot(cep[i_lmc & -i_fu & -i_posresid]['RA0'], cep[i_lmc & -i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 561 | "plt.title('LMC FU')\n", 562 | 
"\n", 563 | "plt.show()" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "5.2 RESIDUALS AGAINST FITTED VALUES AND COVARIATE\n", 571 | "\n", 572 | "After concluding on this point, we can do some further checks on the distribution. Statisticians usually check whether the variance of the response (or the residuals) depends on the fitted value. For example, if our response variable should be considered to be a Poisson variable, then its variance would be equal to the mean, which is varying with the covariate(s). Thus, in such a case, plotting the residuals against the fitted values, we would see a band narrow at small fitted values, and widening with increasing fitted values. For a homoscedastic normal distribution, we would find a band of constant width. Other patterns can hint to other distributions. \n", 573 | "Plot the residuals versus the fitted value for each of the four fits. What do you think? Take into account the local number of the data: with more data within some fitted value bin, you see more of the extremes of the local distribution than with fewer data." 
574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": { 580 | "collapsed": false 581 | }, 582 | "outputs": [], 583 | "source": [ 584 | "fig = plt.figure(1)\n", 585 | "\n", 586 | "fig.add_subplot(221)\n", 587 | "plt.plot(cep[-i_lmc & i_fu]['fitted0'], cep[-i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 588 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['fitted0'].min(), xmax = cep[-i_lmc & i_fu]['fitted0'].max(), lw = 2)\n", 589 | "plt.title('SMC FU')\n", 590 | "\n", 591 | "fig.add_subplot(222)\n", 592 | "plt.plot(cep[i_lmc & i_fu]['fitted0'], cep[i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 593 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['fitted0'].min(), xmax = cep[-i_lmc & i_fu]['fitted0'].max(), lw = 2)\n", 594 | "plt.title('LMC FU')\n", 595 | "\n", 596 | "fig.add_subplot(223)\n", 597 | "plt.plot(cep[-i_lmc & -i_fu]['fitted0'], cep[-i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 598 | "plt.hlines(0, xmin = cep[-i_lmc & -i_fu]['fitted0'].min(), xmax = cep[-i_lmc & -i_fu]['fitted0'].max(), lw = 2)\n", 599 | "plt.title('SMC FO')\n", 600 | "\n", 601 | "fig.add_subplot(224)\n", 602 | "plt.plot(cep[i_lmc & -i_fu]['fitted0'], cep[i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 603 | "plt.hlines(0, xmin = cep[i_lmc & -i_fu]['fitted0'].min(), xmax = cep[i_lmc & -i_fu]['fitted0'].max(), lw = 2)\n", 604 | "plt.title('LMC FO')\n", 605 | "\n", 606 | "plt.show()" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "Another useful plot (which is generally used) is the plot of residuals against covariates. We can see the intervals of lack of fits, the bias, the necessity of more terms or a nonparametric model. Create this plot. Do you see a strong indication of quadratic terms or breaks in the model?" 
614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": { 620 | "collapsed": true 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "fig = plt.figure(1)\n", 625 | "\n", 626 | "fig.add_subplot(221)\n", 627 | "plt.plot(cep[-i_lmc & i_fu]['logP1'], cep[-i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 628 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['logP1'].min(), xmax = cep[-i_lmc & i_fu]['logP1'].max(), lw = 2)\n", 629 | "plt.title('SMC FU')\n", 630 | "\n", 631 | "fig.add_subplot(222)\n", 632 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 633 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['logP1'].min(), xmax = cep[-i_lmc & i_fu]['logP1'].max(), lw = 2)\n", 634 | "plt.title('LMC FU')\n", 635 | "\n", 636 | "fig.add_subplot(223)\n", 637 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 638 | "plt.hlines(0, xmin = cep[-i_lmc & -i_fu]['logP1'].min(), xmax = cep[-i_lmc & -i_fu]['logP1'].max(), lw = 2)\n", 639 | "plt.title('SMC FO')\n", 640 | "\n", 641 | "fig.add_subplot(224)\n", 642 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 643 | "plt.hlines(0, xmin = cep[i_lmc & -i_fu]['logP1'].min(), xmax = cep[i_lmc & -i_fu]['logP1'].max(), lw = 2)\n", 644 | "plt.title('LMC FO')\n", 645 | "\n", 646 | "plt.show()" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "5.3 MODEL COMPARISON: IS V-I NECESSARY TO INCLUDE?\n", 654 | "\n", 655 | "Several authors propose the inclusion of a linear V-I term to the P-L relationship in its form using magnitudes of the stars. As we use the Wesenheit index, this is equivalent to allow for a correction term to the used coefficient 1.55 (recall that W = I - 1.55(V - I)). Do we need this term? First visualize." 
656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": { 662 | "collapsed": true 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "fig = plt.figure(1)\n", 667 | "\n", 668 | "fig.add_subplot(221)\n", 669 | "plt.plot(cep[-i_lmc & i_fu]['VI'], cep[-i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 670 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['VI'].min(), xmax = cep[-i_lmc & i_fu]['VI'].max(), lw = 2)\n", 671 | "plt.title('SMC FU')\n", 672 | "\n", 673 | "fig.add_subplot(222)\n", 674 | "plt.plot(cep[i_lmc & i_fu]['VI'], cep[i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 675 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['VI'].min(), xmax = cep[-i_lmc & i_fu]['VI'].max(), lw = 2)\n", 676 | "plt.title('LMC FU')\n", 677 | "\n", 678 | "fig.add_subplot(223)\n", 679 | "plt.plot(cep[-i_lmc & -i_fu]['VI'], cep[-i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 680 | "plt.hlines(0, xmin = cep[-i_lmc & -i_fu]['VI'].min(), xmax = cep[-i_lmc & -i_fu]['VI'].max(), lw = 2)\n", 681 | "plt.title('SMC FO')\n", 682 | "\n", 683 | "fig.add_subplot(224)\n", 684 | "plt.plot(cep[i_lmc & -i_fu]['VI'], cep[i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 685 | "plt.hlines(0, xmin = cep[i_lmc & -i_fu]['VI'].min(), xmax = cep[i_lmc & -i_fu]['VI'].max(), lw = 2)\n", 686 | "plt.title('LMC FO')\n", 687 | "\n", 688 | "plt.show()" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "Just to check on the literature, to see a frequent and sometimes unnoticed problem of linear models, and to use the model comparison techniques, we fit models with both logP1 and V - I. However, we should be careful. It cannot be excluded that the two explanatory variables directly depend on each other (actually, this can even be expected, since the Cepheids have a very constrained pulsation and stellar structure model). 
If such a relationship holds between two covariates in a linear model, then mathematically, the model can become ill-determined, and strongly unstable against small changes in the data. This is because in such a case, a change in the coefficient of one of these covariates can be compensated by a corresponding change in the coefficient of the other. So first use a scatterplot to see the logP1-VI relationship in the four Cepheid groups. What do you conclude?" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": { 702 | "collapsed": true 703 | }, 704 | "outputs": [], 705 | "source": [ 706 | "fig = plt.figure(1)\n", 707 | "\n", 708 | "fig.add_subplot(221)\n", 709 | "plt.plot(cep[-i_lmc & i_fu]['logP1'], cep[-i_lmc & i_fu]['VI'], 'g*', alpha=0.2)\n", 710 | "plt.title('SMC FU')\n", 711 | "\n", 712 | "fig.add_subplot(222)\n", 713 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['VI'], 'g*', alpha=0.2)\n", 714 | "plt.title('LMC FU')\n", 715 | "\n", 716 | "fig.add_subplot(223)\n", 717 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['VI'], 'g*', alpha=0.2)\n", 718 | "plt.title('SMC FO')\n", 719 | "\n", 720 | "fig.add_subplot(224)\n", 721 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['VI'], 'g*', alpha=0.2)\n", 722 | "plt.title('LMC FO')\n", 723 | "\n", 724 | "plt.show()" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "This problem is called the collinearity problem. A solution is to orthogonalize the variables; we perform this by regressing VI on logP1, and extracting the residuals of this model. The residuals now, by virtue of some statistical magic, are now uncorrelated with logP1, and can be used in a two-variate period-luminosity-color relationship without the risk of ending up with a singular model.\n", 732 | "After fitting, check up on the significance table of the model parameters. 
" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": { 739 | "collapsed": true 740 | }, 741 | "outputs": [], 742 | "source": [ 743 | "vi_lmfit_lmc_fu = smf.ols(formula = 'VI ~ logP1', data = cep, subset = i_lmc & i_fu).fit()\n", 744 | "vi_lmfit_lmc_fo = smf.ols(formula = 'VI ~ logP1', data = cep, subset = i_lmc & -i_fu).fit()\n", 745 | "\n", 746 | "vi_lmfit_smc_fu = smf.ols(formula = 'VI ~ logP1', data = cep, subset = -i_lmc & i_fu).fit()\n", 747 | "vi_lmfit_smc_fo = smf.ols(formula = 'VI ~ logP1', data = cep, subset = -i_lmc & -i_fu).fit()" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": { 754 | "collapsed": false 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "print vi_lmfit_smc_fo.summary()" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "Next, add the new column 'resid_vi' as an additional variable to the models. Compare the different model comparison measures: the likelihood, the AIC and the BIC to those of models without resid_vi. As well, repeat the former plot, now superposing the new residuals in a new (transparent) colour. What do you see? Would you accept the necessity of including such a term into your models? Consider different aspects of the problem: improvement on the model as goodness-of-fit measures summarize, behaviour of the residuals, behaviour of the outliers, size of the effect, errors on the coefficients in the two models. 
" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": { 772 | "collapsed": true 773 | }, 774 | "outputs": [], 775 | "source": [ 776 | "cep['resid_vi'] = np.zeros(cep.shape[0])\n", 777 | "cep.loc[(i_lmc & i_fu),'resid_vi'] = vi_lmfit_lmc_fu.resid\n", 778 | "cep.loc[(-i_lmc & i_fu),'resid_vi'] = vi_lmfit_smc_fu.resid\n", 779 | "cep.loc[(i_lmc & -i_fu),'resid_vi'] = vi_lmfit_lmc_fo.resid\n", 780 | "cep.loc[(-i_lmc & -i_fu),'resid_vi'] = vi_lmfit_smc_fo.resid" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": { 787 | "collapsed": true 788 | }, 789 | "outputs": [], 790 | "source": [ 791 | "cep[0:10]" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": { 798 | "collapsed": true 799 | }, 800 | "outputs": [], 801 | "source": [ 802 | "lmfit_lmc_fu2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = i_lmc & i_fu).fit()\n", 803 | "lmfit_lmc_fo2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = i_lmc & -i_fu).fit()\n", 804 | "\n", 805 | "lmfit_smc_fu2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = -i_lmc & i_fu).fit()\n", 806 | "lmfit_smc_fo2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = -i_lmc & -i_fu).fit()" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": { 813 | "collapsed": false 814 | }, 815 | "outputs": [], 816 | "source": [ 817 | "print lmfit_lmc_fu2.summary()" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "metadata": { 824 | "collapsed": false 825 | }, 826 | "outputs": [], 827 | "source": [ 828 | "print lmfit_lmc_fo.bic\n", 829 | "print lmfit_lmc_fo2.bic" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "Finally, for seeing how collinearity affects the model results, we fit models also using the original V-I, 
correlated with log(P). " 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": { 843 | "collapsed": true 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "lmfit_lmc_fu1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = i_lmc & i_fu).fit()\n", 848 | "lmfit_lmc_fo1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = i_lmc & -i_fu).fit()\n", 849 | "\n", 850 | "lmfit_smc_fu1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = -i_lmc & i_fu).fit()\n", 851 | "lmfit_smc_fo1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = -i_lmc & -i_fu).fit()" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": { 858 | "collapsed": false 859 | }, 860 | "outputs": [], 861 | "source": [ 862 | "print lmfit_lmc_fu1.summary()\n", 863 | "print lmfit_lmc_fu2.summary()" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": {}, 869 | "source": [ 870 | "Comments:\n", 871 | "\n", 872 | "1. The fitted coefficient of VI and resid_vi in the models are the same, as well as their errors. There is no change in the log-likelihood, AIC and BIC either: model goodness is the same.\n", 873 | "\n", 874 | "2. Both the intersect and the coeff of log(P) have changed. We don't know a priori, which is the true value (if exists at all), but the error on both is smaller in the orthogonal models than in the collinear models.\n", 875 | "\n", 876 | "3. Computing the correlation matrix of the parameters, we find no correlation of resid_vi with the other parameters in the W ~ (logP1, resid_vi) model, versus a high correlation of VI with them in the W ~ (logP1, VI) model. This is a manifestation of the fact that in a collinear model, coefficients can take over (at least partly) the role of each other, and thus, their value is more sensitive to particular outliers and structures in the data than in an orthogonal model. \n", 877 | "\n", 878 | "4. 
That a correction using V-I causes some improvement on the outliers and on the distributional discrepancy, can also be seen on the values for skewness and kurtosis (for a normal distribution, these should be 0 and 3 respectively), even though the improvement is indeed tiny, and we did not get much closer to satisfy the distributional assumption to have valid asymptotics of the least squares models (using V-I isn't any help of course with the distance effect, and there might be other effects as well, for instance metallicity). The Jarque-Bera test shows the same.\n", 879 | "\n", 880 | "5. The condition number (ratio of the largest and the smallest eigenvalues of the X^T X matrix, where X is the \"design matrix\", here basically the data matrix with a first column of ones) is smaller in the orthogonalized model than in the collinear one. As a rule of thumb, condition number higher than 30 indicates dangerously strong collinearity. We here do not reach this limit even with the model W ~ (logP1, VI), but still, we can see the improvement on the condition number from this model to the W ~ (logP1, resid_vi) one.\n", 881 | "\n", 882 | "6. See the log-likelihood surfaces, shown on the slides.\n", 883 | "\n", 884 | "7. Nevertheless, the discrepancy from normally distributed errors is so big that all estimated errors, all asymptotics are useless! \n", 885 | "\n" 886 | ] 887 | }, 888 | { 889 | "cell_type": "markdown", 890 | "metadata": {}, 891 | "source": [ 892 | "In case there is still time, or someone finished the exercise very early, here are some more propositions to work on.\n", 893 | "\n", 894 | "Problem 1: Monte Carlo methods for errors on the estimates.\n", 895 | "\n", 896 | "The literature errors given in Soszynski et al 2008 are . \n", 897 | "Do a subsampling: resample n0 stars from the n stars, and estimate again the PL relation (you can work with either the univariate or the bivariate model). Repeat subsampling and estimation R times. 
Is the distribution of the estimates normal, as the asymptotic theory says about the maximum likelihood estimator? Take the IQR or the MAD as est. of standard deviation, deflate it with the numeric factor sqrt(n0/n): is this consistent with what is given in the literature?\n", 898 | "\n", 899 | "Problem 2: Use heavy-tailed distributions for estimation.\n", 900 | "\n", 901 | "Assuming a (rescaled) t for the error distribution, can you find a plausible value for nu? (Use the qq-plots and the fact that with the right nu, the rescaled t-variates still would show a straight line pattern.) Compute and plot the likelihood. Find an estimate of the parameters and their errors within are the errors now more realistic? \n" 902 | ] 903 | } 904 | ], 905 | "metadata": { 906 | "celltoolbar": "Raw Cell Format", 907 | "kernelspec": { 908 | "display_name": "Python 2", 909 | "language": "python", 910 | "name": "python2" 911 | }, 912 | "language_info": { 913 | "codemirror_mode": { 914 | "name": "ipython", 915 | "version": 2 916 | }, 917 | "file_extension": ".py", 918 | "mimetype": "text/x-python", 919 | "name": "python", 920 | "nbconvert_exporter": "python", 921 | "pygments_lexer": "ipython2", 922 | "version": "2.7.11" 923 | } 924 | }, 925 | "nbformat": 4, 926 | "nbformat_minor": 0 927 | } 928 | --------------------------------------------------------------------------------