├── README.md └── Session1 └── Tuesday ├── data └── IntroStat_demo.csv └── IntroStat.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # LSST-DSFP-Resources 2 | For computing/programming/statistical resources compiled by LSSTC Data Science Fellows 3 | -------------------------------------------------------------------------------- /Session1/Tuesday/data/IntroStat_demo.csv: -------------------------------------------------------------------------------- 1 | time,mag.hom,mag.outlier,mag.t,mag.het,mag.het.error,mag5,mag5.error 2 | 0,18.5589442683513,18.5306774114789,18.5605273552103,18.5674472421206,0.0673447662033141,18.5460560136707,0.00976609719927976 3 | 0.17615778844447,18.5311784194935,18.5515604297523,18.5499055533718,18.5473152136787,0.0171485664555803,18.5426048480591,0.00770946746896302 4 | 0.250164970666731,18.5536692551417,18.4680954415675,18.5446177769689,18.5641906767379,0.0206004055216908,18.5362693395877,0.00977726355227015 5 | 1.42698258962969,18.5394751614305,18.5373682418528,18.5608179688024,18.5541623997199,0.0743387433001772,18.5457818773318,0.0087049168517297 6 | 1.67714756266668,18.5754423277417,18.7905210141074,18.5561264918157,18.5147653876404,0.0274689674843103,18.5536441886995,0.0100835735266972 7 | 1.92731250014822,18.5688096791856,18.5484733020366,18.5307572973929,18.5511344995731,0.00128030302003026,18.5535616090684,0.00862268498232961 8 | 2.00131967288894,18.5306839699509,18.5738948687948,18.5923607846999,18.5463770669773,0.0385863671079278,18.5435323687151,0.00353901734083689 9 | 2.17747751822225,18.5404357191549,18.5838537499684,18.5550741257684,18.5096817311331,0.0311729908455163,18.547784233619,0.00730273392960781 10 | 2.25148467911117,18.5336487465924,18.5505136689393,18.5487974225727,18.6204548264426,0.030169404274784,18.5423081349491,0.00586801461150437 11 | 
2.42764244385194,18.5559246236964,18.5562794305921,18.5379523235324,18.5621240466992,0.00825049160048366,18.5412702481465,0.0121236431161133 12 | 2.50164955970376,18.5501084203234,18.5557037579903,18.5529945336716,18.547843601537,0.0507529947673902,18.5452619740364,0.00735818068071379 13 | 2.67780726755558,18.5645197288917,18.5549543912184,18.5618490369254,18.5460361520644,0.0809831742430106,18.5600811582278,0.00613799327651888 14 | 2.75181445214821,18.5635795082143,18.548374238575,18.5531600560735,18.5210358782911,0.0934805805096403,18.5447121835436,0.00681950987245638 15 | 2.92797224059268,18.5589321160663,18.5415504934681,18.5500181376146,18.5564302249123,0.0121413861168548,18.5482663380215,0.00402238791936502 16 | 3.00197940148155,18.554400284808,18.5433372839744,18.5531433811103,18.5458148129442,0.0128612750209868,18.5464045389296,0.00496515945617352 17 | 3.42830205866676,18.546831141818,18.5392489410442,18.5445689926193,18.4163400443553,0.0893393303267658,18.5465421911513,0.00825006995218595 18 | 3.50230923140748,18.5557632083347,18.7706070325729,18.5419814301352,18.5956025884245,0.0608045848319307,18.5438209692729,0.0087205281991559 19 | 3.678466962963,18.544265974006,18.5626371757059,18.4650401520377,18.5828734311861,0.0547708112280816,18.552398885318,0.0064182670261062 20 | 3.75247411200007,18.5351829265835,18.522141743808,18.5717250651362,18.4845672560639,0.0829137690132484,18.5552325488267,0.00493270036606317 21 | 3.92863187674078,18.5516688603311,18.567852081761,18.5400049667626,18.5240258816669,0.0423200894379988,18.5487543275974,0.0065567976860378 22 | 4.00263900444452,18.5285122655126,19.0505896825117,18.5576683009049,18.5594464913454,0.0772306305123493,18.5586718452476,0.00875526584287117 23 | 4.17879676918523,18.5557929338106,18.5569551618032,18.5493963939202,18.5326367831398,0.0280512649798766,18.5607002282244,0.00557509450438661 24 | 
4.25280389451854,18.5529326279236,18.5586263953885,18.5647347420499,18.5722450389334,0.0787957694847137,18.5468448441239,0.00775270203974073 25 | 4.42896166162967,18.5466721253703,18.5662731977358,18.5318617352833,18.5691142289703,0.0125670915469527,18.5433680008302,0.00947502961077787 26 | 4.50296879881489,18.5455843284892,18.5457739476833,18.543658236517,18.5550040797419,0.0425309023354203,18.5554296288972,0.00626346095601517 27 | 4.67912653037035,18.5409110697306,18.5496056417444,18.5510661058619,18.561286360916,0.0702062232652679,18.5594793550286,0.00723944009740642 28 | 4.75313367940748,18.5519659351588,18.5381468484592,18.5608269244419,18.5280668947586,0.0233837144449353,18.5515065815533,0.0066153948037125 29 | 4.92929143466671,18.5706305083806,18.537476364426,18.5156572585304,18.6086035990201,0.0518757134443149,18.5434085528152,0.00928274239492134 30 | 5.00329857185187,18.5461062937116,18.5067553864728,18.551052788274,18.4834255226187,0.0612845422700048,18.5504014218671,0.00756243632938453 31 | 5.17945630340745,18.5590888928021,18.5700617572974,18.5435521208434,18.5562747021189,0.0972205097321421,18.5539690640214,0.00450351228300857 32 | 5.25346344296304,18.5565274652536,18.5785041773708,18.5370502723415,18.4631089748217,0.0927823832491413,18.5480576832403,0.0100016004981465 33 | 5.42962116266671,18.5388473006871,18.5187224700964,18.5514549479656,18.4588274189247,0.0710915053263307,18.54330290055,0.00707806461260917 34 | 5.50362828800007,18.5375868547063,18.5524568672701,18.5483980877705,18.5418905128437,0.0468997861025855,18.5547896497908,0.00774302896781335 35 | 5.67978603140745,18.5536257470751,18.5593359659178,18.553944265792,18.4274408901564,0.0653245693072677,18.5717981769911,0.00722942938671263 36 | 5.75379315911113,18.5427818423028,18.5531113998726,18.5775096647854,18.5840574022885,0.0320707945851609,18.5588027590485,0.0050738080157706 37 | 
5.92995090014819,18.5555780126327,18.5742190074774,18.5409538297753,18.5486975066292,0.0774393061175943,18.5605463263609,0.00969025293870255 38 | 6.00395802785187,18.5565287734809,18.5775463590886,18.5617849366835,18.5708663214191,0.0920553196454421,18.5599068939509,0.0109402676459909 39 | 6.25412282785192,18.5559035686907,18.5630972859709,18.5308566700525,18.5157921009237,0.0548952181823552,18.5604036521072,0.00453180586065513 40 | 6.43028060681485,18.5661313710837,18.5591448931475,18.5589808141151,18.5110368266975,0.0858319909777492,18.5535933735478,0.00978519182482998 41 | 6.50428772029636,18.5477366775425,18.523965847213,18.5372351265929,18.545297330534,0.0821733459597453,18.5631851541897,0.00812575237508486 42 | 6.68044546370379,18.5708228054324,18.5590228343367,18.542158981161,18.5888687475159,0.0922145419754088,18.5644907342443,0.0097660092222279 43 | 6.75445256770377,18.5311585305758,18.5601414916841,18.5885692358754,18.5677957938898,0.0548793095164001,18.5533484567386,0.0097874431665338 44 | 6.93061029925934,18.5362721280303,18.5565164042252,18.5362535880405,18.5333127716303,0.0767854744335636,18.5636281678286,0.00520761918669793 45 | 7.00461737955561,18.5543355018981,18.5199741555534,18.5348857719401,18.5118432333631,0.0414669126272202,18.5608657030309,0.00839216899655168 46 | 7.18077512296298,18.5602723554936,18.5575914312487,18.5513116331508,18.551788246578,0.00213281528558582,18.5643997597013,0.00331454630322723 47 | 7.25478221511116,18.5257816398252,18.5656521894532,18.5227292410936,18.5714566674585,0.0457111209398136,18.5544774937076,0.0120271840375375 48 | 7.43093994666668,18.5547447326351,18.5195334326354,18.5349179931093,18.5820713647261,0.0336836270755157,18.5563353490799,0.00750787351010734 49 | 7.50494705066671,18.5604662283269,18.5671684167028,18.5564133877834,18.5696808855744,0.0616545808268711,18.5583136913205,0.00948953421018131 50 | 
7.68110477274081,18.5694418236949,18.5112242984846,18.5487852701645,18.5810835591695,0.0684731747955084,18.5548631599234,0.00623661545157984 51 | 7.75511186488893,18.5481387618937,18.5263559304141,18.5388436463712,18.4966061176472,0.0361196914687753,18.5650441165026,0.0100793221832902 52 | 7.93126960592599,18.5344942778896,18.5553861340442,18.5466361455307,18.6213120273994,0.0985294625163078,18.5630625332219,0.0092009796337932 53 | 8.00527670044448,18.5562077086716,18.5472065305838,18.5433572258355,18.5970840277694,0.0458260116400197,18.5618095388714,0.00751794259404281 54 | 8.18143443200006,18.5472361837207,18.5449905470038,18.5722239125806,18.5903195577598,0.0678766368655488,18.5698626438528,0.00741529829594704 55 | 8.25544151229633,18.5456263186827,18.5470748575154,18.5679674152509,18.5550136968236,0.00645137266255915,18.5690106783907,0.00692850201038873 56 | 8.43159921066672,18.5428732685636,18.553915380597,18.5084077653486,18.5386932314377,0.0213110986864194,18.5652629695566,0.00566129593090012 57 | 8.50560631229632,18.5489438676115,18.5282370595513,18.551243556647,18.5552308890188,0.0135498776100576,18.5692212643741,0.0191769416240384 58 | 8.68176403437042,18.5649901953502,18.530235760623,18.5379110186313,18.4941281286038,0.0885351747972891,18.5689534312037,0.00544968844438369 59 | 8.75577110281483,18.554510866556,18.5727386126227,18.558164488833,18.6862964660561,0.081282608024776,18.5611947263895,0.00856172176535711 60 | 9.18209362488892,18.5432245935312,18.5550370293737,18.5421551370975,18.5540778138426,0.0527859937399626,18.5511194223028,0.0102602548596579 61 | 9.25610069333334,18.5475828322227,18.534449768215,18.5690581822654,18.5209402927344,0.0233320454601198,18.5694870799201,0.0102262135819955 62 | 9.43225841540743,18.548173408636,18.563840119393,18.5480218995715,18.5350612382339,0.0703313875943422,18.5580685195013,0.0152410020202681 63 | 
9.50626550755561,18.5345295794331,18.5296641514354,18.5806037457966,18.5491152327246,0.00295040390919894,18.5769683123307,0.00471203349230513 64 | 9.68242319407409,18.5807764882747,18.5641730508058,18.5490765705939,18.5354269120785,0.0227896849624813,18.5620501633528,0.0148169371188335 65 | 9.75643026251856,18.5292425959111,18.5306067627064,18.540530926466,18.5800183002258,0.0738576146308333,18.5644408520759,0.00867916245522732 66 | 9.93258798222223,18.5508722992717,18.5545087175195,18.5469753426537,18.5352170998336,0.0250213630497456,18.573991727579,0.00533889049371997 67 | 10.006595053037,18.5408493639313,18.5289990734953,18.5624253252434,18.6270040754068,0.0639115944504738,18.5690024417712,0.00517361114636936 68 | 10.1827527608889,18.5501236358738,18.5432547797487,18.5331005198468,18.5524171191361,0.0406069706194103,18.5716193889192,0.00465990579290407 69 | 10.5069245985185,18.5409586616097,18.5319886869186,18.5438105928918,18.5628621968699,0.0116179539356381,18.5885800765758,0.011534692324484 70 | 10.6830822850371,18.5487867667802,18.5659105309089,18.6033156238295,18.5676869873518,0.043744198884815,18.570469858646,0.0104915347826452 71 | 10.7570893416297,18.5376891141176,18.5716775125042,18.5430999648969,18.4668218661464,0.0453147494699806,18.5840090914808,0.0137403462217677 72 | 10.9332470518519,18.5536042249926,18.5546561015614,18.5349260282157,18.6301409249524,0.0721810968825594,18.5641233076457,0.00587263297435276 73 | 11.0072541084445,18.5479652432903,18.5535084329554,18.5429766128583,18.5682853060466,0.0879812199622393,18.5663568587749,0.00984477441662157 74 | 11.5075835994075,18.5581601224785,18.5583863069206,18.5489521111866,18.462254659705,0.0873479148373008,18.5815633683939,0.00385140113468823 75 | 11.7577483425186,18.5468841458741,18.5184626994895,18.5391532518418,18.585988151904,0.0436564211267978,18.5763595374087,0.00325572038390436 76 | -------------------------------------------------------------------------------- 
/Session1/Tuesday/IntroStat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import scipy\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "import math\n", 15 | "import statsmodels.api as sm\n", 16 | "from matplotlib import pyplot as plt" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# 1. Distributions: \n", 24 | "how to get to know better the distribution of the data, identify various issues, and check fits?\n", 25 | "\n", 26 | "## 1.1. Probability distributions\n", 27 | "\n", 28 | "Ex 1. QQ-PLOT BASICS. \n", 29 | "\n", 30 | "First, simulate 9 times 200 standard random normal variables, and inspect the variations in the qq-plots.\n", 31 | "Take a look at the help of qqplots in the statsmodels.api module of Python using sm.qqplot? if necessary." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "fig = plt.figure(1)\n", 43 | "for ii in range(1, 10):\n", 44 | " rv = scipy.stats.norm.rvs(0, 1, size = 200)\n", 45 | " ax = fig.add_subplot(3,3,ii)\n", 46 | " sm.qqplot(rv, line = 's', ax = ax)\n", 47 | "plt.show()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Ex 2. LINEARITY OF THE NORMAL DISTRIBUTION. \n", 55 | "\n", 56 | "Repeat the exercise using random normal variables with mean 3 and standard deviation 0.4, using now 3 times each of 4,20 and 200.\n", 57 | "\n", 58 | "The automatically added line on the qq-plot is estimated by taking the empirical mean and square-root empirical variance of the sample; these are the simplest estimators of the parameters of a normal sample. 
For a normal distribution, if X ~ N(0,1), then the variable Y = m + sX ~ N(m, s^2). Therefore, if you plotted your sample Y against standard normal quantiles, a line using the true mean m as intercept and the true standard deviation s as slope should represent the truth, and probably should fit the sample well. \n", 59 | "\n", 60 | "Add a line with intercept 3 and slope 0.4 (representing the n = Infty perfect sample). Are the empirical mean and variance good estimators of the population mean and variance? " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "x = np.arange(-3.,3.,0.01)\n", 72 | "y = 3. + 0.4*x\n", 73 | "nsample = [4,4,4,20,20,20,200,200,200]\n", 74 | "fig = plt.figure(1)\n", 75 | "for ii in range(0,9):\n", 76 | " rv = scipy.stats.norm.rvs(3., 0.4, size = nsample[ii])\n", 77 | " ax = fig.add_subplot(3,3,ii+1)\n", 78 | " sm.qqplot(rv, line = 's', ax = ax)\n", 79 | " ax.plot(x, y, color='c')\n", 80 | "plt.show()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Ex 3. RECOGNIZE DISTRIBUTIONAL FEATURES ON THE QQ-PLOTS.\n", 88 | "\n", 89 | "The qq-plot provides a powerful check of distributional assumptions. Use the template below to see the following distributions: Cauchy (heavy-tailed), chi-squared (much used), beta(0.5,2) (restricted to the interval [0,1]; contains the uniform distribution as beta(1,1)) and two Poisson distributions, with mean 3 and 250. \n", 90 | "\n", 91 | "This time, we use the option fit = True in sm.qqplot, so that the sample is standardized by its mean and standard error before plotting. \n", 92 | "\n", 93 | "Replace the random distribution in the codes, both for the comparison with the normal distribution and for the qq-plot (check the parameters at http://docs.scipy.org/doc/scipy/reference/stats.html). 
Compare the tail behavior on the plot of the density and on the qq-plot. " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "x = np.linspace(scipy.stats.cauchy.ppf(0.005), scipy.stats.cauchy.ppf(0.995), 200)\n", 105 | "#x = np.arange(scipy.stats.poisson.ppf(0.005, mu=250), scipy.stats.poisson.ppf(0.995, mu=250), 1)\n", 106 | "\n", 107 | "rv = scipy.stats.cauchy.rvs(size = 400)\n", 108 | "\n", 109 | "fig = plt.figure(3)\n", 110 | "ax1 = fig.add_subplot(121)\n", 111 | "plt.hist(rv, 20, normed=1, facecolor='y', alpha=0.75)\n", 112 | "ax1.plot(x, scipy.stats.cauchy.pdf(x),'b-', lw=2)\n", 113 | "#ax1.vlines(x, 0, scipy.stats.poisson.pmf(x, mu=250), colors='b', lw=5, alpha=0.2)\n", 114 | "ax2 = fig.add_subplot(122)\n", 115 | "sm.qqplot(rv, line = 's', ax = ax2)\n", 116 | "\n", 117 | "plt.show()\n", 118 | "\n", 119 | "# To see the extremes of the random variates:\n", 120 | "#rv.min()\n", 121 | "#rv.max()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 1.2. Estimating the parameters and an (incomplete) collection of frequent problems\n", 129 | "\n", 130 | "We will walk through some of the simplest complications that can affect real data, and see how to recognize them by the means of the quantile-quantile plot.\n", 131 | "\n", 132 | "Ex. 4. NUISANCES IN THE DATA\n", 133 | "\n", 134 | "(starter's guide for dataframes in pandas: http://pandas.pydata.org/pandas-docs/stable/dsintro.html, as well as http://www.scipy-lectures.org/packages/statistics/index.html)\n", 135 | "\n", 136 | "4.1. 
Outliers" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "dfr = pd.read_csv(\"./data/IntroStat_demo.csv\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "fig = plt.figure(1)\n", 159 | "\n", 160 | "plt.subplot(121)\n", 161 | "n, bins, patches = plt.hist(dfr['mag.outlier'], 12, normed=1, facecolor='y', alpha=0.75)\n", 162 | "ax = fig.add_subplot(122)\n", 163 | "sm.qqplot(dfr['mag.outlier'], line = 's', ax = ax)\n", 164 | "\n", 165 | "plt.show()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Replace the line by one defined using the quantile estimators (a line passing through the first and third quartiles)." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "fig = plt.figure(1)\n", 184 | "\n", 185 | "plt.subplot(121)\n", 186 | "plt.hist(dfr['mag.outlier'], 12, normed=1, facecolor='y', alpha=0.75)\n", 187 | "ax = fig.add_subplot(122)\n", 188 | "sm.qqplot(dfr['mag.outlier'], line = 'q', ax = ax)\n", 189 | "\n", 190 | "plt.show()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "Some robust (regression) models, M-estimators: http://statsmodels.sourceforge.net/stable/rlm.html.\n", 198 | "\n", 199 | "4.2. Heteroscedasticity\n", 200 | "\n", 201 | "Use now the 'mag.het' column in place of 'mag.outlier'. Does the quantile-based or the moment-estimated line work?" 
202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "fig = plt.figure(1)\n", 213 | "\n", 214 | "plt.subplot(121)\n", 215 | "plt.hist(dfr['mag.het'], 12, normed=1, facecolor='y', alpha=0.75)\n", 216 | "ax = fig.add_subplot(122)\n", 217 | "sm.qqplot(dfr['mag.het'], line = 'q', ax = ax)\n", 218 | "\n", 219 | "plt.show()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "The sample is, in fact, heteroscedastic, and the (estimated) standard errors are given in the column 'mag.het.error'.\n", 227 | "What can you do to check about the errors if you are in doubt about them? How can you check the normality of the sample?" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "def std_fn(x, mean, std):\n", 239 | " res = (x-mean) / std\n", 240 | " return res" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "The above standardization should lead to a homoscedastic standard normal sample (by the linearity of the normal distribution). Its QQ-plot should be now close to a line with slope 1." 
248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "\n", 259 | "w = dfr['mag.het.error']**(-2)\n", 260 | "m = np.average(dfr['mag.het'], weights = w)\n", 261 | "std_het = std_fn(x = dfr['mag.het'], mean = m, std = dfr['mag.het.error'])\n", 262 | "\n", 263 | "fig = plt.figure(1)\n", 264 | "plt.subplot(121)\n", 265 | "plt.hist(std_het, 12, normed=1, facecolor='grey', alpha=0.75)\n", 266 | "ax = fig.add_subplot(122)\n", 267 | "# The option line = '45' means a line with intercept 0 and slope 1.\n", 268 | "sm.qqplot(std_het, line = '45', ax = ax)\n", 269 | "\n", 270 | "plt.show()" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "4.3. Any other unexpected effect\n", 278 | "\n", 279 | "Use now the 'mag5' column in place of 'mag.outlier'. How does the standardization work in this case?" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": false 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "w = dfr['mag5.error']**(-2)\n", 291 | "m = np.average(dfr['mag5'], weights = w)\n", 292 | "std5 = std_fn(x = dfr['mag5'], mean = m, std = dfr['mag5.error'])\n", 293 | "\n", 294 | "fig = plt.figure(1)\n", 295 | "plt.subplot(121)\n", 296 | "plt.hist(std5, 12, normed=1, facecolor='grey', alpha=0.75)\n", 297 | "ax = fig.add_subplot(122)\n", 298 | "sm.qqplot(std5, line = '45', ax = ax)\n", 299 | "\n", 300 | "plt.show()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "Use the 'time' column in the dataframe to plot the data." 
308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "fig = plt.figure(1)\n", 319 | "\n", 320 | "plt.plot(dfr['time'], dfr['mag5'], 'ro')\n", 321 | "\n", 322 | "plt.xlabel('Time')\n", 323 | "plt.ylabel('Magnitude')\n", 324 | "# this is just to extend a bit the plotting area, so that no points fall exactly on the border:\n", 325 | "mn, mx = sorted(dfr['time'])[::len(dfr['time'])-1]\n", 326 | "plt.xlim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 327 | "\n", 328 | "plt.show()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "# 2. Classical estimation and hypothesis testing\n", 336 | "\n", 337 | "The demo data set for this part is the Wesenheit index of the OGLE-III fundamental-mode and first overtone classical Cepheids. We'll try to estimate their period-luminosity relationship.\n", 338 | "The Wesenheit index is defined as W = I - 1.55(V - I), and its main advantage over using simply the I or V photometry is that it is insensitive to extinction. It is denoted by 'W' among the data columns. Other columns are \n", 339 | "'name', the identifier of the star; \n", 340 | "'RA0' (in decimal hours) and 'Decl0' (in decimal degrees), celestial coordinates; \n", 341 | "'Mode', the mode of the Cepheid ('F' indicates fundamental-mode, '1' indicates first overtone star); \n", 342 | "'Cloud', indicating which Magellanic Cloud the star belongs to; \n", 343 | "'logP1', the base-10 logarithm of the period in days; \n", 344 | "'VI', the colour V-I. \n", 345 | "\n", 346 | "Ex. 5. ORDINARY LEAST SQUARES REGRESSION (= GAUSSIAN MAXIMUM LIKELIHOOD WITH A MEAN DEPENDING ON A COVARIATE)\n", 347 | "\n", 348 | "5.1 MODEL FIT\n", 349 | "\n", 350 | "There are fundamental-mode (FU) and first overtone (FO) Cepheids both from the SMC and the LMC. 
Represent the fundamental and first overtone Cepheids' P-L relationship (W versus logP1) in two separate scatterplots, the LMC and SMC stars with different colours. What do you see? Fit a separate linear regression model to each of the distinct groups (to check the content of the resulting objects 'lmfit_lmc_fu' etc., see with dir(lmfit_lmc_fu) ). How would you decide whether the slopes are the same for stars of the same mode in the two Clouds? " 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "import statsmodels.formula.api as smf" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "collapsed": false 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "cep = pd.read_csv(\"./data/Cepheids.csv\")\n", 373 | "cep[0:10]" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "i_lmc = cep['Cloud'] == \"LMC\"\n", 385 | "i_fu = cep['Mode'] == \"F\"\n", 386 | "\n", 387 | "fig = plt.figure(4)\n", 388 | "ax1 = fig.add_subplot(211)\n", 389 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['W'], 'b*', alpha=0.3, label = 'LMC')\n", 390 | "plt.plot(cep[-i_lmc & i_fu]['logP1'], cep[-i_lmc & i_fu]['W'], 'r*', alpha=0.3, label = 'SMC')\n", 391 | "plt.legend(loc = 'best', numpoints = 1)\n", 392 | "ax2 = fig.add_subplot(212)\n", 393 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['W'], 'b*', alpha=0.3, label = 'LMC')\n", 394 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['W'], 'r*', alpha=0.3, label = 'SMC')\n", 395 | "plt.legend(loc = 'best', numpoints = 1)\n", 396 | "\n", 397 | "plt.show()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": true 405 | }, 406 | 
"outputs": [], 407 | "source": [ 408 | "lmfit_lmc_fu = smf.ols(formula = 'W ~ logP1', data = cep, subset = i_lmc & i_fu).fit()\n", 409 | "lmfit_lmc_fo = smf.ols(formula = 'W ~ logP1', data = cep, subset = i_lmc & -i_fu).fit()\n", 410 | "\n", 411 | "lmfit_smc_fu = smf.ols(formula = 'W ~ logP1', data = cep, subset = -i_lmc & i_fu).fit()\n", 412 | "lmfit_smc_fo = smf.ols(formula = 'W ~ logP1', data = cep, subset = -i_lmc & -i_fu).fit()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": { 419 | "collapsed": false 420 | }, 421 | "outputs": [], 422 | "source": [ 423 | "print lmfit_smc_fo.summary()" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": null, 429 | "metadata": { 430 | "collapsed": false 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "cep['resid0'] = np.zeros(cep.shape[0])\n", 435 | "cep['fitted0'] = np.zeros(cep.shape[0])\n", 436 | "cep.loc[(i_lmc & i_fu),'resid0'] = lmfit_lmc_fu.resid\n", 437 | "cep.loc[(-i_lmc & i_fu),'resid0'] = lmfit_smc_fu.resid\n", 438 | "cep.loc[(i_lmc & -i_fu),'resid0'] = lmfit_lmc_fo.resid\n", 439 | "cep.loc[(-i_lmc & -i_fu),'resid0'] = lmfit_smc_fo.resid\n", 440 | "cep.loc[(i_lmc & i_fu),'fitted0'] = lmfit_lmc_fu.fittedvalues\n", 441 | "cep.loc[(-i_lmc & i_fu),'fitted0'] = lmfit_smc_fu.fittedvalues\n", 442 | "cep.loc[(i_lmc & -i_fu),'fitted0'] = lmfit_lmc_fo.fittedvalues\n", 443 | "cep.loc[(-i_lmc & -i_fu),'fitted0'] = lmfit_smc_fo.fittedvalues\n", 444 | "\n", 445 | "cep.iloc[0:10]" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "collapsed": true 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "logp_tmp = np.linspace(cep['logP1'].min(), cep['logP1'].max(), 500)\n", 457 | "\n", 458 | "fig = plt.figure(4)\n", 459 | "ax1 = fig.add_subplot(211)\n", 460 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['W'], 'c*', alpha=0.2, label = 'LMC')\n", 461 | "plt.plot(cep[-i_lmc & 
i_fu]['logP1'], cep[-i_lmc & i_fu]['W'], 'r*', alpha=0.2, label = 'SMC')\n", 462 | "plt.plot(logp_tmp, lmfit_lmc_fu.params['Intercept'] + logp_tmp * lmfit_lmc_fu.params['logP1'], 'blue', lw = 1)\n", 463 | "plt.plot(logp_tmp, lmfit_smc_fu.params['Intercept'] + logp_tmp * lmfit_smc_fu.params['logP1'], 'brown', lw = 1)\n", 464 | "plt.legend(loc = 'best', numpoints = 1)\n", 465 | "ax2 = fig.add_subplot(212)\n", 466 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['W'], 'c*', alpha=0.2, label = 'LMC')\n", 467 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['W'], 'r*', alpha=0.2, label = 'SMC')\n", 468 | "plt.legend(loc = 'best', numpoints = 1)\n", 469 | "plt.plot(logp_tmp, lmfit_lmc_fo.params['Intercept'] + logp_tmp * lmfit_lmc_fo.params['logP1'], 'blue', lw = 1)\n", 470 | "plt.plot(logp_tmp, lmfit_smc_fo.params['Intercept'] + logp_tmp * lmfit_smc_fo.params['logP1'], 'brown', lw = 1)\n", 471 | "\n", 472 | "plt.show()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "5.2 MODEL DIAGNOSTICS: QQ-PLOT OF RESIDUALS\n", 480 | "\n", 481 | "Let's start with checking the distributional assumptions of the model: do the residuals admit a normal distribution? Take a look at the output of the four linear models ( lmobject.summary() ). Can you see indications there of non-normality? Make and inspect the QQ-plot of the residuals, separately for the four groups of Cepheids. What do you find? What can be the reason of what you observe?" 
482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": { 488 | "collapsed": true 489 | }, 490 | "outputs": [], 491 | "source": [ 492 | "mn, mx = sorted(cep['resid0'])[::len(cep['resid0'])-1]\n", 493 | "\n", 494 | "fig = plt.figure(1)\n", 495 | "plt.subplots_adjust(left=0.07, bottom=0.08, right=0.95, top=0.95, wspace=None, hspace=0.35)\n", 496 | "\n", 497 | "ax1 = fig.add_subplot(221)\n", 498 | "sm.qqplot(cep[i_lmc & i_fu]['resid0'], line = 's', ax = ax1)\n", 499 | "plt.title(\"LMC FU\")\n", 500 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 501 | "ax2 = fig.add_subplot(222)\n", 502 | "sm.qqplot(cep[-i_lmc & i_fu]['resid0'], line = 's', ax = ax2)\n", 503 | "plt.title(\"SMC FU\")\n", 504 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 505 | "ax3 = fig.add_subplot(223)\n", 506 | "sm.qqplot(cep[i_lmc & -i_fu]['resid0'], line = 's', ax = ax3)\n", 507 | "plt.title(\"LMC FO\")\n", 508 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 509 | "ax4 = fig.add_subplot(224)\n", 510 | "sm.qqplot(cep[-i_lmc & -i_fu]['resid0'], line = 's', ax = ax4)\n", 511 | "plt.title(\"SMC FO\")\n", 512 | "plt.ylim(mn - 0.05*(mx-mn), mx + 0.05*(mx-mn))\n", 513 | "\n", 514 | "plt.show()" 515 | ] 516 | }, 517 | { 518 | "cell_type": "markdown", 519 | "metadata": {}, 520 | "source": [ 521 | "There are a few possible explanations.\n", 522 | "\n", 523 | "1. The Magellanic Clouds are extended in the line of sight. It is possible that we see an effect of the slightly different distances of stars towards the foreground and of those towards background.\n", 524 | "2. The literature suggests that the P-L relationship can contain colour (V-I) terms, and can have dependence on metallicity. \n", 525 | "3. There are also suggestions of either a break in the P-L relationships (at log(P) = 1 for FU and at at log(P) = 0.5 for FO) or the inclusion of a quadratic term. \n", 526 | "4. 
Unidentified effect or naturally non-normally distributed errors in period and the Wesenheit index.\n", 527 | "\n", 528 | "First we check the first point up there. Create a map of the stars on the sky (plot of RA0 and Decl0), coloured according to the sign of the residuals; if there is an effect of distance, then negative residuals and positive residuals will be differently grouped, and hinting at the geometry of the Cloud. Do this separately for the four fits." 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "collapsed": true 536 | }, 537 | "outputs": [], 538 | "source": [ 539 | "i_posresid = (cep['resid0'] > 0)\n", 540 | " \n", 541 | "fig = plt.figure(1)\n", 542 | "\n", 543 | "fig.add_subplot(221)\n", 544 | "plt.plot(cep[-i_lmc & i_fu & i_posresid]['RA0'], cep[-i_lmc & i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 545 | "plt.plot(cep[-i_lmc & i_fu & -i_posresid]['RA0'], cep[-i_lmc & i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 546 | "plt.title('SMC FU')\n", 547 | "\n", 548 | "fig.add_subplot(222)\n", 549 | "plt.plot(cep[i_lmc & i_fu & i_posresid]['RA0'], cep[i_lmc & i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 550 | "plt.plot(cep[i_lmc & i_fu & -i_posresid]['RA0'], cep[i_lmc & i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 551 | "plt.title('LMC FU')\n", 552 | "\n", 553 | "fig.add_subplot(223)\n", 554 | "plt.plot(cep[-i_lmc & -i_fu & i_posresid]['RA0'], cep[-i_lmc & -i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 555 | "plt.plot(cep[-i_lmc & -i_fu & -i_posresid]['RA0'], cep[-i_lmc & -i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 556 | "plt.title('SMC FU')\n", 557 | "\n", 558 | "fig.add_subplot(224)\n", 559 | "plt.plot(cep[i_lmc & -i_fu & i_posresid]['RA0'], cep[i_lmc & -i_fu & i_posresid]['Decl0'], 'bo', alpha=0.2)\n", 560 | "plt.plot(cep[i_lmc & -i_fu & -i_posresid]['RA0'], cep[i_lmc & -i_fu & -i_posresid]['Decl0'], 'yo', alpha=0.2)\n", 561 | "plt.title('LMC FU')\n", 562 | 
"\n", 563 | "plt.show()" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "5.2 RESIDUALS AGAINST FITTED VALUES AND COVARIATE\n", 571 | "\n", 572 | "After concluding on this point, we can do some further checks on the distribution. Statisticians usually check whether the variance of the response (or the residuals) depends on the fitted value. For example, if our response variable should be considered to be a Poisson variable, then its variance would be equal to the mean, which is varying with the covariate(s). Thus, in such a case, plotting the residuals against the fitted values, we would see a band narrow at small fitted values, and widening with increasing fitted values. For a homoscedastic normal distribution, we would find a band of constant width. Other patterns can hint to other distributions. \n", 573 | "Plot the residuals versus the fitted value for each of the four fits. What do you think? Take into account the local number of the data: with more data within some fitted value bin, you see more of the extremes of the local distribution than with fewer data." 
574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": { 580 | "collapsed": false 581 | }, 582 | "outputs": [], 583 | "source": [ 584 | "fig = plt.figure(1)\n", 585 | "\n", 586 | "fig.add_subplot(221)\n", 587 | "plt.plot(cep[-i_lmc & i_fu]['fitted0'], cep[-i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 588 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['fitted0'].min(), xmax = cep[-i_lmc & i_fu]['fitted0'].max(), lw = 2)\n", 589 | "plt.title('SMC FU')\n", 590 | "\n", 591 | "fig.add_subplot(222)\n", 592 | "plt.plot(cep[i_lmc & i_fu]['fitted0'], cep[i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 593 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['fitted0'].min(), xmax = cep[-i_lmc & i_fu]['fitted0'].max(), lw = 2)\n", 594 | "plt.title('LMC FU')\n", 595 | "\n", 596 | "fig.add_subplot(223)\n", 597 | "plt.plot(cep[-i_lmc & -i_fu]['fitted0'], cep[-i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 598 | "plt.hlines(0, xmin = cep[-i_lmc & -i_fu]['fitted0'].min(), xmax = cep[-i_lmc & -i_fu]['fitted0'].max(), lw = 2)\n", 599 | "plt.title('SMC FO')\n", 600 | "\n", 601 | "fig.add_subplot(224)\n", 602 | "plt.plot(cep[i_lmc & -i_fu]['fitted0'], cep[i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 603 | "plt.hlines(0, xmin = cep[i_lmc & -i_fu]['fitted0'].min(), xmax = cep[i_lmc & -i_fu]['fitted0'].max(), lw = 2)\n", 604 | "plt.title('LMC FO')\n", 605 | "\n", 606 | "plt.show()" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "Another useful plot (which is generally used) is the plot of residuals against covariates. We can see the intervals of lack of fits, the bias, the necessity of more terms or a nonparametric model. Create this plot. Do you see a strong indication of quadratic terms or breaks in the model?" 
614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": { 620 | "collapsed": true 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "fig = plt.figure(1)\n", 625 | "\n", 626 | "fig.add_subplot(221)\n", 627 | "plt.plot(cep[-i_lmc & i_fu]['logP1'], cep[-i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 628 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['logP1'].min(), xmax = cep[-i_lmc & i_fu]['logP1'].max(), lw = 2)\n", 629 | "plt.title('SMC FU')\n", 630 | "\n", 631 | "fig.add_subplot(222)\n", 632 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 633 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['logP1'].min(), xmax = cep[-i_lmc & i_fu]['logP1'].max(), lw = 2)\n", 634 | "plt.title('LMC FU')\n", 635 | "\n", 636 | "fig.add_subplot(223)\n", 637 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 638 | "plt.hlines(0, xmin = cep[-i_lmc & -i_fu]['logP1'].min(), xmax = cep[-i_lmc & -i_fu]['logP1'].max(), lw = 2)\n", 639 | "plt.title('SMC FO')\n", 640 | "\n", 641 | "fig.add_subplot(224)\n", 642 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 643 | "plt.hlines(0, xmin = cep[i_lmc & -i_fu]['logP1'].min(), xmax = cep[i_lmc & -i_fu]['logP1'].max(), lw = 2)\n", 644 | "plt.title('LMC FO')\n", 645 | "\n", 646 | "plt.show()" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "5.3 MODEL COMPARISON: IS V-I NECESSARY TO INCLUDE?\n", 654 | "\n", 655 | "Several authors propose the inclusion of a linear V-I term to the P-L relationship in its form using magnitudes of the stars. As we use the Wesenheit index, this is equivalent to allow for a correction term to the used coefficient 1.55 (recall that W = I - 1.55(V - I)). Do we need this term? First visualize." 
656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": { 662 | "collapsed": true 663 | }, 664 | "outputs": [], 665 | "source": [ 666 | "fig = plt.figure(1)\n", 667 | "\n", 668 | "fig.add_subplot(221)\n", 669 | "plt.plot(cep[-i_lmc & i_fu]['VI'], cep[-i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 670 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['VI'].min(), xmax = cep[-i_lmc & i_fu]['VI'].max(), lw = 2)\n", 671 | "plt.title('SMC FU')\n", 672 | "\n", 673 | "fig.add_subplot(222)\n", 674 | "plt.plot(cep[i_lmc & i_fu]['VI'], cep[i_lmc & i_fu]['resid0'], 'g*', alpha=0.2)\n", 675 | "plt.hlines(0, xmin = cep[-i_lmc & i_fu]['VI'].min(), xmax = cep[-i_lmc & i_fu]['VI'].max(), lw = 2)\n", 676 | "plt.title('LMC FU')\n", 677 | "\n", 678 | "fig.add_subplot(223)\n", 679 | "plt.plot(cep[-i_lmc & -i_fu]['VI'], cep[-i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 680 | "plt.hlines(0, xmin = cep[-i_lmc & -i_fu]['VI'].min(), xmax = cep[-i_lmc & -i_fu]['VI'].max(), lw = 2)\n", 681 | "plt.title('SMC FO')\n", 682 | "\n", 683 | "fig.add_subplot(224)\n", 684 | "plt.plot(cep[i_lmc & -i_fu]['VI'], cep[i_lmc & -i_fu]['resid0'], 'g*', alpha=0.2)\n", 685 | "plt.hlines(0, xmin = cep[i_lmc & -i_fu]['VI'].min(), xmax = cep[i_lmc & -i_fu]['VI'].max(), lw = 2)\n", 686 | "plt.title('LMC FO')\n", 687 | "\n", 688 | "plt.show()" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "Just to check on the literature, to see a frequent and sometimes unnoticed problem of linear models, and to use the model comparison techniques, we fit models with both logP1 and V - I. However, we should be careful. It cannot be excluded that the two explanatory variables directly depend on each other (actually, this can even be expected, since the Cepheids have a very constrained pulsation and stellar structure model). 
If such a relationship holds between two covariates in a linear model, then mathematically, the model can become ill-determined, and strongly unstable against small changes in the data. This is because in such a case, a change in the coefficient of one of these covariates can be compensated by a corresponding change in the coefficient of the other. So first use a scatterplot to see the logP1-VI relationship in the four Cepheid groups. What do you conclude?" 696 | ] 697 | }, 698 | { 699 | "cell_type": "code", 700 | "execution_count": null, 701 | "metadata": { 702 | "collapsed": true 703 | }, 704 | "outputs": [], 705 | "source": [ 706 | "fig = plt.figure(1)\n", 707 | "\n", 708 | "fig.add_subplot(221)\n", 709 | "plt.plot(cep[-i_lmc & i_fu]['logP1'], cep[-i_lmc & i_fu]['VI'], 'g*', alpha=0.2)\n", 710 | "plt.title('SMC FU')\n", 711 | "\n", 712 | "fig.add_subplot(222)\n", 713 | "plt.plot(cep[i_lmc & i_fu]['logP1'], cep[i_lmc & i_fu]['VI'], 'g*', alpha=0.2)\n", 714 | "plt.title('LMC FU')\n", 715 | "\n", 716 | "fig.add_subplot(223)\n", 717 | "plt.plot(cep[-i_lmc & -i_fu]['logP1'], cep[-i_lmc & -i_fu]['VI'], 'g*', alpha=0.2)\n", 718 | "plt.title('SMC FO')\n", 719 | "\n", 720 | "fig.add_subplot(224)\n", 721 | "plt.plot(cep[i_lmc & -i_fu]['logP1'], cep[i_lmc & -i_fu]['VI'], 'g*', alpha=0.2)\n", 722 | "plt.title('LMC FO')\n", 723 | "\n", 724 | "plt.show()" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "This problem is called the collinearity problem. A solution is to orthogonalize the variables; we perform this by regressing VI on logP1, and extracting the residuals of this model. The residuals now, by virtue of some statistical magic, are now uncorrelated with logP1, and can be used in a two-variate period-luminosity-color relationship without the risk of ending up with a singular model.\n", 732 | "After fitting, check up on the significance table of the model parameters. 
" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": { 739 | "collapsed": true 740 | }, 741 | "outputs": [], 742 | "source": [ 743 | "vi_lmfit_lmc_fu = smf.ols(formula = 'VI ~ logP1', data = cep, subset = i_lmc & i_fu).fit()\n", 744 | "vi_lmfit_lmc_fo = smf.ols(formula = 'VI ~ logP1', data = cep, subset = i_lmc & -i_fu).fit()\n", 745 | "\n", 746 | "vi_lmfit_smc_fu = smf.ols(formula = 'VI ~ logP1', data = cep, subset = -i_lmc & i_fu).fit()\n", 747 | "vi_lmfit_smc_fo = smf.ols(formula = 'VI ~ logP1', data = cep, subset = -i_lmc & -i_fu).fit()" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": { 754 | "collapsed": false 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "print vi_lmfit_smc_fo.summary()" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "Next, add the new column 'resid_vi' as an additional variable to the models. Compare the different model comparison measures: the likelihood, the AIC and the BIC to those of models without resid_vi. As well, repeat the former plot, now superposing the new residuals in a new (transparent) colour. What do you see? Would you accept the necessity of including such a term into your models? Consider different aspects of the problem: improvement on the model as goodness-of-fit measures summarize, behaviour of the residuals, behaviour of the outliers, size of the effect, errors on the coefficients in the two models. 
" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": { 772 | "collapsed": true 773 | }, 774 | "outputs": [], 775 | "source": [ 776 | "cep['resid_vi'] = np.zeros(cep.shape[0])\n", 777 | "cep.loc[(i_lmc & i_fu),'resid_vi'] = vi_lmfit_lmc_fu.resid\n", 778 | "cep.loc[(-i_lmc & i_fu),'resid_vi'] = vi_lmfit_smc_fu.resid\n", 779 | "cep.loc[(i_lmc & -i_fu),'resid_vi'] = vi_lmfit_lmc_fo.resid\n", 780 | "cep.loc[(-i_lmc & -i_fu),'resid_vi'] = vi_lmfit_smc_fo.resid" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": { 787 | "collapsed": true 788 | }, 789 | "outputs": [], 790 | "source": [ 791 | "cep[0:10]" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": { 798 | "collapsed": true 799 | }, 800 | "outputs": [], 801 | "source": [ 802 | "lmfit_lmc_fu2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = i_lmc & i_fu).fit()\n", 803 | "lmfit_lmc_fo2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = i_lmc & -i_fu).fit()\n", 804 | "\n", 805 | "lmfit_smc_fu2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = -i_lmc & i_fu).fit()\n", 806 | "lmfit_smc_fo2 = smf.ols(formula = 'W ~ logP1 + resid_vi', data = cep, subset = -i_lmc & -i_fu).fit()" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": { 813 | "collapsed": false 814 | }, 815 | "outputs": [], 816 | "source": [ 817 | "print lmfit_lmc_fu2.summary()" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "metadata": { 824 | "collapsed": false 825 | }, 826 | "outputs": [], 827 | "source": [ 828 | "print lmfit_lmc_fo.bic\n", 829 | "print lmfit_lmc_fo2.bic" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "Finally, for seeing how collinearity affects the model results, we fit models also using the original V-I, 
correlated with log(P). " 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": { 843 | "collapsed": true 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "lmfit_lmc_fu1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = i_lmc & i_fu).fit()\n", 848 | "lmfit_lmc_fo1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = i_lmc & -i_fu).fit()\n", 849 | "\n", 850 | "lmfit_smc_fu1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = -i_lmc & i_fu).fit()\n", 851 | "lmfit_smc_fo1 = smf.ols(formula = 'W ~ logP1 + VI', data = cep, subset = -i_lmc & -i_fu).fit()" 852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": { 858 | "collapsed": false 859 | }, 860 | "outputs": [], 861 | "source": [ 862 | "print lmfit_lmc_fu1.summary()\n", 863 | "print lmfit_lmc_fu2.summary()" 864 | ] 865 | }, 866 | { 867 | "cell_type": "markdown", 868 | "metadata": {}, 869 | "source": [ 870 | "Comments:\n", 871 | "\n", 872 | "1. The fitted coefficient of VI and resid_vi in the models are the same, as well as their errors. There is no change in the log-likelihood, AIC and BIC either: model goodness is the same.\n", 873 | "\n", 874 | "2. Both the intersect and the coeff of log(P) have changed. We don't know a priori, which is the true value (if exists at all), but the error on both is smaller in the orthogonal models than in the collinear models.\n", 875 | "\n", 876 | "3. Computing the correlation matrix of the parameters, we find no correlation of resid_vi with the other parameters in the W ~ (logP1, resid_vi) model, versus a high correlation of VI with them in the W ~ (logP1, VI) model. This is a manifestation of the fact that in a collinear model, coefficients can take over (at least partly) the role of each other, and thus, their value is more sensitive to particular outliers and structures in the data than in an orthogonal model. \n", 877 | "\n", 878 | "4. 
That a correction using V-I causes some improvement on the outliers and on the distributional discrepancy, can also be seen on the values for skewness and kurtosis (for a normal distribution, these should be 0 and 3 respectively), even though the improvement is indeed tiny, and we did not get much closer to satisfy the distributional assumption to have valid asymptotics of the least squares models (using V-I isn't any help of course with the distance effect, and there might be other effects as well, for instance metallicity). The Jarque-Bera test shows the same.\n", 879 | "\n", 880 | "5. The condition number (ratio of the largest and the smallest eigenvalues of the X^T X matrix, where X is the \"design matrix\", here basically the data matrix with a first column of ones) is smaller in the orthogonalized model than in the collinear one. As a rule of thumb, condition number higher than 30 indicates dangerously strong collinearity. We here do not reach this limit even with the model W ~ (logP1, VI), but still, we can see the improvement on the condition number from this model to the W ~ (logP1, resid_vi) one.\n", 881 | "\n", 882 | "6. See the log-likelihood surfaces, shown on the slides.\n", 883 | "\n", 884 | "7. Nevertheless, the discrepancy from normally distributed errors is so big that all estimated errors, all asymptotics are useless! \n", 885 | "\n" 886 | ] 887 | }, 888 | { 889 | "cell_type": "markdown", 890 | "metadata": {}, 891 | "source": [ 892 | "In case there is still time, or someone finished the exercise very early, here are some more propositions to work on.\n", 893 | "\n", 894 | "Problem 1: Monte Carlo methods for errors on the estimates.\n", 895 | "\n", 896 | "The literature errors given in Soszynski et al 2008 are . \n", 897 | "Do a subsampling: resample n0 stars from the n stars, and estimate again the PL relation (you can work with either the univariate or the bivariate model). Repeat subsampling and estimation R times. 
Is the distribution of the estimates normal, as the asymptotic theory says about the maximum likelihood estimator? Take the IQR or the MAD as est. of standard deviation, deflate it with the numeric factor sqrt(n0/n): is this consistent with what is given in the literature?\n", 898 | "\n", 899 | "Problem 2: Use heavy-tailed distributions for estimation.\n", 900 | "\n", 901 | "Assuming a (rescaled) t for the error distribution, can you find a plausible value for nu? (Use the qq-plots and the fact that with the right nu, the rescaled t-variates still would show a straight line pattern.) Compute and plot the likelihood. Find an estimate of the parameters and their errors within are the errors now more realistic? \n" 902 | ] 903 | } 904 | ], 905 | "metadata": { 906 | "celltoolbar": "Raw Cell Format", 907 | "kernelspec": { 908 | "display_name": "Python 2", 909 | "language": "python", 910 | "name": "python2" 911 | }, 912 | "language_info": { 913 | "codemirror_mode": { 914 | "name": "ipython", 915 | "version": 2 916 | }, 917 | "file_extension": ".py", 918 | "mimetype": "text/x-python", 919 | "name": "python", 920 | "nbconvert_exporter": "python", 921 | "pygments_lexer": "ipython2", 922 | "version": "2.7.11" 923 | } 924 | }, 925 | "nbformat": 4, 926 | "nbformat_minor": 0 927 | } 928 | --------------------------------------------------------------------------------