├── README.md ├── U01 L01 - What is Statistics.ipynb ├── U02 - A Few Convergence Simulations.ipynb ├── U02 L03 - Parametric Statistical Models.ipynb ├── U02 L04 - Parametric Estimation and Confidence Intervals.ipynb ├── U02 L05 - Delta Method and Confidence Intervals.ipynb ├── U02 L06&07 - Hypothesis Testing, Type I & II Errors, Levels and P-values.ipynb ├── U03 L08 - Distance Measures Between Distributions.ipynb ├── U03 L09 - Introduction to Maximum Likelihood Estimation.ipynb ├── U03 L10 - Consistency of MLE, Covariance Matrices, and Multivariate Stats.ipynb └── U03 L11 - Fisher Info, Asym Normality of MLE, Method of Moments.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Notes for 18.6501x - Fundamentals of Statistics 2 | 3 | I'm going to be uploading all of my notes here in the form of jupyter notebooks. Don't worry if you don't have python or jupyter installed - github will render the pages for you! 4 | 5 | Hope this helps us all understand things a little better 6 | 7 | -Trace 8 | 9 | edit 8/24/2022: I apologize that my notes only cover half of the class, but I'm happy so many people have found them useful! 
10 | -------------------------------------------------------------------------------- /U02 - A Few Convergence Simulations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 40, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import seaborn as sns\n", 12 | "import random\n", 13 | "import warnings\n", 14 | "import pandas as pd\n", 15 | "warnings.filterwarnings('ignore')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 58, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# n is used for realized sample size and dist_n is the n of simulated RV's\n", 25 | "\n", 26 | "p = 0.65\n", 27 | "n = 100000\n", 28 | "dist_n = 200\n", 29 | "bsamp = np.array([1 if random.random() <= p else 0 for i in range(n)])\n", 30 | "brvs = np.array([[1 if random.random() <= p else 0 for i in range(n)] for j in range(dist_n)])" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 73, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "image/png": "\n", 41 | "text/plain": [ 42 | "
" 43 | ] 44 | }, 45 | "metadata": { 46 | "needs_background": "light" 47 | }, 48 | "output_type": "display_data" 49 | }, 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "-------------------------------------------------\n", 55 | "This is a rough estimate/simulation of RnBar, the actual random variable\n", 56 | "and mean estimator of a Bernoulli(p=0.65) at n=200 samples (before data)\n", 57 | "-------------------------------------------------\n", 58 | "RnBar is roughly distributed N(p, p(1-p)/n)\n", 59 | "(p * (1-p) / n ~= 0.0011)\n", 60 | "-------------------------------------------------\n", 61 | " R RnBar\n", 62 | "mean 0.6500 0.649950\n", 63 | "variance 0.2275 0.001141\n", 64 | "\n", 65 | "-----------------------------------\n", 66 | "RnBar converges in probability to p\n", 67 | "-----------------------------------\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# Convergence of RnBar, the sample mean of a Bernoulli(p = 0.65)\n", 73 | "\n", 74 | "rnbar_actual = np.mean(bsamp)\n", 75 | "rnbar_dist = np.mean(brvs, axis=0)\n", 76 | "\n", 77 | "plt.title('Simulated Distribution of RV RnBar @n=200', fontsize=14)\n", 78 | "sns.distplot(rnbar_dist, bins=20)\n", 79 | "plt.show()\n", 80 | "\n", 81 | "print('-------------------------------------------------')\n", 82 | "print('RnBar converges in distribution to N(0, p(1-p))')\n", 83 | "print(f'(p * (1-p) / n ~= {round(p * (1 - p) / 200, 4)})')\n", 84 | "print('-------------------------------------------------')\n", 85 | "table = pd.DataFrame({'R': [p, p * (1 - p)], 'RnBar': [np.mean(rnbar_dist), np.var(rnbar_dist)]})\n", 86 | "table.rename(index={0:'mean', 1:'variance'}, inplace=True)\n", 87 | "print(table)\n", 88 | "\n", 89 | "print('\\n-----------------------------------')\n", 90 | "print('RnBar converges in probability to p')\n", 91 | "print('-----------------------------------')\n", 92 | "print(f'estimated p: {rnbar_actual}')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | 
"execution_count": 74, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "image/png": "\n", 103 | "text/plain": [ 104 | "
" 105 | ] 106 | }, 107 | "metadata": { 108 | "needs_background": "light" 109 | }, 110 | "output_type": "display_data" 111 | }, 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "-------------------------------------------------\n", 117 | "sqrt(n) * (RnBar - p) converges in distribution to N(0, p(1-p))\n", 118 | "-------------------------------------------------\n", 119 | " R RnBar\n", 120 | "mean 0.6500 -0.000711\n", 121 | "variance 0.2275 0.228179\n", 122 | "0.2281787449875001\n", 123 | "0.8664640788861249\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "# Convergence of sqrt(n) (RnBar - p)\n", 129 | "\n", 130 | "rv = np.sqrt(dist_n) * (rnbar_dist - p)\n", 131 | "actual = np.sqrt(n) * (rnbar_actual - p)\n", 132 | "\n", 133 | "plt.title('Simulated Distribution of RV sqrt(n) * (RnBar - p) @n=200', fontsize=14)\n", 134 | "sns.distplot(rv, bins=20)\n", 135 | "plt.show()\n", 136 | "\n", 137 | "print('-------------------------------------------------')\n", 138 | "print('sqrt(n) * (RnBar - p) converges in distribution to N(0, p(1-p))')\n", 139 | "print('-------------------------------------------------')\n", 140 | "table = pd.DataFrame({'R': [p, p * (1 - p)], 'RnBar': [np.mean(rv), np.var(rv)]})\n", 141 | "table.rename(index={0:'mean', 1:'variance'}, inplace=True)\n", 142 | "print(table)\n" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.0" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /U02 L04 - Parametric Estimation and 
Confidence Intervals.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fundamentals of Statistics Unit 2 Lecture 4 Notes " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 15, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import random\n", 18 | "import pandas as pd\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import seaborn as sns\n", 21 | "import warnings\n", 22 | "warnings.filterwarnings('ignore')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Table of Contents\n", 30 | "- Parameter Estimation Definitions\n", 31 | "- Bias of an Estimator\n", 32 | "- Jensen's Inequality\n", 33 | "- Variance of an Estimator\n", 34 | "- Quadratic Risk\n", 35 | "- Confidence Intervals\n", 36 | "\n", 37 | " Click here to render this page with nbviewer.jupyter.org and use bookmarks. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Parameter Estimation Definitions \n", 45 | "\n", 46 | "\n", 47 | "### Statistic \n", 48 | "\n", 49 | "Any measurable function of the sample, e.g. $\\bar{X}_n$, $max_i X_i, X_1 + log(1 + |X_n|)$, $s^2$ (sample variance), ...\n", 50 | "\n", 51 | "\"If you can compute it once I give you data, it is measurable.\"\n", 52 | "\n", 53 | "### Estimator of $\\theta$\n", 54 | "\n", 55 | "Any statistic whose expression does not depend on $\\theta$.\n", 56 | "\n", 57 | "### An estimator $\\hat{\\theta}_n$ of $\\theta$ is weakly (resp. strongly) consistent if\n", 58 | "\n", 59 | "$\\hat{\\theta}_n \\xrightarrow[n \\to \\infty]{\\mathbb{P}(resp. a.s.)} \\theta$ $(w.r.t. \\mathbb{P})$\n", 60 | "\n", 61 | "- Note: resp. 
here shows the relationships weakly is to $\\mathbb{P}$ as strongly is to a.s.\n", 62 | "\n", 63 | "### An estimator $\\hat{\\theta}_n$ of $\\theta$ is asymptotically normal if\n", 64 | "\n", 65 | "$\\sqrt{n}(\\hat{\\theta}_n - \\theta) \\xrightarrow[n \\to \\infty]{(d)} \\mathcal{N}(0, \\sigma^2)$\n", 66 | "\n", 67 | "- The quantity $\\sigma^2$ is then called asymptotic variance of $\\hat{\\theta}_n$.\n", 68 | "\n", 69 | "asymptotic variance as defined here is distinct from, and necessarily follows from:\n", 70 | "\n", 71 | "$var(\\hat{\\theta}_n)\\xrightarrow[n \\to \\infty]{} 0$\n", 72 | "\n", 73 | "- graphic example in 'Unit 1 Notes', 'Law of Large Numbers / Convergence'" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "# Bias of an Estimator \n", 81 | "\n", 82 | "Bias of an estimator $\\hat{\\theta}_n$ of $\\theta$:\n", 83 | " \n", 84 | "$bias(\\hat{\\theta}_n) = E[\\hat{\\theta}_n] - \\theta$\n", 85 | "\n", 86 | "- if an estimator has bias 0, it's unbiased\n", 87 | "\n", 88 | "Examples ($X_1, ..., X_n \\overset{iid}{\\sim} Ber(p)$):\n", 89 | "\n", 90 | "- $\\hat{p}_n = \\bar{X}_n$: $bias(\\hat{p}_n) = 0$\n", 91 | "- $\\hat{p}_n = X_1$: $bias(\\hat{p}_n) = 0$\n", 92 | "- $\\hat{p}_n = \\frac{X_1 + X_2}{2}$: $bias(\\hat{p}_n) = 0$\n", 93 | "- $\\hat{p}_n = \\sqrt{\\mathcal{I}(X_1 = 1, X_2 = 1)} = \\sqrt{Ber(p^2)} = Ber(p^2)$: $bias(\\hat{p}_n) = p^2 - p$\n", 94 | "\n", 95 | "Unbiased $\\neq$ best! The words are related in english, but the ideas are only loosely related here. 
The definition is exactly as stated.\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 31, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Estimating theta = 0.5\n", 108 | "----------------------\n", 109 | "Estimator [sample mean] with bias 0 : 0.6\n", 110 | "Estimator [X1] with bias 0 : 1\n", 111 | "Estimator [(X1 + X2) / 2] with bias 0 : 1.0\n", 112 | "Estimator [sqrt(X1 == 1 & X2 == 1)] with bias -0.25 : 1.0\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "p = 0.5\n", 118 | "n = 10\n", 119 | "# generates the sum of n bernoullis w.p. p\n", 120 | "Xsum = np.random.binomial(n, p)\n", 121 | "\n", 122 | "est1 = Xsum / n\n", 123 | "# if Xsum = 6, est1 = 0.6, X1 == 1 and X2 == 1 each with probability 0.6\n", 124 | "X1, X2 = 1 if random.random() <= est1 else 0, 1 if random.random() <= est1 else 0\n", 125 | "est2 = X1\n", 126 | "est3 = (X1 + X2) / 2\n", 127 | "est4 = np.sqrt(X1 == 1 and X2 == 1)\n", 128 | "\n", 129 | "print('Estimating theta = 0.5')\n", 130 | "print('----------------------')\n", 131 | "print(f'Estimator [sample mean] with bias 0 : {round(est1, 2)}')\n", 132 | "print(f'Estimator [X1] with bias 0 : {round(est2, 2)}')\n", 133 | "print(f'Estimator [(X1 + X2) / 2] with bias 0 : {round(est3, 2)}')\n", 134 | "print(f'Estimator [sqrt(X1 == 1 & X2 == 1)] with bias {p**2 - p} : {round(est4, 2)}')" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "# Jensen's inequality \n", 142 | "\n", 143 | "A function $g: \\mathbb{R} \\to \\mathbb{R}$ is convex if for all pairs\n", 144 | "\n", 145 | "$g(tx_1 + (1 - t)x_2) \\leq tg(x_1) + (1 - t)g(x_2)$ for all $0 \\leq t \\leq 1$\n", 146 | "\n", 147 | "Geometrically, this means for all $x_1 \\leq x \\leq x_2$, the secant line connecting $(x_1, g(x_1))$ and $(x_2, g(x_2))$ is above, or dominates the graph of g on the domain between the x values." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 88, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "image/png": "\n", 158 | "text/plain": [ 159 | "
" 160 | ] 161 | }, 162 | "metadata": { 163 | "needs_background": "light" 164 | }, 165 | "output_type": "display_data" 166 | }, 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "To understand how this idea affects an rv X ~ Ber(p)...\n", 172 | " The secant line connects (E[X],g(E[X])) and ((1-E[X]), g(1-E[X])).\n", 173 | "The blue dot is (E[X], g(E[X]))\n", 174 | "The orange dot is (E[X], E[g(X)]) -- E[g(X)] is x2*g(x2) + x1*g(x1)\n", 175 | "The x coordinate of the orange dot is arbitrary, but if it were moved over to the right,\n", 176 | " you can see it corresponds to a y-value on the secant line.\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "def g(x):\n", 182 | " return x**2\n", 183 | "x = np.linspace(0, 1, 50)\n", 184 | "gvect = np.vectorize(g)\n", 185 | "y = gvect(x)\n", 186 | "\n", 187 | "plt.figure(figsize=(10,4))\n", 188 | "plt.subplot(1, 2, 1)\n", 189 | "plt.title('convex function f(x) = x^2 and secant lines', fontsize=14)\n", 190 | "plt.plot(x, y)\n", 191 | "plt.plot([0.15, 0.85], gvect([0.15, 0.85]))\n", 192 | "plt.plot([0.1, 0.9], gvect([0.1, 0.9]))\n", 193 | "plt.plot([0.35, 0.65], gvect([0.35, 0.65]))\n", 194 | "\n", 195 | "plt.subplot(1, 2, 2)\n", 196 | "plt.title('Jensen & Bernoulli (read below)')\n", 197 | "plt.plot(x, y)\n", 198 | "plt.plot([0.15, 0.85], gvect([0.15, 0.85]))\n", 199 | "plt.plot(0.15, g(0.15), 'o', c='b')\n", 200 | "plt.plot(0.15, 0.15 * g(0.15) + 0.85 * g(0.85), 'o', c='orange')\n", 201 | "plt.show()\n", 202 | "\n", 203 | "print('To understand how this idea affects an rv X ~ Ber(p)...\\n The secant line connects (E[X],g(E[X])) and ((1-E[X]), g(1-E[X])).')\n", 204 | "print('The blue dot is (E[X], g(E[X]))')\n", 205 | "print('The orange dot is (E[X], E[g(X)]) -- E[g(X)] is x2*g(x2) + x1*g(x1)')\n", 206 | "print('The x coordinate of the orange dot is arbitrary, but if it were moved over to the right,\\n you can see it corresponds to a y-value on the secant line.')\n" 207 | ] 208 | }, 209 | { 
210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "If $x_1 = 0$ and $x_2 = 1$, the inequality can be interpreted as follows. Let $X \\sim Ber(1 - t)$, $0 \\leq t \\leq 1$ (so $X = x_1 = 0$ with probability $t$ and $X = x_2 = 1$ with probability $1 - t$), then the left and right hand sides of the inequality can be rewritten respectively as:\n", 214 | "\n", 215 | "- $g(t(0) + (1 - t)(1)) = g(1 - t) = g(E[X])$\n", 216 | "\n", 217 | "- $tg(x_1) + (1 - t)g(x_2) = E[g(X)]$\n", 218 | "\n", 219 | "Which implies\n", 220 | "\n", 221 | "- $g(E[X]) \\leq E[g(X)]$ for any Bernoulli and convex g\n", 222 | "\n", 223 | "#### This is true for all random variables, and the reverse can be said when g is concave.\n", 224 | "\n", 225 | "- Visual/Memory aid: On the graph above the x values of the secant lines were chosen to correspond to possible Bernoulli $p$ and $1 - p$. $g(E[X])$ is always somewhere on the graph of $g$, and $E[g(X)]$ is always somewhere on the secant line dominating g." 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "# Variance of an Estimator \n", 233 | "\n", 234 | "- $var(X) = E[X^2] - (E[X])^2$\n", 235 | "\n", 236 | "Estimators are R.V.s, so their variance can be calculated.\n", 237 | "\n", 238 | "Example: remember that $var(aX) = a^2 var(X)$, so...\n", 239 | "\n", 240 | "- $var(\\bar{X}_n) = var(\\frac{X_1 + ... + X_n}{n}) = \\frac{1}{n^2} * var(X_1 + ...
+ X_n) = \\frac{n var(X)}{n^2} = \\frac{var(X)}{n}$\n", 241 | "\n", 242 | "Other examples $(X \\sim Ber(p))$:\n", 243 | "- $var(X_1) = p(1-p)$\n", 244 | "- $var(\\bar{X}_n) = \\frac{p(1-p)}{n}$\n", 245 | "- $var(\\frac{X_1 - X_2}{2}) = \\frac{p(1-p)}{2}$\n", 246 | "- $var(\\sqrt{X_1 = 1, X_2 = 1}) = var(Ber(p^2)) = p^2(1-p^2)$" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "# Quadratic Risk \n", 254 | "\n", 255 | "$R(\\hat{\\theta}_n) = E[(\\hat{\\theta}_n - \\theta)^2]$\n", 256 | "\n", 257 | "is known as quadratic risk in classical statistics and mean squared error in bayesian statistics.\n", 258 | "\n", 259 | "- low quadratic risk means that both bias and variance are small.\n", 260 | "\n", 261 | "Take the formula, add and subtract $E[\\hat{\\theta}_n]$ inside the expectation, since the two terms cancel out and it remains the same, then expand the square.\n", 262 | "\n", 263 | "$E[(\\hat{\\theta}_n - E[\\hat{\\theta}_n] + E[\\hat{\\theta}_n] - \\theta)^2]$\n", 264 | "\n", 265 | "$= E[(\\hat{\\theta}_n - E[\\hat{\\theta}_n])^2] \\quad + \\quad E[(E[\\hat{\\theta}_n] - \\theta)^2] \\quad + \\quad 2E[(\\hat{\\theta}_n - E[\\hat{\\theta}_n])(E[\\hat{\\theta}_n] - \\theta)]$\n", 266 | "\n", 267 | "... 
where\n", 268 | "\n", 269 | "$E[(\\hat{\\theta}_n - E[\\hat{\\theta}_n])^2]$ is the variance of $\\hat{\\theta}_n$\n", 270 | "\n", 271 | "$E[(E[\\hat{\\theta}_n] - \\theta)^2]$ is the $bias^2$ of $\\hat{\\theta}_n$\n", 272 | "\n", 273 | "and\n", 274 | "\n", 275 | "$E[\\hat{\\theta}_n - E[\\hat{\\theta}_n]] = 0$ and $E[\\hat{\\theta}_n] - \\theta$ is a constant, so $2E[(\\hat{\\theta}_n - E[\\hat{\\theta}_n])(E[\\hat{\\theta}_n] - \\theta)] = 2(E[\\hat{\\theta}_n] - \\theta)E[\\hat{\\theta}_n - E[\\hat{\\theta}_n]] = 0$\n", 276 | "\n", 277 | "so, quadratic risk = $variance$ + $bias^2$" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "# Confidence Intervals \n", 285 | "\n", 286 | "Let $(E, (\\mathbb{P}_\\theta)_{\\theta \\in \\Theta})$ be a statistical model based on observations $X_1, ..., X_n$, and assume $\\Theta \\subseteq \\mathbb{R}$. Let $\\alpha \\in (0, 1)$.\n", 287 | "\n", 288 | "- Confidence Interval of level $1 - \\alpha$ for $\\theta$: Any random (depending on $X_1, ..., X_n$) interval $\\mathcal{I}$ whose boundaries do not depend on $\\theta$ and such that:\n", 289 | "\n", 290 | "$\\mathbb{P}[\\mathcal{I} \\ni \\theta] \\geq 1 - \\alpha, \\quad \\forall \\theta \\in \\Theta$\n", 291 | "\n", 292 | "- Confidence Interval of asymptotic level $1 - \\alpha$ for $\\theta$: Any random interval $\\mathcal{I}$ whose boundaries do not depend on $\\theta$ and such that:\n", 293 | " \n", 294 | "$\\lim_{n \\to \\infty} \\mathbb{P}[\\mathcal{I} \\ni \\theta] \\geq 1 - \\alpha, \\quad \\forall \\theta \\in \\Theta$\n", 295 | "\n", 296 | "- Note: $\\mathcal{I} \\ni \\theta$ is the same as saying $\\theta \\in \\mathcal{I}$, but the reversal is used here to emphasize the randomness of $\\mathcal{I}$.
Rather than 'theta in the CI', it's 'CI contains theta'.\n", 297 | "\n", 298 | "- for a code sim / graphic representation of CI and confidence level, showing how alpha is at first set, then manifests through iterations of estimation, see 'Unit 2 Lecture 3 Notes', 'Trinity of Statistical Inference'.\n", 299 | "\n", 300 | "\n", 301 | "### CI for the Kiss Example\n", 302 | "\n", 303 | "$R_1, ..., R_n \\overset{iid}{\\sim} Ber(p)$, for some unknown $p \\in (0, 1)$\n", 304 | "\n", 305 | "- Statistical model: $(\\{0, 1\\}, (Ber(p))_{p \\in (0, 1)})$\n", 306 | "- Estimator for p: $\\hat{p} = \\bar{R}_n$\n", 307 | "\n", 308 | "$\\sqrt{n} \\frac{\\bar{R}_n - p}{\\sqrt{p(1-p)}} \\xrightarrow[n \\to \\infty]{(d)} \\mathbb{Z} = \\mathcal{N}(0, 1)$\n", 309 | "\n", 310 | "- $\\Phi_n(x) \\approx \\Phi(x)$ when $n$ becomes large. So, for all $x > 0$:\n", 311 | "\n", 312 | "$\\mathbb{P} [|\\bar{R}_n - p| \\geq x] \\approx 2 (1 - \\Phi(\\frac{x\\sqrt{n}}{\\sqrt{p(1-p)}}))$\n", 313 | "\n", 314 | "and, with probability approximately $1 - \\alpha$ for large $n$,\n", 315 | "\n", 316 | "$\\bar{R}_n \\in [p - \\frac{q_{\\alpha/2}\\sqrt{p(1-p)}}{\\sqrt{n}}, p + \\frac{q_{\\alpha/2}\\sqrt{p(1-p)}}{\\sqrt{n}}]$\n", 317 | "\n", 318 | "\n", 319 | "### Working the formula for x\n", 320 | "\n", 321 | "$2 (1 - \\Phi(\\frac{x\\sqrt{n}}{\\sqrt{p(1-p)}})) = \\alpha$\n", 322 | "\n", 323 | "$\\frac{x\\sqrt{n}}{\\sqrt{p(1-p)}} = \\Phi^{-1}(1 - \\frac{\\alpha}{2}) = q_{\\alpha / 2}$\n", 324 | "\n", 325 | "- $x = \\frac{q_{\\alpha / 2}\\sqrt{p(1-p)}}{\\sqrt{n}}$\n", 326 | "\n", 327 | "So, \n", 328 | "\n", 329 | "- $\\lim_{n \\to \\infty} \\mathbb{P}([\\bar{R}_n - x, \\bar{R}_n + x] \\ni p) = 1 - \\alpha$\n", 330 | "\n", 331 | "#### ...which is not yet a confidence interval because this $x$ depends on the unknown $p$.\n", 332 | "\n", 333 | "### Conservative bound\n", 334 | "\n", 335 | "- $p(1-p) \\leq \\frac{1}{4}$, given $p(1-p)$ is largest when $p = 0.5$\n", 336 | "- $\\sqrt{1/4} = 1/2$\n", 337 | "\n", 338 | "giving the asymptotic confidence interval\n", 339 |
"\n", 340 | "$\\mathcal{I}_{conserv} = [\\bar{R}_n - \\frac{q_{\\alpha/2}}{2\\sqrt{n}}, \\bar{R}_n + \\frac{q_{\\alpha/2}}{2\\sqrt{n}}]$\n", 341 | "\n", 342 | "because as $n \\to \\infty$, the probability p is in the conservative confidence interval is at least $1 - \\alpha$" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [] 386 | } 387 | ], 388 | "metadata": { 389 | "kernelspec": { 390 | "display_name": "Python 3", 391 | "language": "python", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.7.0" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 2 409 | } 410 | -------------------------------------------------------------------------------- /U02 L06&07 - Hypothesis Testing, Type I & II Errors, Levels and P-values.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 2 Lectures 6 and 7 
- Hypothesis Testing, Type I & II Error, Levels, and P-values" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Hypothesis Testing\n", 15 | "\n", 16 | "Asking binary questions of data in the form of the null hypothesis and the alternative hypothesis, which [suggest no effect or reinforce the status quo], and [suggest an effect or reject the status quo], respectively. " 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Modeling Assumptions\n", 24 | "\n", 25 | "Assumptions simplify the model to a family/families of distributions...\n", 26 | "\n", 27 | "$X_1, ..., X_n$ i.i.d. Poisson($\\lambda$)\n", 28 | "\n", 29 | "... in disjoint hypothesis spaces. \n", 30 | "\n", 31 | "$H_0: \\lambda \\in \\Theta_0$\n", 32 | "\n", 33 | "$H_A: \\lambda \\in \\Theta_1$\n", 34 | "\n", 35 | "$\\Theta_0 \\cap \\Theta_1 = \\emptyset$\n", 36 | "\n", 37 | "We can impose assumptions based on known status quo or by logical induction. Assumptions can also be made if we only care about a certain kind of result, for example making our hypothesis one-sided if we only care about $\\lambda$ being larger than any status quo $\\lambda_0$." 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### One-Sided Hypothesis\n", 45 | "\n", 46 | "Rejection interval is to the left or right of the parameter of interest under the null hypothesis, $H_0$. e.g.:\n", 47 | "\n", 48 | "$H_0: \\theta \\leq x$\n", 49 | "\n", 50 | "$H_A: \\theta > x$" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "### Two-Sided Hypothesis\n", 58 | "\n", 59 | "Rejection interval is a union of intervals to the left and right of the parameter of interest under the null hypothesis, $H_0$. 
e.g.: \n", 60 | "\n", 61 | "$H_0: \\theta = x$\n", 62 | "\n", 63 | "$H_A: \\theta \\neq x$" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Treatment and Control\n", 71 | "\n", 72 | "If the status quo for our specific null hypothesis isn't well-established, we need a two-sample test so we can first establish a baseline - a measurement of random noise unrelated to the effect we expect in our alternate hypothesis. This group of samples is called the control group, and they establish our null hypothesis, $H_0$. The samples that are exposed to treatment, then, are those that have expected outcome in the alternate hypothesis, $H_A$. The treatment effect is the difference between the treatment and control group. But, rather than estimating the effect here, we're only interested in answering the binary question is there a significant difference? And, what is the $level$ of this significance?" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## The Test Statistic\n", 80 | "\n", 81 | "The test statistic seeks to answer the question if the null hypothesis is true, how likely is our parameter estimate? Where does our estimate fall on the standardized distribution of a $\\theta_0 \\in \\Theta_0$?\n", 82 | "\n", 83 | "If our hypothesis is one-sided and our null is fixed with some parameter $\\theta_0$ under $H_0$, and standard deviation $\\sigma_0$, and if $H_0$ is true,\n", 84 | "\n", 85 | "### $T = \\sqrt{n} \\frac{\\hat{\\theta}_n - \\theta_0}{\\sigma_{0}} \\xrightarrow[n \\to \\infty]{(d)} \\mathcal{N}(0, 1)$\n", 86 | "\n", 87 | "... the realization of this statistic is a realization from a standard gaussian.\n", 88 | "\n", 89 | "### $T = \\sqrt{n} |\\frac{\\hat{\\theta}_n - \\theta_0}{\\sigma_{0}}|$ \n", 90 | "\n", 91 | "... 
is then our two-sided test-statistic, since it gets the absolute standardized distance of $\\hat{\\theta}_n$ from $\\theta_0$.\n", 92 | "\n", 93 | "Question of note: what if $\\theta_0$ isn't fixed? If $\\sigma_{0}$ is estimated by $\\hat{\\theta}_n$, what is $\\sigma_0$ in relation to $\\hat{\\theta}_n$ asymptotically (as $n \\to \\infty$)? " 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## The Test\n", 101 | "\n", 102 | "$\\psi$ is any indicator function designed to output [1] or [0], signifying [reject $H_0$] or [fail to reject $H_0$], with input $\\{X_1, ..., X_n\\}$, the set of our sample. You can see it as converting your test statistic (how likely is this set a set of realizations if $H_0$ is true?) into an answer to the question is the distance from $\\theta_0$ large enough to reject it? Our test can be anything, but to make it a good one, we need to be thoughtful." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Which $\\theta_0 \\in \\Theta_0$?\n", 110 | "\n", 111 | "### Type I Error, $\\alpha_{\\psi}$\n", 112 | "\n", 113 | "The probability of a false positive, or:\n", 114 | "\n", 115 | "$\\mathbb{P}(\\psi = 1$ and $\\theta \\in \\Theta_0)$\n", 116 | "\n", 117 | "### Type II Error, $\\beta_{\\psi}$\n", 118 | "\n", 119 | "The probability of a false negative, or:\n", 120 | "\n", 121 | "$\\mathbb{P}(\\psi = 0$ and $\\theta \\in \\Theta_1)$\n", 122 | "\n", 123 | "### Choosing $\\theta_0$ based on error\n", 124 | "\n", 125 | "Consider\n", 126 | "\n", 127 | "$H_0: \\theta \\leq x$\n", 128 | "\n", 129 | "$H_A: \\theta > x$\n", 130 | "\n", 131 | "As $\\theta_0$ moves closer to $x$ from the left, its distribution moves with it, which increases $\\alpha$ since realizations overlap with realizations of distributions beyond $x$ - those with $\\theta_1 \\in \\Theta_1$ that might give $\\psi = 1$. 
\n", 132 | "\n", 133 | "At $\\theta_0 = x$, $\\alpha_{\\psi}$ is at its maximum. But, more importantly, so is the...\n", 134 | "\n", 135 | "### power\n", 136 | "\n", 137 | "$\\pi_{\\psi} = \\inf_{\\theta \\in \\Theta_1}(1 - \\beta_{\\psi}(\\theta))$\n", 138 | "\n", 139 | "... which is the probability of rejecting a false $H_0$. It's probably easier to estimate values closer to $\\theta_0 = x$, for all $\\theta_1 \\in \\Theta_1$. This makes $x$ the ideal $\\theta_0$\n", 140 | "\n", 141 | "Note: \n", 142 | "\n", 143 | "$inf$ $(0, 2] = 0$\n", 144 | "\n", 145 | "$min$ $(0, 2]$ does not exist (0 is not in the set)\n", 146 | "\n", 147 | "... which allows us to include a $\\theta_0$ in our power calculation as if it were a $\\theta_1$\n", 148 | "\n", 149 | "## Level $\\alpha$\n", 150 | "\n", 151 | "To reduce our type I error $\\alpha_{\\psi}$, we need to put a buffer around $x$. How big? $\\alpha_{\\psi} = 0$ if $\\psi = 1$ is unattainable. All we have to do is move the rejection goalpost so far away that our estimator $\\hat{\\theta}_n$ will never match it. \n", 152 | "\n", 153 | "Our test has level $\\alpha$ if:\n", 154 | "\n", 155 | "- $\\mathbb{P}(\\psi = 1$ and $\\theta \\in \\Theta_0) \\leq \\alpha$\n", 156 | "\n", 157 | "or, more compact:\n", 158 | "\n", 159 | "- $\\alpha_{\\psi} \\leq \\alpha$\n", 160 | "\n", 161 | "So, your level $\\alpha$ is an upper bound on the Type I error you're allowing for the test to reject $H_0$.\n", 162 | "\n", 163 | "For our one-sided test from before, \n", 164 | "\n", 165 | "$\\psi$ has level $\\alpha$ if $\\psi = 1$ only when $T \\geq q_{\\alpha}$ for all $\\theta_0 \\in \\Theta_0$\n", 166 | "\n", 167 | "but, remember we already decided $\\theta_0 = x$, so we don't have to worry about all of $\\Theta_0$, which simplifies the test to\n", 168 | "\n", 169 | "$\\psi_{\\alpha} = 1$ when $T_{\\theta_0=x} \\geq q_{\\alpha}$, $\\psi = 0$ otherwise\n", 170 | "\n", 171 | "... which means we can find a $c$ such that we reject exactly when $\\hat{\\theta}_n \\geq c$.
\n", 172 | "\n", 173 | "- $f(\\hat{\\theta}_n) = T \\to f^{-1}(q_{\\alpha}) = c$\n", 174 | "\n", 175 | "and\n", 176 | "\n", 177 | "- $\\sqrt{n} \\frac{\\hat{\\theta}_n - \\theta_0}{\\sigma_{0}} = T_{\\theta_0 = x} \\geq \\sqrt{n} \\frac{c - \\theta_0}{\\sigma_{0}} = q_{\\alpha}$\n", 178 | "\n", 179 | "Note: the quantile changes if the test is two-sided. Also, mind the inequality and sign changes when $H_A: \\theta \\leq x$" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## P-value\n", 187 | "\n", 188 | "This is the probability of a realization of a standard gaussian being at least as far from the mean (0) as $T$. It quantifies our confidence in rejecting the null - our minimum level $\\alpha$ for T - the smaller the better. \n", 189 | "\n", 190 | "Continuing with the aforementioned one-sided hypothesis, we get our p-value from the simple statistic\n", 191 | "\n", 192 | "- $1 - \\Phi(T)$" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "# Visualizations" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 1, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "import numpy as np\n", 209 | "import matplotlib.pyplot as plt\n", 210 | "import seaborn as sns\n", 211 | "import warnings\n", 212 | "from scipy.stats import norm\n", 213 | "warnings.filterwarnings('ignore')\n", 214 | "plt.style.use('bmh')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 2, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "def std_normal(x):\n", 224 | " return 1/np.sqrt(2 * np.pi) * np.exp(-(x**2/2))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 3, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# unknown theta = 0.57 (p of Bernoulli)\n", 234 | "p = 0.57\n", 235 | "\n", 236 | "# get the sample and sample mean\n", 237 | "n = 200\n", 238 | "sample = 
np.array([1 if np.random.rand() <= p else 0 for i in range(n)])\n", 239 | "theta_hat = np.mean(sample)\n", 240 | "\n", 241 | "# null hypothesis: theta = 0.5; alternate: theta != 0.5\n", 242 | "theta_0 = 0.5\n", 243 | "\n", 244 | "# get the test statistic\n", 245 | "tstat = abs(np.sqrt(n) * ((theta_hat - theta_0) / np.sqrt(theta_0 * (1 - theta_0))))\n", 246 | "\n", 247 | "# the assumption is that the test statistic goes to a standard normal, so we impose the normal distribution rather than\n", 248 | "# estimating it. In fact, since the unkown parameter is 0.57, not 0.5, the distribution is asymptotically divergent. As\n", 249 | "# n gets larger, so does sqrt(n), which amplifies the 0.07 difference. This is precisely the effect that gives us greater\n", 250 | "# ability to reject with larger sample size.\n", 251 | "domain = np.linspace(-4, 4, 200)\n", 252 | "snorm = std_normal(domain)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 4, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "image/png": "\n", 263 | "text/plain": [ 264 | "
" 265 | ] 266 | }, 267 | "metadata": { 268 | "needs_background": "light" 269 | }, 270 | "output_type": "display_data" 271 | } 272 | ], 273 | "source": [ 274 | "sns.set_palette('husl')\n", 275 | "pal = sns.color_palette()\n", 276 | "\n", 277 | "plt.title('Test Statistic on the Standard Normal')\n", 278 | "ax = sns.lineplot(domain, snorm)\n", 279 | "plt.axvline(tstat, c=pal[0], label='test stat')\n", 280 | "plt.axvline(-tstat, c=pal[0])\n", 281 | "plt.axvline(1.96, c=pal[4], label='level 0.05')\n", 282 | "plt.axvline(-1.96, c=pal[4])\n", 283 | "ax.fill_between(domain, snorm, color=pal[3])\n", 284 | "plt.legend(facecolor='white')\n", 285 | "plt.show()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 5, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "This test has level 0.05, so we only reject if T >= q(0.025)\n", 298 | "T = 1.7, q(0.025) = 1.96, so the test fails to reject at level 0.05!\n", 299 | "\n", 300 | "We can do the same test by inverting the test statistic function to change the quantile into some constant c, \n", 301 | "then compare that to theta hat\n", 302 | "theta hat = 0.56, (+) c = 0.569, so the test fails to reject at level 0.05!\n" 303 | ] 304 | }, 305 | { 306 | "data": { 307 | "image/png": "\n", 308 | "text/plain": [ 309 | "
" 310 | ] 311 | }, 312 | "metadata": { 313 | "needs_background": "light" 314 | }, 315 | "output_type": "display_data" 316 | } 317 | ], 318 | "source": [ 319 | "print('This test has level 0.05, so we only reject if T >= q(0.025)') \n", 320 | "print(f'T = {round(tstat,2)}, q(0.025) = 1.96, so the test {\"rejects\" if tstat > 1.96 else \"fails to reject\"} at level 0.05!')\n", 321 | "\n", 322 | "print('\\nWe can do the same test by inverting the test statistic function to change the quantile into some constant c, \\nthen compare that to theta hat')\n", 323 | "theta_0_plus_buffer = theta_0 + ((1.96 * np.sqrt(theta_0 * (1 - theta_0))) / np.sqrt(n))\n", 324 | "print(f'theta hat = {round(theta_hat,3)}, (+) c = {round(theta_0_plus_buffer, 3)}, so the test {\"rejects\" if theta_hat > theta_0_plus_buffer else \"fails to reject\"} at level 0.05!')\n", 325 | "\n", 326 | "x = np.linspace(0.4, 0.6, 100)\n", 327 | "plt.title('Different Enough? On the line of Bernoulli p')\n", 328 | "ax = sns.lineplot(x, [0 for i in x])\n", 329 | "plt.ylim(0, 1)\n", 330 | "plt.axvline(theta_0, c=pal[1], label='theta 0')\n", 331 | "plt.axvline(theta_hat, c=pal[2], label='theta hat')\n", 332 | "plt.axvline(theta_0_plus_buffer, c=pal[3], label='t0 + lvl 0.05 buffer')\n", 333 | "plt.axvline(1 - theta_0_plus_buffer, c=pal[3])\n", 334 | "ax.fill_between([1 - theta_0_plus_buffer, theta_0_plus_buffer], y1=[1, 1], color=pal[3], alpha=0.3)\n", 335 | "plt.legend()\n", 336 | "plt.show()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 6, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "To get the p-value, which gives us our minimum level alpha for T, we calculate P(Z > T)\n", 349 | "Or, in this case, the two-sided hypothesis version P(|Z| > T)\n" 350 | ] 351 | }, 352 | { 353 | "data": { 354 | "image/png": "\n", 355 | "text/plain": [ 356 | "
" 357 | ] 358 | }, 359 | "metadata": { 360 | "needs_background": "light" 361 | }, 362 | "output_type": "display_data" 363 | } 364 | ], 365 | "source": [ 366 | "print('To get the p-value, which gives us our minimum level alpha for T, we calculate P(Z > T)')\n", 367 | "print('Or, in this case, the two-sided hypothesis version P(|Z| > T)')\n", 368 | "\n", 369 | "plt.title('P-Value = minimum level alpha for T')\n", 370 | "ax = sns.lineplot(domain, snorm)\n", 371 | "mask_r, mask_l = domain > tstat, domain < -tstat\n", 372 | "x_r, y_r, x_l, y_l = domain[mask_r], snorm[mask_r], domain[mask_l], snorm[mask_l]\n", 373 | "ax.fill_between(domain, snorm, color=pal[3])\n", 374 | "ax.fill_between(x_r, y1=y_r, color=pal[0])\n", 375 | "ax.fill_between(x_l, y1=y_l, color=pal[0])\n", 376 | "plt.show()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 7, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "p-value = 0.09, which can be thought of as...\n", 389 | "- the minimum level alpha at which we can reject the null\n", 390 | "- the probability of Type I error for our specific test & test statistic.\n", 391 | "\n", 392 | " the last one isn't totally true, but it's close enough and can help you remember\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "pval = round(2 * (1 - norm.cdf(tstat)), 3)\n", 398 | "print(f'p-value = {pval}, which can be thought of as...')\n", 399 | "print(f'- the minimum level alpha at which we can reject the null')\n", 400 | "print(f'- the probability of Type I error for our specific test & test statistic.')\n", 401 | "print('\\n the last one isn\\'t totally true, but it\\'s close enough and can help you remember')" 402 | ] 403 | } 404 | ], 405 | "metadata": { 406 | "kernelspec": { 407 | "display_name": "Python 3", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 
| "version": 3 415 | }, 416 | "file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.7.0" 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /U03 L08 - Distance Measures Between Distributions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 3 Lecture 8 - Distance Measures Between Distributions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Total Variation (TV) Distance\n", 15 | "\n", 16 | "Gives us a notion of distance between some distributions.\n", 17 | "\n", 18 | "- $TV$ is symmetric, positive, definite, and it satisfies the triangle inequality.\n", 19 | "\n", 20 | "For two probability distributions $\\mathbb{P}_{\\theta}$, $\\mathbb{P}_{\\theta^{\\prime}}$ over the union of their supports $E$, $\\mathbb{P}_{\\theta}$ and $\\mathbb{P}_{\\theta^{\\prime}}$ are close by measure of TV if\n", 21 | "\n", 22 | "$|\\mathbb{P}_{\\theta}(A) - \\mathbb{P}_{\\theta^{\\prime}}(A)|$ is small $\\forall A \\subset E$\n", 23 | "\n", 24 | "So,\n", 25 | "\n", 26 | "- $TV(\\mathbb{P}_{\\theta}, \\mathbb{P}_{\\theta^{\\prime}}) = max_{A \\subset E} |\\mathbb{P}_{\\theta}(A) - \\mathbb{P}_{\\theta^{\\prime}}(A)|$\n", 27 | "\n", 28 | "with formulas\n", 29 | "\n", 30 | "- $TV(\\mathbb{P}_{\\theta}, \\mathbb{P}_{\\theta^{\\prime}}) = \\frac{1}{2} \\sum_{x \\in E} |p_{\\theta}(x) - p_{\\theta^{\\prime}}(x)|$ \n", 31 | "- $TV(\\mathbb{P}_{\\theta}, \\mathbb{P}_{\\theta^{\\prime}}) = \\frac{1}{2} \\int_{x \\in E} |f_{\\theta}(x) - f_{\\theta^{\\prime}}(x)| dx$ \n", 32 | "\n", 33 | "outputs a value $\\in [0, 1]$\n", 34 | "\n", 35 | "$TV$ doesn't provide a notion of 
distance between most distributions, since $TV($continuous, discrete$)$, $TV($continuous distributions with no overlap$)$, and discrete cases like $X \\sim Ber(p)$ and $X + \\epsilon$, $\\epsilon \\notin \\{-1, 0, 1\\}$ all have $TV=1$.\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Kullback-Leibler (KL) Divergence\n", 43 | "\n", 44 | "Gives us a notion of divergence of one distribution from another. Also known as 'relative entropy.'\n", 45 | "\n", 46 | "- $KL$ is not symmetric and does not satisfy the triangle inequality, but it is nonnegative and definite ($KL = 0$ only when the two distributions are equal).\n", 47 | "\n", 48 | "For a fixed, unknown $\\theta^*$, KL-Divergence has formulas\n", 49 | "\n", 50 | "- $KL(\\mathbb{P}_{\\theta^*}, \\mathbb{P}_{\\theta}) = \\sum_{x \\in E} p_{\\theta^*}(x) \\cdot ln(\\frac{p_{\\theta^*}(x)}{p_{\\theta}(x)})$\n", 51 | "\n", 52 | "- $KL(\\mathbb{P}_{\\theta^*}, \\mathbb{P}_{\\theta}) = \\int_{x \\in E} f_{\\theta^*}(x) \\cdot ln(\\frac{f_{\\theta^*}(x)}{f_{\\theta}(x)}) dx$\n", 53 | "\n", 54 | "$= \\mathbb{E}_{\\theta^*} [ln(\\frac{p_{\\theta^*}(X)}{p_{\\theta}(X)})]$\n", 55 | "\n", 56 | "$= \\mathbb{E}_{\\theta^*}[ln(p_{\\theta^*}(X))] - \\mathbb{E}_{\\theta^*}[ln(p_{\\theta}(X))]$\n", 57 | "\n", 58 | "The first term in this difference is a constant, so minimizing this difference means minimizing\n", 59 | "\n", 60 | "- $\\hat{KL}(\\mathbb{P}_{\\theta^*}, \\mathbb{P}_{\\theta}) = c - \\frac{1}{n} \\sum ln(p_{\\theta}(X_i))$\n", 61 | "\n", 62 | "given $\\frac{1}{n} \\sum h(X_i) \\xrightarrow[]{P} \\mathbb{E}_{\\theta^*}[h(X)]$ for all $\\theta$.\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Maximum Likelihood Principle\n", 70 | "\n", 71 | "here $\\to$ means 'if $\\theta$ satisfies the previous, it equivalently satisfies this'\n", 72 | "\n", 73 | "$min_{\\theta \\in \\Theta} \\hat{KL}(\\mathbb{P}_{\\theta^*}, \\mathbb{P}_{\\theta})$\n", 74 | "\n", 75 | "$\\to min_{\\theta \\in \\Theta} - 
\\frac{1}{n} \\sum ln(p_{\\theta}(X_i))$\n", 76 | "\n", 77 | "$\\to max_{\\theta \\in \\Theta} \\frac{1}{n} \\sum ln(p_{\\theta}(X_i))$\n", 78 | "\n", 79 | "$\\to max_{\\theta \\in \\Theta} ln(\\prod p_{\\theta}(X_i))$\n", 80 | "\n", 81 | "$\\to max_{\\theta \\in \\Theta} \\prod p_{\\theta}(X_i)$\n", 82 | "\n", 83 | "This is the maximum of the likelihood function $L_n(\\theta | X_1, ..., X_n)$." 84 | ] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.7.0" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /U03 L09 - Introduction to Maximum Likelihood Estimation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 3 Lecture 9 - Introduction to Maximum Likelihood Estimation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Method of Moments\n", 15 | "\n", 16 | "$E[X^k]$ is the $k$th moment of an R.V. $X$. \n", 17 | "\n", 18 | "### $\\frac{1}{n} \\sum_{i=1}^n X_i^k$ \n", 19 | "\n", 20 | "is the estimator for the $k$th moment. A combination of method of moments estimators can be used to estimate a parameter, such as in $\\sigma^2 = E[X^2] - (E[X])^2$. 
\n", 21 | "\n", 22 | "Provides consistent, but often biased estimators.\n", 23 | "\n", 24 | "## Maximum Likelihood Estimation\n", 25 | "\n", 26 | "- $L_n := L(\\theta | X_1, ..., X_n) = \\prod_{i=1}^n f(X_i | \\theta)$ \n", 27 | "\n", 28 | "takes a sample $\\{X_1,...,X_n\\}$ and a parameter $\\theta$ and gives the likelihood the sample was created from a distribution with PDF/PMF $f$. Geometrically, maximum likelihood estimation finds the maximum, or peak, of the likelihood function.\n", 29 | "\n", 30 | "- Requires iid.\n", 31 | "\n", 32 | "- $\\hat{\\theta}_{MLE}$ is the $\\theta$ that maximizes $L_n$ over all $\\theta \\in \\Theta$.\n", 33 | "\n", 34 | "Examples:\n", 35 | "\n", 36 | "- Bernoulli: $p^{\\sum x_i}(1-p)^{n - \\sum x_i}$\n", 37 | "- Poisson: $\\dfrac{\\lambda^{\\sum x_i}}{\\prod x_i!}e^{-n \\lambda}$\n", 38 | "- Gaussian: $\\dfrac{1}{(\\sigma \\sqrt{2 \\pi})^n} exp(- \\dfrac{1}{2 \\sigma^2} \\sum (x_i - \\mu)^2)$\n", 39 | "- Exponential: $\\lambda^n exp(- \\lambda \\sum x_i)$\n", 40 | "\n", 41 | "### Estimator w/ Indicator \n", 42 | "\n", 43 | "An MLE estimator might have an indicator attached to it, such as with\n", 44 | "\n", 45 | "$\\lambda^n exp(- \\lambda \\sum x_i) \\mathbb{1}(x_i > 0) = \\lambda^n exp(- \\lambda \\sum x_i) \\mathbb{1}(min(x_i) > 0)$\n", 46 | "\n", 47 | "However, since in this class the model is always well-specified, $x_i$ samples will always be $> 0$, making the indicator unnecessary. Only indicators that depend on the parameter are relevant.\n", 48 | "\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Concave, Convex\n", 56 | "\n", 57 | "If the second derivative of $L_n$ is negative (resp. positive) over all of $\\theta$, the function is strictly concave (resp. strictly convex). If $L_n$ is strictly concave and the argmax is finite, setting $\\frac{d}{d\\theta}[L_n] = 0$ finds the maximum. 
Then, solving for $\\theta$ gives $\\hat{\\theta}_{MLE}$.\n", 58 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | "
concave$g \\prime \\prime (x) \\leq 0$
strictly concave$g \\prime \\prime (x) < 0$
convex$g \\prime \\prime (x) \\geq 0$
strictly convex$g \\prime \\prime (x) > 0$
" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "### Log-likelihood\n", 84 | "\n", 85 | "Maximizing $ln(L_n)$ will also maximize $L_n$, since $ln$ is a monotonic function.\n", 86 | "\n", 87 | "If setting $\\frac{d}{d\\theta}[L_n] = 0$ finds that maximum (as it will for a strictly concave function, resp. min for convex), so does setting $\\frac{d}{d\\theta}[ln(L_n)] = 0$. And, since taking the derivative of a product is harder than taking the derivative of a sum, in this case using $ln$ is the preferred method of finding $\\hat{\\theta}_{MLE}$. Thinking geometrically might also lead to an intuitive answer." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### Gradient\n", 95 | "\n", 96 | "The gradient of a function is the multi-variable version of a derivative, or vector of partial derivatives. Geometrically, you can think of it as being the partial contribution of each variable to the direction of the function, just as the derivative does with one variable.\n", 97 | "\n", 98 | "$f(\\theta) = c_1 \\theta_1^2 + c_2 \\theta_2^3 + c_3 \\theta_3^2$\n", 99 | "\n", 100 | "$\\nabla f = \\begin{pmatrix}\n", 101 | "\\frac{df}{d\\theta_1} \\\\\n", 102 | "\\frac{df}{d\\theta_2} \\\\\n", 103 | "\\frac{df}{d\\theta_3}\n", 104 | "\\end{pmatrix}$\n", 105 | "\n", 106 | "$= \\begin{pmatrix}\n", 107 | "2 c_1 \\theta_1 \\\\\n", 108 | "3 c_2 \\theta_2^2 \\\\\n", 109 | "2 c_3 \\theta_3\n", 110 | "\\end{pmatrix}$\n", 111 | "\n", 112 | "If a function is strictly concave and all elements of the gradient are zero at a point, that point represents the maximum of the function. The same goes for being strictly convex and finding the minimum. \n", 113 | "\n", 114 | "Functions are sometimes more complicated. The most popular algorithm to search for the global minimum error of a prediction function is called 'gradient descent'. 
The process is like rolling a ball downhill, looking for the deepest pit on a hilly surface, with a blindfold on.\n", 115 | "\n", 116 | "" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Hessian Matrix\n", 124 | "\n", 125 | "The hessian is a square matrix of second-order partial derivatives of a function. Where the gradient describes local contribution to direction, the hessian describes local curvature.\n", 126 | "\n", 127 | "$H_{i, j} = \\dfrac{d^2 f}{d \\theta_i d \\theta_j}$\n", 128 | "\n", 129 | "$\\nabla f = \\begin{pmatrix}\n", 130 | "2 c_1 \\theta_1 \\\\\n", 131 | "3 c_2 \\theta_2^2 \\\\\n", 132 | "2 c_3 \\theta_3\n", 133 | "\\end{pmatrix}$\n", 134 | "\n", 135 | "$H_f = \\begin{bmatrix}\n", 136 | "2 c_1 & 0 & 0 \\\\\n", 137 | "0 & 6 c_2 \\theta_2 & 0 \\\\\n", 138 | "0 & 0 & 2 c_3\n", 139 | "\\end{bmatrix}$\n", 140 | "\n", 141 | "#### Positive & Negative // Definite & Semi-Definite // Concave & Convex\n", 142 | "\n", 143 | "A $d$ x $d$ matrix $H$ is positive semi-definite (resp. negative semi-definite) if...\n", 144 | "\n", 145 | "$x^THx \\geq (resp. \\leq)$ $0$ $\\forall x \\in \\mathbb{R}^d$\n", 146 | "\n", 147 | "... where $x$ isn't a vector of all zeroes. Another way to think of it is that if you added any of the elements together in any combination, the result must be positive (resp. 
negative) or zero.\n", 148 | "\n", 149 | "- $\\leq 0$ $\\to$ negative semi-definite $\\to$ concave\n", 150 | "- $ < 0$ $\\to$ negative definite $\\to$ strictly concave \n", 151 | "- $\\geq 0$ $\\to$ positive semi-definite $\\to$ convex\n", 152 | "- $ > 0$ $\\to$ positive definite $\\to$ strictly convex \n", 153 | "\n", 154 | "If a matrix is definite, it is also semi-definite by definition.\n", 155 | "\n", 156 | "#### Determinant and Trace\n", 157 | "\n", 158 | "Another method of finding convexity/concavity if the matrix is 2x2 and symmetric:\n", 159 | "\n", 160 | "$H = \\begin{bmatrix}\n", 161 | "a & b \\\\\n", 162 | "c & d\n", 163 | "\\end{bmatrix}$\n", 164 | "\n", 165 | "- $det(H) = ad - bc$\n", 166 | "- $tr(H) = a + d$\n", 167 | "\n", 168 | "... where the trace is generally the sum of elements on the main diagonal.\n", 169 | "\n", 170 | "- $tr(H) \\leq 0$\n", 171 | "- $det(H) \\geq 0$\n", 172 | "\n", 173 | "would mean a matrix is negative semi-definite. \n", 174 | "\n", 175 | "I personally don't know if this works for larger matrices, though I know finding the determinant is a more complicated process without these conditions." 
176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.7.0" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | -------------------------------------------------------------------------------- /U03 L10 - Consistency of MLE, Covariance Matrices, and Multivariate Stats.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 3 Lecture 10 - Consistency of MLE, Covariance Matrices, and Multivariate Statistics" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import random\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "import seaborn as sns\n", 20 | "import warnings\n", 21 | "warnings.filterwarnings('ignore')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Setting the Log-Likelihood Derivative to 0 doesn't always give MLE\n", 29 | "\n", 30 | "Sometimes the maximum of a function isn't found with the derivative. In those cases, you have to employ other methods to find the MLE. There isn't necessarily one prescription for this process. 
Example:\n", 31 | "\n", 32 | "$X_1, \\cdots, X_n \\sim Unif[0, \\theta]$\n", 33 | "\n", 34 | "$L_n(\\theta | X_1, \\cdots, X_n) = \\prod_{i=1}^n f_X(X_i|\\theta) = \\dfrac{1}{\\theta^n} \\mathbb{1}(max(X_i) \\leq \\theta)$\n", 35 | "\n", 36 | "Because of the indicator, $L_n$ is zero for $\\theta < max(X_i)$ and strictly decreasing for $\\theta \\geq max(X_i)$, so it finds its argmax at $max(X_i)$, where the likelihood function isn't continuous, and the derivative isn't zero.\n", 37 | "\n", 38 | "- (Scripted Example 1)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Consistency of MLE\n", 46 | "\n", 47 | "Under mild regularity conditions (continuity in $\\theta$ of the pdf $p_{\\theta}$ almost everywhere),\n", 48 | "\n", 49 | "$\\hat{\\theta_n}^{MLE} \\xrightarrow[n \\to \\infty]{P} \\theta$\n", 50 | "\n", 51 | "which holds true in the multivariate case, meaning when $\\theta$ is a vector of parameters." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Random Vector\n", 59 | "\n", 60 | "$X = \\begin{pmatrix}\n", 61 | "X^{(1)} \\\\\n", 62 | "\\vdots \\\\\n", 63 | "X^{(d)}\n", 64 | "\\end{pmatrix}$\n", 65 | "\n", 66 | "is a $d$ x $1$ random vector, or a random vector $\\in \\mathbb{R}^d$, where each $X^{(k)}$ is an RV. \n", 67 | "\n", 68 | "- The pdf of $X$ is the joint pdf of the elements\n", 69 | "- The cdf of $X$ is $P(X^{(1)} \\leq x^{(1)}, \\cdots, X^{(d)} \\leq x^{(d)})$ (similar to cdf of max, but $x$ is a vector)\n", 70 | "- $X$ converges in probability when all $X^{(k)}$ individually converge in probability. 
If $X$ = $\\hat{\\theta}_n$, a vector of estimators, all $X^{(k)}$ must be consistent estimators for $\\hat{\\theta}_n$ to be consistent.\n", 71 | "\n", 72 | "e.g.\n", 73 | "\n", 74 | "$X \\sim \\mathcal{N}(\\mu, \\sigma^2)$\n", 75 | "\n", 76 | "$\\hat{\\theta}_n = \\begin{pmatrix}\n", 77 | "\\bar{X}_n \\\\\n", 78 | "\\dfrac{\\sum_n(X_i - \\bar{X}_n)^2}{n}\n", 79 | "\\end{pmatrix}$\n", 80 | "\n", 81 | "converges in probability to\n", 82 | "\n", 83 | "$\\theta = \\begin{pmatrix}\n", 84 | "\\mu \\\\\n", 85 | "\\sigma^2 \n", 86 | "\\end{pmatrix}$\n", 87 | "\n", 88 | "- A random vector has a gaussian distribution iff all linear combinations of its elements are of that distribution. This isn't always the case." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Covariance\n", 96 | "\n", 97 | "- $Cov(X, Y) = E[(X - E[X])(Y - E[Y])]$\n", 98 | "- $= E[(X)(Y - E[Y])]$\n", 99 | "- $= E[(X - E[X])(Y)]$\n", 100 | "- $= E[XY] - E[X]E[Y]$\n", 101 | "- $Cov(X, X) = Var(X)$" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Covariance & Independence\n", 109 | "\n", 110 | "- $X, Y$ are independent $\\Rightarrow Cov(X, Y) = 0$\n", 111 | "- $Cov(X, Y) = 0 \\nRightarrow X, Y$ are independent" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### Covariance Matrix\n", 119 | "\n", 120 | "Covariance matrix $\\Sigma$ is the total representation of covariance between all of the elements of a random vector. 
It has the variances on the main diagonal and covariances symmetrically above and below.\n", 121 | "\n", 122 | "- $\\Sigma_{i, j} = Cov(X^{(i)}, X^{(j)})$\n", 123 | "\n", 124 | "e.g.\n", 125 | "\n", 126 | "$ M = \\begin{pmatrix}\n", 127 | "X \\\\\n", 128 | "Y \\\\\n", 129 | "Z\n", 130 | "\\end{pmatrix}$\n", 131 | "\n", 132 | "$\\Sigma_M = E[(M - E[M])(M - E[M])^T]$\n", 133 | "\n", 134 | "$= E\\begin{pmatrix}\\begin{pmatrix}\n", 135 | "X - E[X] \\\\\n", 136 | "Y - E[Y] \\\\\n", 137 | "Z - E[Z] \n", 138 | "\\end{pmatrix}\n", 139 | "\\begin{pmatrix}\n", 140 | "X - E[X] & Y - E[Y] & Z - E[Z] \n", 141 | "\\end{pmatrix}\\end{pmatrix}$\n", 142 | "\n", 143 | "$= \\begin{pmatrix}\n", 144 | "Cov(X, X) & Cov(X, Y) & Cov(X, Z) \\\\\n", 145 | "Cov(Y, X) & Cov(Y, Y) & Cov(Y, Z) \\\\\n", 146 | "Cov(Z, X) & Cov(Z, Y) & Cov(Z, Z)\n", 147 | "\\end{pmatrix}$\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Affine Transformation of a Covariance Matrix\n", 155 | "\n", 156 | "Univariate case\n", 157 | "\n", 158 | "$Var(aX + b) = a^2 Var(X)$\n", 159 | "\n", 160 | "The multivariate case is similar, but matrix multiplication isn't commutative, so $A$ can't simply be squared and instead appears on both sides of $\\Sigma_X$...\n", 161 | "\n", 162 | "- $Var(AX + B) = A \\Sigma_X A^T$" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Multivariate CLT\n", 170 | "\n", 171 | "- $\\sqrt{n}(\\bar{X}_n - \\mu) \\to \\mathcal{N}(0, \\Sigma)$\n", 172 | "\n", 173 | "Where $\\bar{X}_n$ is the average of a set of iid copies $\\{X_1, \\cdots, X_n\\}$ of random vector $X$ with $E[X] = \\mu$ and $Var(X) = \\Sigma$, which, put another way, makes $\\bar{X}_n$ a vector of averages. 
Each $\\bar{X_n}^{(k)}$ is the average of the $k$th component across the $n$ copies, i.e. $\\bar{X_n}^{(k)} = \\frac{1}{n} \\sum_{i=1}^n X^{(k)}_i$.\n", 174 | "\n", 175 | "$X_k = \\begin{pmatrix}\n", 176 | "X^{(1)}_k \\\\\n", 177 | "\\vdots \\\\\n", 178 | "X^{(d)}_k\n", 179 | "\\end{pmatrix}$\n", 180 | "\n", 181 | "$\\bar{X}_n = \\begin{pmatrix}\n", 182 | "\\bar{X_n}^{(1)} \\\\\n", 183 | "\\vdots \\\\\n", 184 | "\\bar{X_n}^{(d)}\n", 185 | "\\end{pmatrix}$\n", 186 | "\n", 187 | "and \n", 188 | "\n", 189 | "- $\\sqrt{n} (\\Sigma^{-1/2}) (\\bar{X}_n - \\mu) \\to \\mathcal{N}(0, I_d)$\n", 190 | "\n", 191 | "Where $\\Sigma^{-1/2}$ is the inverse of the square root of $\\Sigma$, i.e. the matrix s.t. $\\Sigma^{-1/2} \\Sigma^{-1/2} = \\Sigma^{-1}$. \n", 192 | "\n", 193 | "This transformation corresponds to the single variable version\n", 194 | "\n", 195 | "$\\sqrt{n}(\\sigma^{-1})(\\bar{X}_n - \\mu) \\to \\mathcal{N}(0, 1)$" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Multivariate Delta Method\n", 203 | "\n", 204 | "Univariate with some RV $X$ with variance $\\sigma^2$ and parameter $\\theta$. If $\\hat{\\theta}_n$ is asymptotically normal (first line below), then the delta method gives the second line:\n", 205 | "\n", 206 | "$\\sqrt{n}(\\hat{\\theta}_n - \\theta) \\to \\mathcal{N}(0, \\sigma^2)$\n", 207 | "\n", 208 | "$\\sqrt{n}(g(\\hat{\\theta}_n) - g(\\theta)) \\to \\mathcal{N}(0, g\\prime(E[X])^2\\sigma^2)$\n", 209 | "\n", 210 | "So, multivariate\n", 211 | "\n", 212 | "- $\\sqrt{n}(\\hat{\\theta}_n - \\theta) \\to \\mathcal{N}(0, \\Sigma)$\n", 213 | "\n", 214 | "- $\\sqrt{n}(g(\\hat{\\theta}_n) - g(\\theta)) \\to \\mathcal{N}(0, \\nabla g(\\theta)^T\\Sigma \\nabla g(\\theta))$\n", 215 | "\n", 216 | "Where $\\nabla g$ is the vector of first derivatives of $g$\n", 217 | "\n", 218 | "- NOTE: for the homework, pay very close attention to the variability of the language for this sort of problem. Here, we're talking about $g(\\hat{\\theta})$, where $g$ makes the transformation. 
But, think about what it means for $\\hat{\\theta}$ to instead be the transforming function $g$ of some estimator. You would rewrite the above convergence formula to $g($estimator$)$... instead of $g(\\hat{\\theta})$. What are you then taking the derivative with respect to? Also very important, what are we converging to, and what does that make $X$ in the above formula?" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "## Scripted Examples" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "### 1. Can't Always Find MLE with Derivative" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 52, 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "image/png": "\n", 243 | "text/plain": [ 244 | "
" 245 | ] 246 | }, 247 | "metadata": { 248 | "needs_background": "light" 249 | }, 250 | "output_type": "display_data" 251 | }, 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "The red line shows the actual b=5. The peak of Ln will always be at an x value less than b\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "n = 50\n", 262 | "Xn = np.random.uniform(0, 5, n)\n", 263 | "Xmax = max(Xn)\n", 264 | "x = np.linspace(4, 6, 100)\n", 265 | "Ln = np.array([0 if i < Xmax else 1/i**n for i in x])\n", 266 | "\n", 267 | "plt.title('Likelihood Function for b of a Uniform(0,5)', fontsize=16)\n", 268 | "plt.plot([5, 5], [0, 1], c='red', alpha=0.2)\n", 269 | "plt.plot(x, Ln)\n", 270 | "plt.ylim((0, max(Ln) + 1/20 * max(Ln)))\n", 271 | "plt.show()\n", 272 | "\n", 273 | "print('The red line shows the actual b=5. The peak of Ln will always be at an x value less than b')" 274 | ] 275 | } 276 | ], 277 | "metadata": { 278 | "kernelspec": { 279 | "display_name": "Python 3", 280 | "language": "python", 281 | "name": "python3" 282 | }, 283 | "language_info": { 284 | "codemirror_mode": { 285 | "name": "ipython", 286 | "version": 3 287 | }, 288 | "file_extension": ".py", 289 | "mimetype": "text/x-python", 290 | "name": "python", 291 | "nbconvert_exporter": "python", 292 | "pygments_lexer": "ipython3", 293 | "version": "3.7.0" 294 | } 295 | }, 296 | "nbformat": 4, 297 | "nbformat_minor": 2 298 | } 299 | -------------------------------------------------------------------------------- /U03 L11 - Fisher Info, Asym Normality of MLE, Method of Moments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Unit 3 Lecture 11: Fisher Information, Asymptotic Normality of MLE, Method of Moments" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Fisher Information\n", 15 | "\n", 16 | 
"Fisher Information is a way of measuring the amount of information each i.i.d. variable $X_i$ carries about an unknown parameter $\\theta_j$ in the distribution of $X \\sim P(\\theta_0, \\cdots, \\theta_n)$. In other words, it's the amount of information each sample provides on a parameter of the sample's distribution. Geometrically, it tells you, on average, how curved the log-likelihood of $(x_1, \\theta)$ is.\n", 17 | "\n", 18 | "Fisher Information is calculated using likelihood function of the pdf of $X$ with one sample. The likelihood function with one sample is just the pdf.\n", 19 | "\n", 20 | "$X \\sim \\mathcal{N}(\\mu, \\sigma^2)$, $\\theta := \\sigma^2$\n", 21 | "\n", 22 | "$L(\\theta | X) = \\dfrac{1}{\\sqrt{2 \\pi \\theta}} \\exp(-\\frac{1}{2 \\theta} (X - \\mu)^2)$\n", 23 | "\n", 24 | "To get to the Fisher Information, we first take the derivative of the log-likelihood, just like in getting $\\hat{\\theta}_{MLE}$. \n", 25 | "\n", 26 | "$\\dfrac{\\partial \\mathcal{ln}(L)}{\\partial \\theta} = \\dfrac{-\\theta + X^2 - 2 \\mu X + \\mu^2}{2 \\theta^2}$\n", 27 | "\n", 28 | "From here, there are two ways to get the Fisher Information. We can get the variance of the first derivative, or we can just get the negative expectation of the second derivative.\n", 29 | "\n", 30 | "$\\dfrac{\\partial^2 \\mathcal{ln}(L)}{\\partial \\theta^2} = \\dfrac{\\theta - 2 X^2 + 4 \\mu X - 2 \\mu^2}{2 \\theta^3}$\n", 31 | "\n", 32 | "Now we get the negative expected value, where $\\mathbb{E}[X] = \\mu$ and $\\mathbb{E}[X^2] = \\mu^2 + \\sigma^2$, so...\n", 33 | "\n", 34 | "$I(\\theta) = -\\mathbb{E}[\\dfrac{\\partial^2 \\mathcal{log}(L)}{\\partial \\theta^2}] = -\\dfrac{\\theta - 2 (\\mu^2 + \\theta) + 4 (\\mu) (\\mu) - 2 (\\mu)^2}{2 \\theta^3} = \\dfrac{1}{2 \\theta^2} = \\dfrac{1}{2 \\sigma^4} = (Var(\\hat{\\sigma^2}_{MLE}))^{-1}$\n", 35 | "\n", 36 | "Note this works no matter what $\\mu$ is. 
Now, we know the asymptotic variance of the variance estimator is...\n", 37 | "\n", 38 | "$Var(\\hat{\\sigma^2}_{MLE}) = 2 \\sigma^4$\n", 39 | "\n", 40 | "And we know that, asymptotically and under regularity conditions, no unbiased estimator has a smaller variance than the Maximum Likelihood Estimator (the Cramer-Rao bound).\n", 41 | "\n", 42 | "The complete Fisher Information for the model is the Fisher Information matrix (its inverse is the asymptotic covariance matrix of the MLE):\n", 43 | "\n", 44 | "$I_L(\\mu, \\sigma^2) = \\begin{pmatrix}\n", 45 | "-\\mathbb{E}[\\dfrac{\\partial^2 \\mathcal{log}(L)}{\\partial \\mu^2}] & -\\mathbb{E}[\\dfrac{\\partial^2 \\mathcal{log}(L)}{\\partial \\mu \\partial \\sigma^2}] \\\\\n", 46 | "-\\mathbb{E}[\\dfrac{\\partial^2 \\mathcal{log}(L)}{\\partial \\sigma^2 \\partial \\mu}] & -\\mathbb{E}[\\dfrac{\\partial^2 \\mathcal{log}(L)}{\\partial (\\sigma^2)^2}]\n", 47 | "\\end{pmatrix} = \\begin{pmatrix}\n", 48 | "\\dfrac{1}{\\sigma^2} & 0 \\\\\n", 49 | "0 & \\dfrac{1}{2 \\sigma^4}\n", 50 | "\\end{pmatrix}$\n", 51 | "\n", 52 | "Some ambiguous, probably complicated conditions have to be met for the inverse of the Fisher Information to be equal to the variance of MLE. So far it's always true.\n", 53 | "\n", 54 | "### Parameter of a distribution depending on parameter of another distribution\n", 55 | "\n", 56 | "If variable $Y$ has a parameter that depends on a parameter of variable $X$...\n", 57 | "\n", 58 | "$X \\sim Exp(\\lambda)$\n", 59 | "\n", 60 | "$Y \\sim Ber(p(\\lambda))$\n", 61 | "\n", 62 | "where $z$ is some threshold on the exponential distribution such that everything $> z$ is recorded as a $1$ and everything $< z$ gets a $0$, it makes sense that we can get some information about $\\lambda$ from $p$. 
It's important, then, that we focus on $\\lambda$ in the process of looking for information about it.\n", 63 | "\n", 64 | "$L(\\lambda | Y) = e^{-\\lambda z Y}(1 - e^{-\\lambda z})^{1 - Y}$\n", 65 | "\n", 66 | "$\\mathscr{l} = \\mathcal{ln}(L(\\lambda | Y)) = -\\lambda z Y + (1 - Y) \\ln(1 - e^{-\\lambda z})$\n", 67 | "\n", 68 | "Since we want information about $\\lambda$, we take the derivative with respect to it, rather than taking the derivative wrt $p = e^{-\\lambda z}$. With a little more work, this eventually yields $I(\\lambda)$. It's also possible to use the delta method here, though that's definitely not the easiest way to get the asymptotic variance of $\\hat{\\lambda}$." 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Asymptotic Normality of MLE\n", 76 | "\n", 77 | "MLE estimators are asymptotically normal *under some regularity conditions*. If an MLE estimator is asymptotically normal, that means (among many things) that we can apply the delta method to find the variance of $g(\\hat{\\theta}_{MLE})$.\n", 78 | "\n", 79 | "$\\sqrt{n} (\\hat{\\theta}_{MLE} - \\theta) \\to \\mathcal{N}(0, \\sigma^2)$\n", 80 | "\n", 81 | "means\n", 82 | "\n", 83 | "$\\sqrt{n} (g(\\hat{\\theta}_{MLE}) - g(\\theta)) \\to \\mathcal{N}(0, g\\prime(\\theta)^2\\sigma^2)$\n", 84 | "\n", 85 | "and, multivariate (with $\\Sigma$ the asymptotic covariance matrix of $\\hat{\\theta}_{MLE}$)\n", 86 | "\n", 87 | "$\\sqrt{n} (g(\\hat{\\theta}_{MLE}) - g(\\theta)) \\to \\mathcal{N}(0, \\nabla g(\\theta)^T\\Sigma \\nabla g(\\theta))$" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Method of Moments\n", 95 | "\n", 96 | "$E[X^k]$ is the $k$th moment of an R.V. $X$. \n", 97 | "\n", 98 | "### $\\frac{1}{n} \\sum_{i=1}^n X_i^k = \\overline{X^k}_n$ \n", 99 | "\n", 100 | "is the estimator for the $k$th moment. 
A simple combination of method of moments estimators can be used to estimate a parameter, such as in $\\overline{X^2}_n - \\overline{X_n}^2 \\to \\sigma^2$ of a normal.\n", 101 | "\n", 102 | "The formulas for the $k$th moment are, of course\n", 103 | "\n", 104 | "$\\int_x x^k f_X(x) dx = \\mathbb{E}[X^k]$\n", 105 | "\n", 106 | "$\\sum_x x^k f_X(x) = \\mathbb{E}[X^k]$\n", 107 | "\n", 108 | "\n", 109 | "### Recovering parameters from moments\n", 110 | "\n", 111 | "If $\\psi$ is a function that maps the parameters of $X$ to a list of moments of $X$ \n", 112 | "\n", 113 | "- $\\psi(\\theta^{(X)}_1, \\cdots, \\theta^{(X)}_n) = (m_1, m_2, \\cdots) = (\\mathbb{E}[X], \\mathbb{E}[X^2], \\cdots)$,\n", 114 | "\n", 115 | "And if $\\psi$ is one-to-one, meaning given the output of $\\psi$, there could only be one set of parameters that generated that output, then\n", 116 | "\n", 117 | "- $\\psi^{-1}(m_1, m_2, \\cdots) = (\\theta^{(X)}_1, \\cdots, \\theta^{(X)}_n)$\n", 118 | "\n", 119 | "e.g. $X \\sim \\mathcal{N}(\\mu, \\sigma^2)$\n", 120 | "\n", 121 | "$\\psi_{(1, 2)}(\\mu, \\sigma^2) = (\\mu, \\mu^2 + \\sigma^2)$\n", 122 | "\n", 123 | "$\\psi^{-1}(\\mu, \\mu^2 + \\sigma^2) = (\\mu, \\sigma^2)$" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## MLE vs MoM\n", 131 | "\n", 132 | "- MoM provides consistent, but often biased estimators. Typically easier to compute than MLE (though in this class MLE more often than not has been MoM). Doesn't give a good result if the model isn't well specified since noisy data can throw a mean estimation far away from the true mean.\n", 133 | "- MLE is more accurate by measure of quadratic risk, and still gives good results if the model isn't well specified." 
134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.7.0" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } 159 | --------------------------------------------------------------------------------