├── .gitignore
├── AB_testing_project.ipynb
├── Final Project- Experiment Screenshot.png
├── README.md
├── data
    ├── Final Project Results - Control.csv
    ├── Final Project Results - Experiment.csv
    └── baseline_vals.csv
└── instructions.pdf


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | #ipython notebook
 9 | .ipynb_checkpoints
10 | 
11 | # Distribution / packaging
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | #dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | #lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | 
29 | # PyInstaller
30 | #  Usually these files are written by a python script from a template
31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 | 
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 | 
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *,cover
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | 
56 | # Sphinx documentation
57 | docs/_build/
58 | 
59 | # PyBuilder
60 | target/
61 | 


--------------------------------------------------------------------------------
/AB_testing_project.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {
   7 |     "collapsed": true
   8 |    },
   9 |    "outputs": [],
  10 |    "source": [
  11 |     "import pandas as pd\n",
  12 |     "import numpy as np"
  13 |    ]
  14 |   },
  15 |   {
  16 |    "cell_type": "code",
  17 |    "execution_count": 2,
  18 |    "metadata": {
  19 |     "collapsed": true
  20 |    },
  21 |    "outputs": [],
  22 |    "source": [
  23 |     "pageviews = 5000"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "code",
  28 |    "execution_count": 3,
  29 |    "metadata": {
  30 |     "collapsed": false
  31 |    },
  32 |    "outputs": [
  33 |     {
  34 |      "data": {
  35 |       "text/html": [
  36 |        "<div>\n",
  37 |        "<table border=\"1\" class=\"dataframe\">\n",
  38 |        "  <thead>\n",
  39 |        "    <tr style=\"text-align: right;\">\n",
  40 |        "      <th></th>\n",
  41 |        "      <th>metric</th>\n",
  42 |        "      <th>baseline_val</th>\n",
  43 |        "    </tr>\n",
  44 |        "  </thead>\n",
  45 |        "  <tbody>\n",
  46 |        "    <tr>\n",
  47 |        "      <th>0</th>\n",
  48 |        "      <td>unique cookies to view page per day:</td>\n",
  49 |        "      <td>40000.000000</td>\n",
  50 |        "    </tr>\n",
  51 |        "    <tr>\n",
  52 |        "      <th>1</th>\n",
  53 |        "      <td>unique cookies to click \"start free trial\" per...</td>\n",
  54 |        "      <td>3200.000000</td>\n",
  55 |        "    </tr>\n",
  56 |        "    <tr>\n",
  57 |        "      <th>2</th>\n",
  58 |        "      <td>enrollments per day:</td>\n",
  59 |        "      <td>660.000000</td>\n",
  60 |        "    </tr>\n",
  61 |        "    <tr>\n",
  62 |        "      <th>3</th>\n",
  63 |        "      <td>click-through-probability on \"start free trial\":</td>\n",
  64 |        "      <td>0.080000</td>\n",
  65 |        "    </tr>\n",
  66 |        "    <tr>\n",
  67 |        "      <th>4</th>\n",
  68 |        "      <td>probability of enrolling, given click:</td>\n",
  69 |        "      <td>0.206250</td>\n",
  70 |        "    </tr>\n",
  71 |        "    <tr>\n",
  72 |        "      <th>5</th>\n",
  73 |        "      <td>probability of payment, given enroll:</td>\n",
  74 |        "      <td>0.530000</td>\n",
  75 |        "    </tr>\n",
  76 |        "    <tr>\n",
  77 |        "      <th>6</th>\n",
  78 |        "      <td>probability of payment, given click</td>\n",
  79 |        "      <td>0.109313</td>\n",
  80 |        "    </tr>\n",
  81 |        "  </tbody>\n",
  82 |        "</table>\n",
  83 |        "</div>"
  84 |       ],
  85 |       "text/plain": [
  86 |        "                                              metric  baseline_val\n",
  87 |        "0               unique cookies to view page per day:  40000.000000\n",
  88 |        "1  unique cookies to click \"start free trial\" per...   3200.000000\n",
  89 |        "2                               enrollments per day:    660.000000\n",
  90 |        "3   click-through-probability on \"start free trial\":      0.080000\n",
  91 |        "4             probability of enrolling, given click:      0.206250\n",
  92 |        "5              probability of payment, given enroll:      0.530000\n",
  93 |        "6                probability of payment, given click      0.109313"
  94 |       ]
  95 |      },
  96 |      "execution_count": 3,
  97 |      "metadata": {},
  98 |      "output_type": "execute_result"
  99 |     }
 100 |    ],
 101 |    "source": [
 102 |     "df_basevals = pd.read_csv(\"data/baseline_vals.csv\", index_col=False,header = None, names = ['metric','baseline_val'])\n",
 103 |     "df_basevals.metric = df_basevals.metric.map(lambda x: x.lower())\n",
 104 |     "df_basevals"
 105 |    ]
 106 |   },
 107 |   {
 108 |    "cell_type": "code",
 109 |    "execution_count": 4,
 110 |    "metadata": {
 111 |     "collapsed": false
 112 |    },
 113 |    "outputs": [
 114 |     {
 115 |      "data": {
 116 |       "text/plain": [
 117 |        "0.0202"
 118 |       ]
 119 |      },
 120 |      "execution_count": 4,
 121 |      "metadata": {},
 122 |      "output_type": "execute_result"
 123 |     }
 124 |    ],
 125 |    "source": [
 126 |     "round(np.sqrt((.206250*(1-.206250))/(5000*3200/40000)),4)"
 127 |    ]
 128 |   },
 129 |   {
 130 |    "cell_type": "code",
 131 |    "execution_count": 5,
 132 |    "metadata": {
 133 |     "collapsed": false
 134 |    },
 135 |    "outputs": [
 136 |     {
 137 |      "data": {
 138 |       "text/plain": [
 139 |        "0.0551"
 140 |       ]
 141 |      },
 142 |      "execution_count": 5,
 143 |      "metadata": {},
 144 |      "output_type": "execute_result"
 145 |     }
 146 |    ],
 147 |    "source": [
 148 |     "round(np.sqrt((.53*(1-.53))/(5000*660/40000)),4)"
 149 |    ]
 150 |   },
 151 |   {
 152 |    "cell_type": "code",
 153 |    "execution_count": 6,
 154 |    "metadata": {
 155 |     "collapsed": false
 156 |    },
 157 |    "outputs": [
 158 |     {
 159 |      "data": {
 160 |       "text/plain": [
 161 |        "0.0156"
 162 |       ]
 163 |      },
 164 |      "execution_count": 6,
 165 |      "metadata": {},
 166 |      "output_type": "execute_result"
 167 |     }
 168 |    ],
 169 |    "source": [
 170 |     "round(np.sqrt((.109313*(1-.109313))/(5000*3200/40000)),4)"
 171 |    ]
 172 |   },
 173 |   {
 174 |    "cell_type": "markdown",
 175 |    "metadata": {},
 176 |    "source": [
 177 |     "## Pageviews without Bonferroni Correction ##"
 178 |    ]
 179 |   },
 180 |   {
 181 |    "cell_type": "markdown",
 182 |    "metadata": {},
 183 |    "source": [
 184 |     "### Gross Conversion ###\n",
 185 |     "\n",
 186 |     "- Baseline Conversion: 20.625%\n",
 187 |     "- Minimum Detectable Effect: 1%\n",
 188 |     "- alpha: 5%\n",
 189 |     "- beta: 20%\n",
 190 |     "- 1 - beta: 80%\n",
 191 |     "- sample size = 25,835 clicks/group\n",
 192 |     "- Number of groups = 2 (experiment and control)\n",
 193 |     "- total sample size = 51,670 clicks\n",
 194 |     "- clicks/pageview: 3200/40000 = .08 clicks/pageview\n",
 195 |     "- pageviews = 51,670/.08 = 645,875\n",
 196 |     "\n",
 197 |     "\n",
 198 |     "\n",
 199 |     "### Retention ###\n",
 200 |     "\n",
 201 |     "- Baseline Conversion: 53%\n",
 202 |     "- Minimum Detectable Effect: 1%\n",
 203 |     "- alpha: 5%\n",
 204 |     "- beta: 20%\n",
 205 |     "- 1 - beta: 80%\n",
 206 |     "- sample size = 39,115 enrollments/group\n",
 207 |     "- Number of groups = 2 (experiment and control)\n",
 208 |     "- total sample size = 78,230 enrollments\n",
 209 |     "- enrollments/pageview: 660/40000 = .0165 enrollments/pageview\n",
 210 |     "- pageviews = 78,230/.0165 = 4,741,212\n",
 211 |     "\n",
 212 |     "### Net Conversion ###\n",
 213 |     "\n",
 214 |     "- Baseline Conversion: 10.9313%\n",
 215 |     "- Minimum Detectable Effect: .75%\n",
 216 |     "- alpha: 5%\n",
 217 |     "- beta: 20%\n",
 218 |     "- 1 - beta: 80%\n",
 219 |     "- sample size = 27,413 clicks/group\n",
 220 |     "- Number of groups = 2 (experiment and control)\n",
 221 |     "- total sample size = 54,826 clicks\n",
 222 |     "- clicks/pageview: 3200/40000 = .08 clicks/pageview\n",
 223 |     "- pageviews = 54,826/.08 = 685,325\n",
 224 |     "\n",
 225 |     "\n",
 226 |     "\n",
 227 |     "\n"
 228 |    ]
 229 |   },
 230 |   {
 231 |    "cell_type": "markdown",
 232 |    "metadata": {},
 233 |    "source": [
 234 |     "### Duration and Exposure ###"
 235 |    ]
 236 |   },
 237 |   {
 238 |    "cell_type": "code",
 239 |    "execution_count": 7,
 240 |    "metadata": {
 241 |     "collapsed": false
 242 |    },
 243 |    "outputs": [
 244 |     {
 245 |      "data": {
 246 |       "text/plain": [
 247 |        "118.5303"
 248 |       ]
 249 |      },
 250 |      "execution_count": 7,
 251 |      "metadata": {},
 252 |      "output_type": "execute_result"
 253 |     }
 254 |    ],
 255 |    "source": [
 256 |     "4741212.0/40000"
 257 |    ]
 258 |   },
 259 |   {
 260 |    "cell_type": "markdown",
 261 |    "metadata": {},
 262 |    "source": [
 263 |     "If we divert 100% of traffic, given 40,000 page views per day, the experiment would take 119 days.  That is a long time.  If we eliminate retention, we are left with Gross Conversion and Net Conversion.  This reduces the number of required pageviews to 685,325, which corresponds to an 18-day experiment with 100% diversion.  There may be other experiments to run, so let's say 50% diversion for 35 days."
 264 |    ]
 265 |   },
 266 |   {
 267 |    "cell_type": "code",
 268 |    "execution_count": 8,
 269 |    "metadata": {
 270 |     "collapsed": false
 271 |    },
 272 |    "outputs": [
 273 |     {
 274 |      "data": {
 275 |       "text/plain": [
 276 |        "17.133125"
 277 |       ]
 278 |      },
 279 |      "execution_count": 8,
 280 |      "metadata": {},
 281 |      "output_type": "execute_result"
 282 |     }
 283 |    ],
 284 |    "source": [
 285 |     "685325.0/40000"
 286 |    ]
 287 |   },
 288 |   {
 289 |    "cell_type": "code",
 290 |    "execution_count": 9,
 291 |    "metadata": {
 292 |     "collapsed": false
 293 |    },
 294 |    "outputs": [],
 295 |    "source": [
 296 |     "df_control = pd.read_csv(\"data/Final Project Results - Control.csv\")\n",
 297 |     "df_experiment = pd.read_csv(\"data/Final Project Results - Experiment.csv\")"
 298 |    ]
 299 |   },
 300 |   {
 301 |    "cell_type": "code",
 302 |    "execution_count": 10,
 303 |    "metadata": {
 304 |     "collapsed": false
 305 |    },
 306 |    "outputs": [
 307 |     {
 308 |      "data": {
 309 |       "text/html": [
 310 |        "<div>\n",
 311 |        "<table border=\"1\" class=\"dataframe\">\n",
 312 |        "  <thead>\n",
 313 |        "    <tr style=\"text-align: right;\">\n",
 314 |        "      <th></th>\n",
 315 |        "      <th>Control</th>\n",
 316 |        "      <th>Experiment</th>\n",
 317 |        "    </tr>\n",
 318 |        "  </thead>\n",
 319 |        "  <tbody>\n",
 320 |        "    <tr>\n",
 321 |        "      <th>cookies</th>\n",
 322 |        "      <td>345543</td>\n",
 323 |        "      <td>344660</td>\n",
 324 |        "    </tr>\n",
 325 |        "    <tr>\n",
 326 |        "      <th>clicks</th>\n",
 327 |        "      <td>28378</td>\n",
 328 |        "      <td>28325</td>\n",
 329 |        "    </tr>\n",
 330 |        "    <tr>\n",
 331 |        "      <th>enrollments</th>\n",
 332 |        "      <td>3785</td>\n",
 333 |        "      <td>3423</td>\n",
 334 |        "    </tr>\n",
 335 |        "    <tr>\n",
 336 |        "      <th>payments</th>\n",
 337 |        "      <td>2033</td>\n",
 338 |        "      <td>1945</td>\n",
 339 |        "    </tr>\n",
 340 |        "  </tbody>\n",
 341 |        "</table>\n",
 342 |        "</div>"
 343 |       ],
 344 |       "text/plain": [
 345 |        "             Control  Experiment\n",
 346 |        "cookies       345543      344660\n",
 347 |        "clicks         28378       28325\n",
 348 |        "enrollments     3785        3423\n",
 349 |        "payments        2033        1945"
 350 |       ]
 351 |      },
 352 |      "execution_count": 10,
 353 |      "metadata": {},
 354 |      "output_type": "execute_result"
 355 |     }
 356 |    ],
 357 |    "source": [
 358 |     "results = {\"Control\":pd.Series([df_control.Pageviews.sum(),df_control.Clicks.sum(),\n",
 359 |     "                                  df_control.Enrollments.sum(),df_control.Payments.sum()],\n",
 360 |     "                                  index = [\"cookies\",\"clicks\",\"enrollments\",\"payments\"]),\n",
 361 |     "           \"Experiment\":pd.Series([df_experiment.Pageviews.sum(),df_experiment.Clicks.sum(),\n",
 362 |     "                               df_experiment.Enrollments.sum(),df_experiment.Payments.sum()],\n",
 363 |     "                               index = [\"cookies\",\"clicks\",\"enrollments\",\"payments\"])}\n",
 364 |     "df_results = pd.DataFrame(results)\n",
 365 |     "df_results"
 366 |    ]
 367 |   },
 368 |   {
 369 |    "cell_type": "markdown",
 370 |    "metadata": {},
 371 |    "source": [
 372 |     "### Sanity Checks"
 373 |    ]
 374 |   },
 375 |   {
 376 |    "cell_type": "markdown",
 377 |    "metadata": {},
 378 |    "source": [
 379 |     "#### Count Metrics"
 380 |    ]
 381 |   },
 382 |   {
 383 |    "cell_type": "code",
 384 |    "execution_count": 11,
 385 |    "metadata": {
 386 |     "collapsed": false
 387 |    },
 388 |    "outputs": [
 389 |     {
 390 |      "data": {
 391 |       "text/html": [
 392 |        "<div>\n",
 393 |        "<table border=\"1\" class=\"dataframe\">\n",
 394 |        "  <thead>\n",
 395 |        "    <tr style=\"text-align: right;\">\n",
 396 |        "      <th></th>\n",
 397 |        "      <th>Control</th>\n",
 398 |        "      <th>Experiment</th>\n",
 399 |        "      <th>Total</th>\n",
 400 |        "      <th>Prob</th>\n",
 401 |        "      <th>StdErr</th>\n",
 402 |        "      <th>MargErr</th>\n",
 403 |        "      <th>CI_lower</th>\n",
 404 |        "      <th>CI_upper</th>\n",
 405 |        "      <th>Obs_val</th>\n",
 406 |        "      <th>Pass_Sanity</th>\n",
 407 |        "      <th>Diff</th>\n",
 408 |        "    </tr>\n",
 409 |        "  </thead>\n",
 410 |        "  <tbody>\n",
 411 |        "    <tr>\n",
 412 |        "      <th>cookies</th>\n",
 413 |        "      <td>345543</td>\n",
 414 |        "      <td>344660</td>\n",
 415 |        "      <td>690203</td>\n",
 416 |        "      <td>0.5</td>\n",
 417 |        "      <td>0.000602</td>\n",
 418 |        "      <td>0.001180</td>\n",
 419 |        "      <td>0.498820</td>\n",
 420 |        "      <td>0.501180</td>\n",
 421 |        "      <td>0.499360</td>\n",
 422 |        "      <td>True</td>\n",
 423 |        "      <td>0.001279</td>\n",
 424 |        "    </tr>\n",
 425 |        "    <tr>\n",
 426 |        "      <th>clicks</th>\n",
 427 |        "      <td>28378</td>\n",
 428 |        "      <td>28325</td>\n",
 429 |        "      <td>56703</td>\n",
 430 |        "      <td>0.5</td>\n",
 431 |        "      <td>0.002100</td>\n",
 432 |        "      <td>0.004116</td>\n",
 433 |        "      <td>0.495884</td>\n",
 434 |        "      <td>0.504116</td>\n",
 435 |        "      <td>0.499533</td>\n",
 436 |        "      <td>True</td>\n",
 437 |        "      <td>0.000935</td>\n",
 438 |        "    </tr>\n",
 439 |        "    <tr>\n",
 440 |        "      <th>enrollments</th>\n",
 441 |        "      <td>3785</td>\n",
 442 |        "      <td>3423</td>\n",
 443 |        "      <td>7208</td>\n",
 444 |        "      <td>0.5</td>\n",
 445 |        "      <td>0.005889</td>\n",
 446 |        "      <td>0.011543</td>\n",
 447 |        "      <td>0.488457</td>\n",
 448 |        "      <td>0.511543</td>\n",
 449 |        "      <td>0.474889</td>\n",
 450 |        "      <td>False</td>\n",
 451 |        "      <td>0.050222</td>\n",
 452 |        "    </tr>\n",
 453 |        "    <tr>\n",
 454 |        "      <th>payments</th>\n",
 455 |        "      <td>2033</td>\n",
 456 |        "      <td>1945</td>\n",
 457 |        "      <td>3978</td>\n",
 458 |        "      <td>0.5</td>\n",
 459 |        "      <td>0.007928</td>\n",
 460 |        "      <td>0.015538</td>\n",
 461 |        "      <td>0.484462</td>\n",
 462 |        "      <td>0.515538</td>\n",
 463 |        "      <td>0.488939</td>\n",
 464 |        "      <td>True</td>\n",
 465 |        "      <td>0.022122</td>\n",
 466 |        "    </tr>\n",
 467 |        "  </tbody>\n",
 468 |        "</table>\n",
 469 |        "</div>"
 470 |       ],
 471 |       "text/plain": [
 472 |        "             Control  Experiment   Total  Prob    StdErr   MargErr  CI_lower  \\\n",
 473 |        "cookies       345543      344660  690203   0.5  0.000602  0.001180  0.498820   \n",
 474 |        "clicks         28378       28325   56703   0.5  0.002100  0.004116  0.495884   \n",
 475 |        "enrollments     3785        3423    7208   0.5  0.005889  0.011543  0.488457   \n",
 476 |        "payments        2033        1945    3978   0.5  0.007928  0.015538  0.484462   \n",
 477 |        "\n",
 478 |        "             CI_upper   Obs_val Pass_Sanity      Diff  \n",
 479 |        "cookies      0.501180  0.499360        True  0.001279  \n",
 480 |        "clicks       0.504116  0.499533        True  0.000935  \n",
 481 |        "enrollments  0.511543  0.474889       False  0.050222  \n",
 482 |        "payments     0.515538  0.488939        True  0.022122  "
 483 |       ]
 484 |      },
 485 |      "execution_count": 11,
 486 |      "metadata": {},
 487 |      "output_type": "execute_result"
 488 |     }
 489 |    ],
 490 |    "source": [
 491 |     "df_results['Total']=df_results.Control + df_results.Experiment\n",
 492 |     "df_results['Prob'] = 0.5\n",
 493 |     "df_results['StdErr'] = np.sqrt((df_results.Prob * (1- df_results.Prob))/df_results.Total)\n",
 494 |     "df_results[\"MargErr\"] = 1.96 * df_results.StdErr\n",
 495 |     "df_results[\"CI_lower\"] = df_results.Prob - df_results.MargErr\n",
 496 |     "df_results[\"CI_upper\"] = df_results.Prob + df_results.MargErr\n",
 497 |     "df_results[\"Obs_val\"] = df_results.Experiment/df_results.Total\n",
 498 |     "df_results[\"Pass_Sanity\"] = df_results.apply(lambda x: (x.Obs_val > x.CI_lower) and (x.Obs_val < x.CI_upper),axis=1)\n",
 499 |     "df_results['Diff'] = abs((df_results.Experiment - df_results.Control)/df_results.Total)\n",
 500 |     "\n",
 501 |     "df_results"
 502 |    ]
 503 |   },
 504 |   {
 505 |    "cell_type": "markdown",
 506 |    "metadata": {},
 507 |    "source": [
 508 |     "#### Other Metrics"
 509 |    ]
 510 |   },
 511 |   {
 512 |    "cell_type": "code",
 513 |    "execution_count": 45,
 514 |    "metadata": {
 515 |     "collapsed": false
 516 |    },
 517 |    "outputs": [
 518 |     {
 519 |      "name": "stdout",
 520 |      "output_type": "stream",
 521 |      "text": [
 522 |       "(0.082125813574576823, 0.082182440666163759, 0.081266986844116651, 0.083097894488210866, 0.00046706827655464432, 0.0009154538220471028)\n"
 523 |      ]
 524 |     }
 525 |    ],
 526 |    "source": [
 527 |     "# click through probability (clicks/cookies)\n",
 528 |     "\n",
 529 |     "control_cookies = df_results.loc['cookies','Control']\n",
 530 |     "control_clicks = df_results.loc['clicks','Control']\n",
 531 |     "\n",
 532 |     "exp_cookies = df_results.loc['cookies','Experiment']\n",
 533 |     "exp_clicks = df_results.loc['clicks', 'Experiment']\n",
 534 |     "\n",
 535 |     "## control value \n",
 536 |     "cont_p_hat = control_clicks/control_cookies\n",
 537 |     "\n",
 538 |     "## observed value (experimental value)\n",
 539 |     "exp_p_hat = exp_clicks/exp_cookies\n",
 540 |     "\n",
 541 |     "## Standard Error\n",
 542 |     "SE_ClickProb = np.sqrt((cont_p_hat * (1- cont_p_hat))/control_cookies)\n",
 543 |     "\n",
 544 |     "\n",
 545 |     "## margin of error for 95% confidence interval (z = 1.96)\n",
 546 |     "\n",
 547 |     "ME_ClickProb = SE_ClickProb * 1.96\n",
 548 |     "\n",
 549 |     "## CI\n",
 550 |     "upper_ClickProb = exp_p_hat + ME_ClickProb\n",
 551 |     "lower_ClickProb = exp_p_hat - ME_ClickProb\n",
 552 |     "\n",
 553 |     "## Sane in the membrane (yes, it passes)\n",
 554 |     "print(cont_p_hat,exp_p_hat,lower_ClickProb,upper_ClickProb, SE_ClickProb, ME_ClickProb)\n"
 555 |    ]
 556 |   },
 557 |   {
 558 |    "cell_type": "markdown",
 559 |    "metadata": {},
 560 |    "source": [
 561 |     "### Evaluation Metric Results Calculations"
 562 |    ]
 563 |   },
 564 |   {
 565 |    "cell_type": "code",
 566 |    "execution_count": 12,
 567 |    "metadata": {
 568 |     "collapsed": false
 569 |    },
 570 |    "outputs": [],
 571 |    "source": [
 572 |     "df_control_notnull = df_control[pd.isnull(df_control.Enrollments) != True]\n",
 573 |     "df_experiment_notnull = df_experiment[pd.isnull(df_control.Enrollments) != True]"
 574 |    ]
 575 |   },
 576 |   {
 577 |    "cell_type": "code",
 578 |    "execution_count": 13,
 579 |    "metadata": {
 580 |     "collapsed": false
 581 |    },
 582 |    "outputs": [
 583 |     {
 584 |      "data": {
 585 |       "text/html": [
 586 |        "<div>\n",
 587 |        "<table border=\"1\" class=\"dataframe\">\n",
 588 |        "  <thead>\n",
 589 |        "    <tr style=\"text-align: right;\">\n",
 590 |        "      <th></th>\n",
 591 |        "      <th>Control</th>\n",
 592 |        "      <th>Experiment</th>\n",
 593 |        "    </tr>\n",
 594 |        "  </thead>\n",
 595 |        "  <tbody>\n",
 596 |        "    <tr>\n",
 597 |        "      <th>cookies</th>\n",
 598 |        "      <td>212163</td>\n",
 599 |        "      <td>211362</td>\n",
 600 |        "    </tr>\n",
 601 |        "    <tr>\n",
 602 |        "      <th>clicks</th>\n",
 603 |        "      <td>17293</td>\n",
 604 |        "      <td>17260</td>\n",
 605 |        "    </tr>\n",
 606 |        "    <tr>\n",
 607 |        "      <th>enrollments</th>\n",
 608 |        "      <td>3785</td>\n",
 609 |        "      <td>3423</td>\n",
 610 |        "    </tr>\n",
 611 |        "    <tr>\n",
 612 |        "      <th>payments</th>\n",
 613 |        "      <td>2033</td>\n",
 614 |        "      <td>1945</td>\n",
 615 |        "    </tr>\n",
 616 |        "  </tbody>\n",
 617 |        "</table>\n",
 618 |        "</div>"
 619 |       ],
 620 |       "text/plain": [
 621 |        "             Control  Experiment\n",
 622 |        "cookies       212163      211362\n",
 623 |        "clicks         17293       17260\n",
 624 |        "enrollments     3785        3423\n",
 625 |        "payments        2033        1945"
 626 |       ]
 627 |      },
 628 |      "execution_count": 13,
 629 |      "metadata": {},
 630 |      "output_type": "execute_result"
 631 |     }
 632 |    ],
 633 |    "source": [
 634 |     "results_notnull = {\"Control\":pd.Series([df_control_notnull.Pageviews.sum(),df_control_notnull.Clicks.sum(),\n",
 635 |     "                                  df_control_notnull.Enrollments.sum(),df_control_notnull.Payments.sum()],\n",
 636 |     "                                  index = [\"cookies\",\"clicks\",\"enrollments\",\"payments\"]),\n",
 637 |     "           \"Experiment\":pd.Series([df_experiment_notnull.Pageviews.sum(),df_experiment_notnull.Clicks.sum(),\n",
 638 |     "                               df_experiment_notnull.Enrollments.sum(),df_experiment_notnull.Payments.sum()],\n",
 639 |     "                               index = [\"cookies\",\"clicks\",\"enrollments\",\"payments\"])}\n",
 640 |     "df_results_notnull = pd.DataFrame(results_notnull)\n",
 641 |     "df_results_notnull"
 642 |    ]
 643 |   },
 644 |   {
 645 |    "cell_type": "code",
 646 |    "execution_count": 14,
 647 |    "metadata": {
 648 |     "collapsed": false
 649 |    },
 650 |    "outputs": [
 651 |     {
 652 |      "data": {
 653 |       "text/html": [
 654 |        "<div>\n",
 655 |        "<table border=\"1\" class=\"dataframe\">\n",
 656 |        "  <thead>\n",
 657 |        "    <tr style=\"text-align: right;\">\n",
 658 |        "      <th></th>\n",
 659 |        "      <th>Control</th>\n",
 660 |        "      <th>Experiment</th>\n",
 661 |        "      <th>Total</th>\n",
 662 |        "    </tr>\n",
 663 |        "  </thead>\n",
 664 |        "  <tbody>\n",
 665 |        "    <tr>\n",
 666 |        "      <th>cookies</th>\n",
 667 |        "      <td>212163</td>\n",
 668 |        "      <td>211362</td>\n",
 669 |        "      <td>423525</td>\n",
 670 |        "    </tr>\n",
 671 |        "    <tr>\n",
 672 |        "      <th>clicks</th>\n",
 673 |        "      <td>17293</td>\n",
 674 |        "      <td>17260</td>\n",
 675 |        "      <td>34553</td>\n",
 676 |        "    </tr>\n",
 677 |        "    <tr>\n",
 678 |        "      <th>enrollments</th>\n",
 679 |        "      <td>3785</td>\n",
 680 |        "      <td>3423</td>\n",
 681 |        "      <td>7208</td>\n",
 682 |        "    </tr>\n",
 683 |        "    <tr>\n",
 684 |        "      <th>payments</th>\n",
 685 |        "      <td>2033</td>\n",
 686 |        "      <td>1945</td>\n",
 687 |        "      <td>3978</td>\n",
 688 |        "    </tr>\n",
 689 |        "  </tbody>\n",
 690 |        "</table>\n",
 691 |        "</div>"
 692 |       ],
 693 |       "text/plain": [
 694 |        "             Control  Experiment   Total\n",
 695 |        "cookies       212163      211362  423525\n",
 696 |        "clicks         17293       17260   34553\n",
 697 |        "enrollments     3785        3423    7208\n",
 698 |        "payments        2033        1945    3978"
 699 |       ]
 700 |      },
 701 |      "execution_count": 14,
 702 |      "metadata": {},
 703 |      "output_type": "execute_result"
 704 |     }
 705 |    ],
 706 |    "source": [
 707 |     "df_results_notnull['Total']=df_results_notnull.Control + df_results_notnull.Experiment\n",
 708 |     "\n",
 709 |     "df_results_notnull"
 710 |    ]
 711 |   },
 712 |   {
 713 |    "cell_type": "code",
 714 |    "execution_count": 15,
 715 |    "metadata": {
 716 |     "collapsed": false
 717 |    },
 718 |    "outputs": [],
 719 |    "source": [
 720 |     "# experiment values\n",
 721 |     "\n",
 722 |     "enrollments_exp = df_results_notnull.loc[\"enrollments\"].Experiment\n",
 723 |     "clicks_exp = df_results_notnull.loc[\"clicks\"].Experiment\n",
 724 |     "payments_exp = df_results_notnull.loc[\"payments\"].Experiment\n",
 725 |     "\n",
 726 |     "# control values\n",
 727 |     "\n",
 728 |     "enrollments_cont = df_results_notnull.loc[\"enrollments\"].Control\n",
 729 |     "clicks_cont = df_results_notnull.loc[\"clicks\"].Control\n",
 730 |     "payments_cont = df_results_notnull.loc[\"payments\"].Control\n",
 731 |     "\n",
 732 |     "\n",
 733 |     "\n",
 734 |     "# metrics\n",
 735 |     "\n",
 736 |     "GrossConversion_exp = enrollments_exp/clicks_exp\n",
 737 |     "NetConversion_exp = payments_exp/clicks_exp\n",
 738 |     "GrossConversion_cont = enrollments_cont/clicks_cont\n",
 739 |     "NetConversion_cont = payments_cont/clicks_cont\n",
 740 |     "\n",
 741 |     "GrossConversion = (enrollments_exp + enrollments_cont)/(clicks_cont + clicks_exp)\n",
 742 |     "NetConversion = (payments_cont + payments_exp)/(clicks_cont + clicks_exp)\n",
 743 |     "\n",
 744 |     "\n"
 745 |    ]
 746 |   },
 747 |   {
 748 |    "cell_type": "code",
 749 |    "execution_count": 16,
 750 |    "metadata": {
 751 |     "collapsed": false
 752 |    },
 753 |    "outputs": [
 754 |     {
 755 |      "name": "stdout",
 756 |      "output_type": "stream",
 757 |      "text": [
 758 |       "GrossConversion: 0.208607067404 \n",
 759 |       "NetConversion:0.115127485312\n"
 760 |      ]
 761 |     }
 762 |    ],
 763 |    "source": [
 764 |     "print('GrossConversion: {} \\nNetConversion:{}'.format(GrossConversion,NetConversion))"
 765 |    ]
 766 |   },
 767 |   {
 768 |    "cell_type": "code",
 769 |    "execution_count": 17,
 770 |    "metadata": {
 771 |     "collapsed": false
 772 |    },
 773 |    "outputs": [
 774 |     {
 775 |      "data": {
 776 |       "text/plain": [
 777 |        "0.2188746891805933"
 778 |       ]
 779 |      },
 780 |      "execution_count": 17,
 781 |      "metadata": {},
 782 |      "output_type": "execute_result"
 783 |     }
 784 |    ],
 785 |    "source": [
 786 |     "GrossConversion_cont"
 787 |    ]
 788 |   },
 789 |   {
 790 |    "cell_type": "code",
 791 |    "execution_count": 18,
 792 |    "metadata": {
 793 |     "collapsed": false
 794 |    },
 795 |    "outputs": [
 796 |     {
 797 |      "data": {
 798 |       "text/plain": [
 799 |        "0.19831981460023174"
 800 |       ]
 801 |      },
 802 |      "execution_count": 18,
 803 |      "metadata": {},
 804 |      "output_type": "execute_result"
 805 |     }
 806 |    ],
 807 |    "source": [
 808 |     "GrossConversion_exp"
 809 |    ]
 810 |   },
 811 |   {
 812 |    "cell_type": "code",
 813 |    "execution_count": 19,
 814 |    "metadata": {
 815 |     "collapsed": true
 816 |    },
 817 |    "outputs": [],
 818 |    "source": [
 819 |     "def stats_prop(p_hat,z_score,N_cont,N_exp,diff):\n",
 820 |     "    std_err = np.sqrt((p_hat * (1- p_hat ))*(1/N_cont + 1/N_exp))\n",
 821 |     "    marg_err = z_score * std_err\n",
 822 |     "    ci_lower = diff - marg_err\n",
 823 |     "    ci_upper = diff + marg_err\n",
 824 |     "    \n",
 825 |     "    return std_err,marg_err,ci_lower,ci_upper\n",
 826 |     "    \n",
 827 |     "    "
 828 |    ]
 829 |   },
 830 |   {
 831 |    "cell_type": "code",
 832 |    "execution_count": 20,
 833 |    "metadata": {
 834 |     "collapsed": false
 835 |    },
 836 |    "outputs": [
 837 |     {
 838 |      "data": {
 839 |       "text/plain": [
 840 |        "-0.020554874580361565"
 841 |       ]
 842 |      },
 843 |      "execution_count": 20,
 844 |      "metadata": {},
 845 |      "output_type": "execute_result"
 846 |     }
 847 |    ],
 848 |    "source": [
 849 |     "GrossConversion_diff = GrossConversion_exp - GrossConversion_cont\n",
 850 |     "GrossConversion_diff"
 851 |    ]
 852 |   },
 853 |   {
 854 |    "cell_type": "code",
 855 |    "execution_count": 21,
 856 |    "metadata": {
 857 |     "collapsed": false
 858 |    },
 859 |    "outputs": [],
 860 |    "source": [
 861 |     "se_gross,me_gross,cil_gross,ciu_gross = stats_prop(GrossConversion,1.96,clicks_cont,\n",
 862 |     "                                                   clicks_exp,GrossConversion_diff)"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "code",
 867 |    "execution_count": 22,
 868 |    "metadata": {
 869 |     "collapsed": false
 870 |    },
 871 |    "outputs": [
 872 |     {
 873 |      "name": "stdout",
 874 |      "output_type": "stream",
 875 |      "text": [
 876 |       "(0.0043716753852259364, 0.0085684837550428355, -0.029123358335404401, -0.01198639082531873)\n"
 877 |      ]
 878 |     }
 879 |    ],
 880 |    "source": [
 881 |     "print(se_gross,me_gross,cil_gross,ciu_gross)"
 882 |    ]
 883 |   },
 884 |   {
 885 |    "cell_type": "code",
 886 |    "execution_count": 23,
 887 |    "metadata": {
 888 |     "collapsed": false
 889 |    },
 890 |    "outputs": [
 891 |     {
 892 |      "data": {
 893 |       "text/plain": [
 894 |        "-0.0048737226745441675"
 895 |       ]
 896 |      },
 897 |      "execution_count": 23,
 898 |      "metadata": {},
 899 |      "output_type": "execute_result"
 900 |     }
 901 |    ],
 902 |    "source": [
 903 |     "NetConversion_diff = NetConversion_exp - NetConversion_cont\n",
 904 |     "NetConversion_diff"
 905 |    ]
 906 |   },
 907 |   {
 908 |    "cell_type": "code",
 909 |    "execution_count": 24,
 910 |    "metadata": {
 911 |     "collapsed": false
 912 |    },
 913 |    "outputs": [],
 914 |    "source": [
 915 |     "se_net,me_net,cil_net,ciu_net = stats_prop(NetConversion,1.96,clicks_cont,\n",
 916 |     "                                           clicks_exp,NetConversion_diff)"
 917 |    ]
 918 |   },
 919 |   {
 920 |    "cell_type": "code",
 921 |    "execution_count": 25,
 922 |    "metadata": {
 923 |     "collapsed": false
 924 |    },
 925 |    "outputs": [
 926 |     {
 927 |      "name": "stdout",
 928 |      "output_type": "stream",
 929 |      "text": [
 930 |       "(0.0034341335129324238, 0.0067309016853475505, -0.011604624359891718, 0.001857179010803383)\n"
 931 |      ]
 932 |     }
 933 |    ],
 934 |    "source": [
 935 |     "print(se_net,me_net,cil_net,ciu_net)"
 936 |    ]
 937 |   },
 938 |   {
 939 |    "cell_type": "code",
 940 |    "execution_count": 26,
 941 |    "metadata": {
 942 |     "collapsed": false
 943 |    },
 944 |    "outputs": [],
 945 |    "source": [
 946 |     "df_SignTest = pd.merge(df_control_notnull,df_experiment_notnull,on=\"Date\")\n",
 947 |     "df_SignTest['GrossConversion_cont'] = df_SignTest.Enrollments_x/df_SignTest.Clicks_x\n",
 948 |     "df_SignTest['GrossConversion_exp'] = df_SignTest.Enrollments_y/df_SignTest.Clicks_y\n",
 949 |     "df_SignTest['NetConversion_cont'] = df_SignTest.Payments_x/df_SignTest.Clicks_x\n",
 950 |     "df_SignTest['NetConversion_exp'] = df_SignTest.Payments_y/df_SignTest.Clicks_y\n",
 951 |     "\n",
 952 |     "cols = ['Date','GrossConversion_cont','GrossConversion_exp','NetConversion_cont','NetConversion_exp']\n"
 953 |    ]
 954 |   },
 955 |   {
 956 |    "cell_type": "code",
 957 |    "execution_count": 27,
 958 |    "metadata": {
 959 |     "collapsed": false
 960 |    },
 961 |    "outputs": [],
 962 |    "source": [
 963 |     "df_SignTest = df_SignTest[cols]"
 964 |    ]
 965 |   },
 966 |   {
 967 |    "cell_type": "code",
 968 |    "execution_count": 28,
 969 |    "metadata": {
 970 |     "collapsed": false
 971 |    },
 972 |    "outputs": [
 973 |     {
 974 |      "data": {
 975 |       "text/html": [
 976 |        "<div>\n",
 977 |        "<table border=\"1\" class=\"dataframe\">\n",
 978 |        "  <thead>\n",
 979 |        "    <tr style=\"text-align: right;\">\n",
 980 |        "      <th></th>\n",
 981 |        "      <th>Date</th>\n",
 982 |        "      <th>GrossConversion_cont</th>\n",
 983 |        "      <th>GrossConversion_exp</th>\n",
 984 |        "      <th>NetConversion_cont</th>\n",
 985 |        "      <th>NetConversion_exp</th>\n",
 986 |        "    </tr>\n",
 987 |        "  </thead>\n",
 988 |        "  <tbody>\n",
 989 |        "    <tr>\n",
 990 |        "      <th>0</th>\n",
 991 |        "      <td>Sat, Oct 11</td>\n",
 992 |        "      <td>0.195051</td>\n",
 993 |        "      <td>0.153061</td>\n",
 994 |        "      <td>0.101892</td>\n",
 995 |        "      <td>0.049563</td>\n",
 996 |        "    </tr>\n",
 997 |        "    <tr>\n",
 998 |        "      <th>1</th>\n",
 999 |        "      <td>Sun, Oct 12</td>\n",
1000 |        "      <td>0.188703</td>\n",
1001 |        "      <td>0.147771</td>\n",
1002 |        "      <td>0.089859</td>\n",
1003 |        "      <td>0.115924</td>\n",
1004 |        "    </tr>\n",
1005 |        "    <tr>\n",
1006 |        "      <th>2</th>\n",
1007 |        "      <td>Mon, Oct 13</td>\n",
1008 |        "      <td>0.183718</td>\n",
1009 |        "      <td>0.164027</td>\n",
1010 |        "      <td>0.104510</td>\n",
1011 |        "      <td>0.089367</td>\n",
1012 |        "    </tr>\n",
1013 |        "    <tr>\n",
1014 |        "      <th>3</th>\n",
1015 |        "      <td>Tue, Oct 14</td>\n",
1016 |        "      <td>0.186603</td>\n",
1017 |        "      <td>0.166868</td>\n",
1018 |        "      <td>0.125598</td>\n",
1019 |        "      <td>0.111245</td>\n",
1020 |        "    </tr>\n",
1021 |        "    <tr>\n",
1022 |        "      <th>4</th>\n",
1023 |        "      <td>Wed, Oct 15</td>\n",
1024 |        "      <td>0.194743</td>\n",
1025 |        "      <td>0.168269</td>\n",
1026 |        "      <td>0.076464</td>\n",
1027 |        "      <td>0.112981</td>\n",
1028 |        "    </tr>\n",
1029 |        "  </tbody>\n",
1030 |        "</table>\n",
1031 |        "</div>"
1032 |       ],
1033 |       "text/plain": [
1034 |        "          Date  GrossConversion_cont  GrossConversion_exp  NetConversion_cont  \\\n",
1035 |        "0  Sat, Oct 11              0.195051             0.153061            0.101892   \n",
1036 |        "1  Sun, Oct 12              0.188703             0.147771            0.089859   \n",
1037 |        "2  Mon, Oct 13              0.183718             0.164027            0.104510   \n",
1038 |        "3  Tue, Oct 14              0.186603             0.166868            0.125598   \n",
1039 |        "4  Wed, Oct 15              0.194743             0.168269            0.076464   \n",
1040 |        "\n",
1041 |        "   NetConversion_exp  \n",
1042 |        "0           0.049563  \n",
1043 |        "1           0.115924  \n",
1044 |        "2           0.089367  \n",
1045 |        "3           0.111245  \n",
1046 |        "4           0.112981  "
1047 |       ]
1048 |      },
1049 |      "execution_count": 28,
1050 |      "metadata": {},
1051 |      "output_type": "execute_result"
1052 |     }
1053 |    ],
1054 |    "source": [
1055 |     "df_SignTest.head()"
1056 |    ]
1057 |   },
1058 |   {
1059 |    "cell_type": "code",
1060 |    "execution_count": 29,
1061 |    "metadata": {
1062 |     "collapsed": true
1063 |    },
1064 |    "outputs": [],
1065 |    "source": [
1066 |     "df_SignTest['GC_Sign'] = df_SignTest.GrossConversion_cont - df_SignTest.GrossConversion_exp\n",
1067 |     "df_SignTest['NC_Sign'] = df_SignTest.NetConversion_cont - df_SignTest.NetConversion_exp"
1068 |    ]
1069 |   },
1070 |   {
1071 |    "cell_type": "code",
1072 |    "execution_count": 30,
1073 |    "metadata": {
1074 |     "collapsed": false
1075 |    },
1076 |    "outputs": [
1077 |     {
1078 |      "data": {
1079 |       "text/plain": [
1080 |        "23"
1081 |       ]
1082 |      },
1083 |      "execution_count": 30,
1084 |      "metadata": {},
1085 |      "output_type": "execute_result"
1086 |     }
1087 |    ],
1088 |    "source": [
1089 |     "len(df_SignTest)"
1090 |    ]
1091 |   },
1092 |   {
1093 |    "cell_type": "code",
1094 |    "execution_count": 31,
1095 |    "metadata": {
1096 |     "collapsed": false
1097 |    },
1098 |    "outputs": [
1099 |     {
1100 |      "data": {
1101 |       "text/plain": [
1102 |        "19"
1103 |       ]
1104 |      },
1105 |      "execution_count": 31,
1106 |      "metadata": {},
1107 |      "output_type": "execute_result"
1108 |     }
1109 |    ],
1110 |    "source": [
1111 |     "len(df_SignTest[df_SignTest.GC_Sign > 0])"
1112 |    ]
1113 |   },
1114 |   {
1115 |    "cell_type": "code",
1116 |    "execution_count": 32,
1117 |    "metadata": {
1118 |     "collapsed": false
1119 |    },
1120 |    "outputs": [
1121 |     {
1122 |      "data": {
1123 |       "text/plain": [
1124 |        "13"
1125 |       ]
1126 |      },
1127 |      "execution_count": 32,
1128 |      "metadata": {},
1129 |      "output_type": "execute_result"
1130 |     }
1131 |    ],
1132 |    "source": [
1133 |     "len(df_SignTest[df_SignTest.NC_Sign > 0])"
1134 |    ]
1135 |   }
1136 |  ],
1137 |  "metadata": {
1138 |   "kernelspec": {
1139 |    "display_name": "Python 2",
1140 |    "language": "python",
1141 |    "name": "python2"
1142 |   },
1143 |   "language_info": {
1144 |    "codemirror_mode": {
1145 |     "name": "ipython",
1146 |     "version": 2
1147 |    },
1148 |    "file_extension": ".py",
1149 |    "mimetype": "text/x-python",
1150 |    "name": "python",
1151 |    "nbconvert_exporter": "python",
1152 |    "pygments_lexer": "ipython2",
1153 |    "version": "2.7.10"
1154 |   }
1155 |  },
1156 |  "nbformat": 4,
1157 |  "nbformat_minor": 0
1158 | }
1159 | 


--------------------------------------------------------------------------------
/Final Project- Experiment Screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baumanab/udacity_ABTesting/36bc61e56e0d8485573ed17d4f6af9471e700f94/Final Project- Experiment Screenshot.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # A/B Testing to Determine an Effective Intervention to Decrease Early Udacity Course Cancellation
  2 | 
  3 | ## Experiment Description
  4 | 
  5 | At the time of the experiment described herein, the Udacity course home pages have two options: "start free trial" and "access course materials."  Clicking "start free trial" prompts the user to enter their credit card information, subsequently enrolling them in a 14 day free trial of the course, after which they are automatically charged.  Users who click "access course materials" will be able to view course content but receive no coaching support, verified certificate, or project feedback.  
  6 | 
  7 | For this experiment Udacity tested a change wherein those users who clicked "start free trial" were asked how much time they were willing to devote to the course.  Users choosing 5 or more hours per week would be taken through the checkout process as usual.  For users indicating fewer than 5 hours per week a message would appear indicating the need for a greater time commitment to enable success and suggesting they might like to access the free content.  At this point the student would have the option to continue enrolling in the free trial or access the course materials for free.  
  8 | This screenshot shows the experiment:
  9 | 
 10 | ![Experiment Screenshot](Final%20Project-%20Experiment%20Screenshot.png)
 11 | 
 12 | The rationale for this change is that diverting students as a function of time to devote to study might improve the overall student experience and the coaches' capacity to support students who are likely to complete the course, without significantly reducing the number of students who continue past the free trial.
 13 | 
 14 | ## Experiment Design
 15 | 
 16 | The initial unit of diversion to the control and experiment groups is a unique cookie.  However, once a student enrolls in the free trial, they are tracked by user-id.  The same user-id can't enroll more than once.  Users who don't enroll are not tracked by user-id.  Note that the uniqueness of a cookie is determined per day.
 17 | 
 18 | 
 19 | ### Metric Choice
 20 | 
 21 | _**Invariant Metrics:** number of cookies, number of clicks, click-through-probability_
 22 | 
 23 | _**Evaluation Metrics:** gross conversion, retention, net conversion_
 24 | 
 25 | 
 26 | #### Invariant Metrics
 27 | 
 28 | Invariant metrics were chosen due to their expected property of being... well, invariant.  One would expect a similar distribution into control and experiment for the following metrics.  If one were to find that this is not the case, it could be indicative of feeding mogwai after midnight.
 29 | 
 30 | **Number of Cookies:** The number of unique cookies to visit the course overview page.  This is the unit of diversion and even distribution amongst the control and experiment groups is expected.  It is therefore appropriate as an invariant metric.
 31 | 
 32 | **Number of Clicks:** The number of users (tracked as unique cookies at this stage) to click the free trial button. This is appropriate as an invariant metric but not an evaluation metric.  Equal distribution amongst the experiment and control groups would be expected since at this point in the funnel the experience is the same for all users and therefore elements of the experiment would not be expected to impact clicking the "start free trial" button.
 33 | 
 34 | **Click-through-probability:** Unique cookies to click the "start free trial" button per unique cookies to view the course overview page. Equal distribution amongst the experiment and control groups would be expected since at this point in the funnel the experience is the same for all users and therefore elements of the experiment would not be expected to impact clicking the "start free trial" button.    
 35 |  
 36 | 
 37 | #### Evaluation Metrics
 38 | 
 39 | Evaluation metrics were chosen since there is the possibility of different distributions between experiment and control groups as a function of the experiment.  Each evaluation metric is associated with a minimum difference (dmin) that must be observed for consideration in the decision to launch the experiment. The ultimate goal is to minimize student frustration, maximize student satisfaction, and make the most effective use of limited coaching resources.  Cancelling early may be one indication of frustration or low satisfaction, and the more students enroll in the course without making at least one payment, much less finishing the course, the less effectively coaching resources are being used.  With this in mind, in order to consider launching the experiment either of the following must be observed:
 40 | 
 41 | - Increased retention (more students staying beyond the free trial in the experiment group)
 42 | - Decreased Gross Conversion coupled to increased Net Conversion (fewer students enrolling in the free trial but more students staying beyond the free trial)
 43 | 
 44 | 
 45 | **Gross Conversion:**  This is the number of user-ids to complete checkout and enroll in the free trial per unique cookie to click the "start free trial" button.  dmin = 0.01
 46 | 
 47 | 
 48 | **Retention:**  This is the number of user-ids to remain enrolled past the 14 day trial period, making at least one payment, per number of user-ids to complete checkout.  The practical minimum difference (dmin) = 0.01
 49 | 
 50 | 
 51 | **Net Conversion:** this is the number of user-ids to remain enrolled past the 14 day trial, making at least one payment, per the number of unique cookies to click the "start free trial" button.  dmin = 0.0075
 52 | 
 53 | #### Unused Metrics
 54 | 
 55 | **Number of user-ids:** The number of users to enroll in the free trial.  This is not a suitable invariant metric and while it could be used as an evaluation metric, it is not ideal.  User-ids are tracked only after enrolling in the free trial and equal distribution between the control and experimental branches would not be expected.  User-id count could be used to evaluate how many enrollments stayed beyond the 14 day free trial boundary, but since it isn't normalized, I have elected not to use it.
 56 | 
 57 | 
 58 | ## Measuring Standard Deviation
 59 | 
 60 | **_Analytical Estimate of Standard Deviation_**
 61 | 
 62 | | Evaluation Metric | Standard Deviation |
 63 | |:-------------------:|:--------------------:|
 64 | | Gross Conversion  | .0202 |
 65 | | Retention         | .0549 |
 66 | | Net Conversion    | .0156 |
 67 | 
 68 | The analytical estimate of standard deviation tends to be near the empirically determined standard deviation for those cases in which the unit of diversion is equal to the unit of analysis.  This is the case for Gross Conversion and Net Conversion, but not Retention.  If we do ultimately decide to use Retention, then we should calculate the empirical variability.
 69 | 
 70 | 
 71 | ## Sizing
 72 | 
 73 | The following calculations are based on [baseline conversion data](data/baseline_vals.csv).
 74 | 
 75 | ### Number of Samples vs. Power
 76 | 
 77 | My initial approach will not deploy the Bonferroni correction; that decision will be made based on my final choice of evaluation metrics and associated criteria. Pageviews required for each metric were calculated using an alpha value of 0.05 and beta value of 0.2.
 78 | 
 79 | **_Pageviews for Each Evaluation Metric to Achieve Target Statistical Power_**
 80 | 
 81 | #### Gross Conversion
 82 | 
 83 | - Baseline Conversion: 20.625%
 84 | - Minimum Detectable Effect: 1%
 85 | - alpha: 5%
 86 | - beta: 20%
 87 | - 1 - beta: 80%
 88 | - sample size = 25,835 clicks/group
 89 | - Number of groups = 2 (experiment and control)
 90 | - total sample size = 51,670 clicks
 91 | - clicks/pageview: 3200/40000 = .08 clicks/pageview
 92 | - pageviews = 51,670/.08 = 645,875
 93 | 
 94 | 
 95 | 
 96 | #### Retention
 97 | 
 98 | - Baseline Conversion: 53%
 99 | - Minimum Detectable Effect: 1%
100 | - alpha: 5%
101 | - beta: 20%
102 | - 1 - beta: 80%
103 | - sample size = 39,115 enrollments/group
104 | - Number of groups = 2 (experiment and control)
105 | - total sample size = 78,230 enrollments
106 | - enrollments/pageview: 660/40000 = .0165 enrollments/pageview
107 | - pageviews = 78,230/.0165 = 4,741,212
108 | 
109 | #### Net Conversion
110 | 
111 | - Baseline Conversion: 10.9313%
112 | - Minimum Detectable Effect: .75%
113 | - alpha: 5%
114 | - beta: 20%
115 | - 1 - beta: 80%
116 | - sample size = 27,413 clicks/group
117 | - Number of groups = 2 (experiment and control)
118 | - total sample size = 54,826 clicks
119 | - clicks/pageview: 3200/40000 = .08 clicks/pageview
120 | - pageviews = 54,826/.08 = 685,325
121 | 
122 | _Pageviews Required:  4,741,212_
123 | 
124 | 
125 | ### Duration vs. Exposure
126 | 
127 | If we divert 100% of traffic, given 40,000 page views per day, the experiment would take ~ 119 days.  If we eliminate retention, we are left with Gross Conversion and Net Conversion.  This reduces the number of required pageviews to 685,325, and an ~ 18 day experiment with 100% diversion and ~ 35 days given 50% diversion.  
128 | 
129 | A 119 day experiment with 100% diversion of traffic presents both a business risk (potential for: frustrated students, lower conversion and retention, and inefficient use of coaching resources) and an opportunity risk (performing other experiments).  However, in general, this is not a risky experiment as the change would not be expected to cause a precipitous drop in enrollment.  In terms of timing, an 18 day experiment is more reasonable, but % diversion may be scaled down depending on other experiments of interest to be performed concurrently.
130 | 
131 | 
132 | ## Experiment Analysis
133 | 
134 | The experimental data can be found in the following links:
135 | 
136 | - [experiment group](data/Final%20Project%20Results%20-%20Experiment.csv)
137 | - [control group](data/Final%20Project%20Results%20-%20Control.csv)
138 | 
139 | ### Sanity Checks
140 | 
141 | For invariant metrics we expect equal diversion into the experiment and control groups.  We will test this at the 95% confidence level.
142 | 
143 | | Metric | Expected Value | Observed Value | CI Lower Bound | CI Upper Bound | Result |
144 | |:------:|:--------------:|:--------------:|:--------------:|:--------------:|:------:|
145 | | Number of Cookies | 0.5000 | 0.5006 | 0.4988 | 0.5012 | Pass |
146 | | Number of clicks on "start free trial" | 0.5000 | 0.5005 | 0.4959 | 0.5042 | Pass |
147 | | Click-through-probability | 0.0821 | 0.0822 | 0.0812 | 0.0830 | Pass | 
148 | 
149 | 
150 | ### Result Analysis
151 | 
152 | 95% Confidence interval for the difference between the experiment and control group for evaluation metrics.
153 | 
154 | | Metric | dmin | Observed Difference | CI Lower Bound | CI Upper Bound | Result |
155 | |:------:|:--------------:|:--------------:|:--------------:|:--------------:|:------:|
156 | | Gross Conversion | 0.01 | -0.0205 | -0.0291 | -0.0120 | Statistically and Practically Significant |
157 | | Net Conversion | 0.0075 | -0.0049 | -0.0116 | 0.0019 | Neither Statistically nor Practically Significant |
158 | 
159 | 
160 | ### Sign Tests
161 | 
162 | | Metric | p-value for sign test | Statistically Significant @ alpha .05? |
163 | |:------:|:--------------:|:--------------:|
164 | | Gross Conversion | 0.0026 | Yes |
165 | | Net Conversion | 0.6776 | No |
166 | 
167 | ## Summary
168 | 
169 | An experiment was conducted in which potential Udacity students were diverted by cookie into two groups, experiment and control.  The experiment group was asked to input the amount of time they are willing to devote to study, after clicking a "start free trial" button, whereas the control group was not.  Three invariant metrics (Number of Cookies, Number of clicks on "start free trial", and Click-Through-Probability) were chosen for purposes of validation and sanity checking while Gross Conversion (enrollments/click) and Net Conversion (payments/click) served as evaluation metrics.  The null hypothesis is that there is no difference in the evaluation metrics between the groups; furthermore, a practical significance threshold was set for each metric.  The requirement for launching the experiment is that the null hypothesis must be rejected for ALL evaluation metrics and that the difference between branches must meet or exceed the practical significance threshold.  Because our acceptance criteria require statistically significant differences for ALL evaluation metrics, the use of the Bonferroni correction is not appropriate.  The Bonferroni correction is a method for controlling for type I errors (false positives) when using multiple metrics in which relevance of ANY of the metrics matches the hypothesis.  In that case the risk of type I errors increases as the number of metrics increases (significance by random chance).  In our case, in which ALL metrics must be relevant to launch, the risk of type II errors (false negatives) increases as the number of metrics increases, so it stands to reason that controlling for false positives is not consistent with our acceptance criteria.
170 | 
171 | Analysis revealed the expected equal distribution of cookies into the control and experimental groups, for the invariant metrics, at the 95% confidence level.  A difference in gross conversion was found to be statistically significant at the 95% confidence level, and the null hypothesis was rejected.  Gross conversion also met the practical significance threshold.  Net Conversion was found to be neither statistically nor practically significant at the 95% confidence level.
172 | 
173 | 
174 | ## Recommendation
175 | 
176 | This experiment was designed to determine whether filtering students as a function of study time commitment would improve the overall student experience and the coaches' capacity to support students who are likely to complete the course, without significantly reducing the number of students who continue past the free trial. A statistically and practically significant decrease in Gross Conversion was observed but with no significant differences in Net Conversion. This translates to a decrease in enrollment not coupled to an increase in students staying for the requisite 14 days to trigger payment.  Considering this, my recommendation is not to launch, but rather to pursue other experiments.
177 | 
178 | 
179 | ## Follow-Up Experiment
180 | 
181 | The construct of student frustration could be assigned an operational definition of "cancel early," where a convenient definition and measure of early cancellation is prior to the end of the 14 day trial period in which payment is triggered.  An early cancellation is not necessarily indicative of frustration but could be from other causes, such as a course not being aligned to the student's needs or expectations in terms of content. For preventing early cancellation there are two primary logical timepoint opportunities for intervention, (1) pre-enrollment, and (2) post-enrollment but pre-payment.
182 | 
183 | The first opportunity for intervention was explored above wherein a poll regarding time commitment was used to filter out students likely to become frustrated.  This filter focused only on time commitment to the class and did not address other reasons why a student might become frustrated and cancel early. Even if the student was sincere in their response and diligent in their study, they may become frustrated if they don't have the suggested pre-requisite skills and experience.  That is, their committed time may not be enough if they don't come in with the pre-requisite skill set.  Adding a checklist of pre-requisite skills to the popup regarding time commitment may be informative.  This experiment would leverage the infrastructure and data pipeline of the original experiment and be set up in the same way as the original, including the unit of diversion.  The only difference would be the information in the form.  If the student's answer meets the time and pre-requisite requirements (radiobox checklist) they are directed to enroll in the free trial, otherwise they are encouraged to access the free version.  This experiment would be low cost in terms of resources and may increase the selectivity of the pre-enrollment filter.  A successful experiment would be one in which there is a significant decrease in Gross Conversion coupled to a significant increase in Net Conversion.
184 | 
185 | A variety of approaches could be used to intervene post-enrollment but pre-payment and could be deployed concurrently with pre-enrollment intervention.  An ideal approach would be one which minimizes the use of additional coaching resources to best meet the original intent of the intervention.  An effective approach may be to employ peer coaching/guidance by means of team formation.  If a student has a team of other students which they could consult, discuss coursework and frustrations with, and be accountable to, they may be more likely to stick out the growing pains and stay for the long term.  The experiment would function in the following manner.
186 | 
187 | **Setup:** Upon enrollment students will either be randomly assigned to a control group in which they are not funnelled into a team, or an experiment group in which they are.
188 | 
189 | **Null Hypothesis:** Participation in a team will not increase the number of students enrolled beyond the 14 day free trial period by a significant amount.
190 | 
191 | **Unit of Diversion:** The unit of diversion will be user-id as the change takes place after a student creates an account and enrolls in a course.
192 | 
193 | **Invariant Metrics:**  The invariant metric will be user-id, since an equal distribution between experiment and control would be expected as a property of the setup.
194 | 
195 | **Evaluation Metrics:** The evaluation metric will be Retention.  A statistically and practically significant increase in Retention would indicate that the change is successful.
196 | 
197 | If a statistically and practically significant positive change in Retention is observed, assuming an acceptable impact on overall Udacity resources (setting up and maintaining teams will require resource use), the experiment will be launched.
198 | 
199 | 


--------------------------------------------------------------------------------
/data/Final Project Results - Control.csv:
--------------------------------------------------------------------------------
 1 | Date,Pageviews,Clicks,Enrollments,Payments
 2 | "Sat, Oct 11",7723,687,134,70
 3 | "Sun, Oct 12",9102,779,147,70
 4 | "Mon, Oct 13",10511,909,167,95
 5 | "Tue, Oct 14",9871,836,156,105
 6 | "Wed, Oct 15",10014,837,163,64
 7 | "Thu, Oct 16",9670,823,138,82
 8 | "Fri, Oct 17",9008,748,146,76
 9 | "Sat, Oct 18",7434,632,110,70
10 | "Sun, Oct 19",8459,691,131,60
11 | "Mon, Oct 20",10667,861,165,97
12 | "Tue, Oct 21",10660,867,196,105
13 | "Wed, Oct 22",9947,838,162,92
14 | "Thu, Oct 23",8324,665,127,56
15 | "Fri, Oct 24",9434,673,220,122
16 | "Sat, Oct 25",8687,691,176,128
17 | "Sun, Oct 26",8896,708,161,104
18 | "Mon, Oct 27",9535,759,233,124
19 | "Tue, Oct 28",9363,736,154,91
20 | "Wed, Oct 29",9327,739,196,86
21 | "Thu, Oct 30",9345,734,167,75
22 | "Fri, Oct 31",8890,706,174,101
23 | "Sat, Nov 1",8460,681,156,93
24 | "Sun, Nov 2",8836,693,206,67
25 | "Mon, Nov 3",9437,788,,
26 | "Tue, Nov 4",9420,781,,
27 | "Wed, Nov 5",9570,805,,
28 | "Thu, Nov 6",9921,830,,
29 | "Fri, Nov 7",9424,781,,
30 | "Sat, Nov 8",9010,756,,
31 | "Sun, Nov 9",9656,825,,
32 | "Mon, Nov 10",10419,874,,
33 | "Tue, Nov 11",9880,830,,
34 | "Wed, Nov 12",10134,801,,
35 | "Thu, Nov 13",9717,814,,
36 | "Fri, Nov 14",9192,735,,
37 | "Sat, Nov 15",8630,743,,
38 | "Sun, Nov 16",8970,722,,


--------------------------------------------------------------------------------
/data/Final Project Results - Experiment.csv:
--------------------------------------------------------------------------------
 1 | Date,Pageviews,Clicks,Enrollments,Payments
 2 | "Sat, Oct 11",7716,686,105,34
 3 | "Sun, Oct 12",9288,785,116,91
 4 | "Mon, Oct 13",10480,884,145,79
 5 | "Tue, Oct 14",9867,827,138,92
 6 | "Wed, Oct 15",9793,832,140,94
 7 | "Thu, Oct 16",9500,788,129,61
 8 | "Fri, Oct 17",9088,780,127,44
 9 | "Sat, Oct 18",7664,652,94,62
10 | "Sun, Oct 19",8434,697,120,77
11 | "Mon, Oct 20",10496,860,153,98
12 | "Tue, Oct 21",10551,864,143,71
13 | "Wed, Oct 22",9737,801,128,70
14 | "Thu, Oct 23",8176,642,122,68
15 | "Fri, Oct 24",9402,697,194,94
16 | "Sat, Oct 25",8669,669,127,81
17 | "Sun, Oct 26",8881,693,153,101
18 | "Mon, Oct 27",9655,771,213,119
19 | "Tue, Oct 28",9396,736,162,120
20 | "Wed, Oct 29",9262,727,201,96
21 | "Thu, Oct 30",9308,728,207,67
22 | "Fri, Oct 31",8715,722,182,123
23 | "Sat, Nov 1",8448,695,142,100
24 | "Sun, Nov 2",8836,724,182,103
25 | "Mon, Nov 3",9359,789,,
26 | "Tue, Nov 4",9427,743,,
27 | "Wed, Nov 5",9633,808,,
28 | "Thu, Nov 6",9842,831,,
29 | "Fri, Nov 7",9272,767,,
30 | "Sat, Nov 8",8969,760,,
31 | "Sun, Nov 9",9697,850,,
32 | "Mon, Nov 10",10445,851,,
33 | "Tue, Nov 11",9931,831,,
34 | "Wed, Nov 12",10042,802,,
35 | "Thu, Nov 13",9721,829,,
36 | "Fri, Nov 14",9304,770,,
37 | "Sat, Nov 15",8668,724,,
38 | "Sun, Nov 16",8988,710,,


--------------------------------------------------------------------------------
/data/baseline_vals.csv:
--------------------------------------------------------------------------------
1 | Unique cookies to view page per day:,40000
2 | "Unique cookies to click ""Start free trial"" per day:",3200
3 | Enrollments per day:,660
4 | "Click-through-probability on ""Start free trial"":",0.08
5 | "Probability of enrolling, given click:",0.20625
6 | "Probability of payment, given enroll:",0.53
7 | "Probability of payment, given click",0.1093125


--------------------------------------------------------------------------------
/instructions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/baumanab/udacity_ABTesting/36bc61e56e0d8485573ed17d4f6af9471e700f94/instructions.pdf


--------------------------------------------------------------------------------
	metric	baseline_val
0	unique cookies to view page per day:	40000.000000
1	unique cookies to click \"start free trial\" per...	3200.000000
2	enrollments per day:	660.000000
3	click-through-probability on \"start free trial\":	0.080000
4	probability of enrolling, given click:	0.206250
5	probability of payment, given enroll:	0.530000
6	probability of payment, given click	0.109313
	Control	Experiment
cookies	345543	344660
clicks	28378	28325
enrollments	3785	3423
payments	2033	1945
	Control	Experiment
cookies	212163	211362
clicks	17293	17260
enrollments	3785	3423
payments	2033	1945
	Date	GrossConversion_cont	GrossConversion_exp	NetConversion_cont	NetConversion_exp
0	Sat, Oct 11	0.195051	0.153061	0.101892	0.049563
1	Sun, Oct 12	0.188703	0.147771	0.089859	0.115924
2	Mon, Oct 13	0.183718	0.164027	0.104510	0.089367
3	Tue, Oct 14	0.186603	0.166868	0.125598	0.111245
4	Wed, Oct 15	0.194743	0.168269	0.076464	0.112981