├── .gitignore ├── data ├── Chinook_Sqlite.sqlite ├── Ulysses.txt ├── animals.txt ├── animals2.txt ├── class.txt ├── crash.json ├── major.txt ├── north_carolina_bicycle_crash_data_heatmap_.json ├── reed.xml ├── student.txt └── student_class.txt ├── exams ├── ExtrraCredit-Solutions.ipynb ├── ExtrraCredit.ipynb ├── HtWt.csv ├── Midterm-Revised-Solutions-Final.ipynb ├── Midterm-Revised.ipynb ├── dexp.png └── xy.csv ├── homework ├── Homework01.ipynb ├── Homework01_Solutions.ipynb ├── Homework02.ipynb ├── Homework02_Solutions.ipynb ├── Homework03.ipynb ├── Homework03_Solutions.ipynb ├── Homework04.ipynb ├── Homework04_Solutions.ipynb ├── Homework05.ipynb ├── Homework05_Solutions.ipynb ├── Homework06.ipynb ├── Homework06_Solutions.ipynb ├── Homework07.ipynb ├── Homework07_Solutions.ipynb ├── Homework08.ipynb ├── Homework08_Solutions.ipynb ├── Homework09.ipynb ├── Homework09_Solutions.ipynb ├── Homework10.ipynb ├── Homework10_Solutions.ipynb ├── milkmaid.png ├── mystery.txt ├── pubmed.pic ├── x1d.npy └── x2d.npy ├── images ├── Scraping data.png └── hw2_q4.png ├── lectures ├── 01_Introduction_To_Python.ipynb ├── 02A_Functions.ipynb ├── 02B_Strings.ipynb ├── 02C_IO.ipynb ├── 02D_Classes.ipynb ├── 03A_Numbers.ipynb ├── 03B_Graphics.ipynb ├── 04A_Data.ipynb ├── 04B_SQL.ipynb ├── 05_Machine_Learning.ipynb ├── 06_LinearAlgebra1.ipynb ├── 07_LinearAlgebra2.ipynb ├── 08_LinearAlgebraExamples.ipynb ├── 09_PCA.ipynb ├── 10_SymbolicAlgebra.ipynb ├── 11_OptimizationOneDimension.ipynb ├── 12_MultivariateOptimizationAlgorithms.ipynb ├── 13_Optimization.ipynb ├── 14_ExpectationMaximization.ipynb ├── 15A_RandomNumbers.ipynb ├── 15B_ResamplingAndSimulation.ipynb ├── 15C_MonteCarloIntegration.ipynb ├── 16A_MCMC.ipynb ├── 16B_AuxiliaryVariableMCMC.ipynb ├── 16C_PyMC3.ipynb ├── 16D_PyStan.ipynb ├── 17A_C_Crash_Course.ipynb ├── 17B_C_InOneLecture.ipynb ├── 17C_C++_Primer_Solutions.ipynb ├── 17D_Review_C_C++.ipynb ├── 18A_CodeOptimization.ipynb ├── 18B_Foreing_Language_Interface.ipynb ├── 18C_Numba.ipynb ├── 18D_Cython.ipynb ├── 18E_Benchmarks.ipynb ├── 18F_Optimization_Bakeoff.ipynb ├── 19A_Parallel_Programming.ipynb ├── 19B_Threads_Processses_Concurrency.ipynb ├── 19C_IPyParallel.ipynb ├── 20A_Intermediate_Sized_Data.ipynb ├── 20B_Big_Data_Structures.ipynb ├── 21A_Introduction_To_Spark.ipynb ├── 21B_Efficiency_In_Spark.ipynb ├── 21C_Spark_SQL.ipynb ├── 21D_Spark_MLib.ipynb ├── 21E_Spark_And_Sklearn.ipynb ├── 21F_Spark_GraphX.ipynb ├── 21G_Spark_Streaming,ipynb ├── 21H_Spark_Cloud.ipynb ├── Customizing_Jupyter.ipynb ├── ExercisesForLab01-Solutions.ipynb ├── ExercisesForLab01.ipynb ├── Extra_Packages.ipynb ├── HtWt.csv ├── Lagrange_multiplier.png ├── Local_Installation.ipynb ├── Makefile ├── Spark01.ipynb ├── Spark02.ipynb ├── Spark03.ipynb ├── Spark04.ipynb ├── Stuff.ipynb ├── Template01.ipynb ├── commutative.png ├── conf.py ├── data │ ├── Portrait.txt │ ├── Ulysses.txt │ ├── adult.data.txt │ ├── adult.names.txt │ ├── adult.test.txt │ ├── sonar.all-data.txt │ ├── sonar.mines.txt │ ├── sonar.names.txt │ └── sonar.rocks.txt ├── em.png ├── index.rst ├── jensen.png ├── julia_benchmarks.pic ├── looking_glass.txt ├── make.bat ├── mcmc.png ├── my_module.py ├── radon.csv ├── spectral.png └── sphinx-readme ├── misc ├── Customizing_Jupyter.ipynb ├── Local_Installation.ipynb ├── Recommended_Books.ipynb ├── Spark_Test_Drive.ipynb ├── TopicCoverageForMidterm.ipynb └── old-exams │ ├── Midterm-Revised.ipynb │ ├── Midterm-Sample-Revised.ipynb │ └── milkmaid.png ├── projects ├── FinalProject.ipynb └── FinalProjectGuide.ipynb └── 
syllabus.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.npy 2 | *.npyz 3 | *main* 4 | *.so 5 | *.dylib 6 | *.json 7 | *.txt 8 | ex[0-9]* 9 | *.csv 10 | Untitled* 11 | *.o 12 | *.c 13 | *.cpp 14 | *.h 15 | *.hpp 16 | *.pyx 17 | *pxd 18 | *.pic 19 | trace* 20 | .DS_Store 21 | RISE/ 22 | *ipynb_checkpoints* 23 | *mplstyle* 24 | lectures/_build/ 25 | data/ 26 | web/ 27 | 28 | # Byte-compiled / optimized / DLL files 29 | __pycache__/ 30 | *.py[cod] 31 | 32 | # C extensions 33 | *.so 34 | 35 | # Distribution / packaging 36 | .Python 37 | env/ 38 | build/ 39 | develop-eggs/ 40 | dist/ 41 | downloads/ 42 | eggs/ 43 | .eggs/ 44 | lib/ 45 | lib64/ 46 | parts/ 47 | sdist/ 48 | var/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | 53 | # PyInstaller 54 | # Usually these files are written by a python script from a template 55 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 56 | *.manifest 57 | *.spec 58 | 59 | # Installer logs 60 | pip-log.txt 61 | pip-delete-this-directory.txt 62 | 63 | # Unit test / coverage reports 64 | htmlcov/ 65 | .tox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *,cover 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | -------------------------------------------------------------------------------- /data/Chinook_Sqlite.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/data/Chinook_Sqlite.sqlite -------------------------------------------------------------------------------- /data/animals.txt: -------------------------------------------------------------------------------- 1 | name|species|age|weight 2 | arun|cat|5|7.3 3 | bob|bird|2|1.5 4 | coco|cat|2|5.5 5 | dumbo|elephant|23|454 6 | elmo|dog|5|11 7 | fido|dog|3|24.5 8 | gumba|bird|2|2.7 -------------------------------------------------------------------------------- /data/animals2.txt: -------------------------------------------------------------------------------- 1 | 2 | name|species|age|weight 3 | arun|cat|5|7.3 4 | bob|bird|2|1.5 5 | coco|cat|2|5.5 6 | dumbo|elephant|23|454 7 | elmo|dog|5|11 8 | fido|dog|3|24.5 9 | gumba|bird|2|2.7 10 | -------------------------------------------------------------------------------- /data/class.txt: -------------------------------------------------------------------------------- 1 | class_id,code,name,credits 2 | 1,ANT01,Introduction to Hobbits,4 3 | 2,MAT802,Abstrct Nonsense,8 4 | 3,ENG234,Jabberwocky,2 5 | 4,STA007,Statistics for Secret Agens,4 6 | 5,PHY211,Physics of Star Wars,4 -------------------------------------------------------------------------------- /data/major.txt: -------------------------------------------------------------------------------- 1 | major_id,name 2 | 1,Computer Science 3 | 2,Physics 4 | 3,Statisitcs 5 | 4,English 6 | 5,History -------------------------------------------------------------------------------- /data/student.txt: -------------------------------------------------------------------------------- 1 | 2 | student_id,first,last,email,major_id 3 | 1,frodo,baggins,frodo.baggins@duke.edu,1 4 | 2,bilbo,baggins,b_baggins@duke.edu,3 5 | 3,golum,golum,golum.golum@duke.edu,2 6 | 4,gandalf,white,g.white@duke.edu,5 7 | 5,gandalf,grey,g.grey@duke.edu,6 8 | 
6,saruman,wise,s.wise@duke.edu,2 -------------------------------------------------------------------------------- /data/student_class.txt: -------------------------------------------------------------------------------- 1 | student_id,class_id 2 | 1,3 3 | 1,4 4 | 2,1 5 | 2,4 6 | 3,1 7 | 3,2 8 | 3,3 9 | 3,5 10 | 4,2 11 | 4,5 -------------------------------------------------------------------------------- /exams/HtWt.csv: -------------------------------------------------------------------------------- 1 | male,height,weight 2 | 0,63.2,168.7 3 | 0,68.7,169.8 4 | 0,64.8,176.6 5 | 0,67.9,246.8 6 | 1,68.9,151.6 7 | 1,67.8,158.0 8 | 1,68.2,168.6 9 | 0,64.8,137.2 10 | 1,64.3,177.0 11 | 0,64.7,128.0 12 | 1,66.9,168.4 13 | 1,66.9,136.2 14 | 1,67.1,160.3 15 | 1,70.2,233.9 16 | 1,67.4,171.7 17 | 1,71.1,185.5 18 | 0,63.4,177.6 19 | 1,66.9,132.9 20 | 0,71.0,140.1 21 | 1,70.4,151.9 22 | 0,59.5,147.2 23 | 1,70.4,159.0 24 | 0,61.5,113.0 25 | 1,74.5,194.5 26 | 0,65.3,145.1 27 | 1,68.8,196.5 28 | 0,67.2,148.9 29 | 1,68.7,132.9 30 | 0,60.0,168.4 31 | 0,62.5,146.2 32 | 1,72.0,236.4 33 | 1,67.9,140.0 34 | 1,65.1,156.2 35 | 1,63.5,178.7 36 | 1,68.2,147.5 37 | 0,64.6,97.7 38 | 1,68.1,189.6 39 | 0,66.2,221.9 40 | 0,62.8,168.1 41 | 0,65.3,143.1 42 | 0,65.8,217.7 43 | 0,68.7,133.2 44 | 0,63.8,96.5 45 | 1,70.6,270.6 46 | 0,61.5,137.2 47 | 0,61.9,124.2 48 | 0,65.1,128.3 49 | 1,68.7,203.6 50 | 0,57.6,132.4 51 | 1,66.3,189.4 52 | 1,69.0,174.0 53 | 0,63.4,163.3 54 | 1,69.5,183.5 55 | 1,67.8,193.8 56 | 0,61.6,119.7 57 | 1,71.2,157.4 58 | 1,67.4,146.1 59 | 0,66.1,128.3 60 | 1,70.7,179.1 61 | 0,67.0,140.0 62 | 1,66.8,202.2 63 | 1,69.9,169.4 64 | 0,57.7,122.8 65 | 0,62.5,248.5 66 | 1,66.6,154.4 67 | 0,60.6,140.2 68 | 1,70.4,141.6 69 | 0,66.4,144.4 70 | 0,62.3,116.2 71 | 1,73.3,175.0 72 | -------------------------------------------------------------------------------- /exams/Midterm-Revised.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Mid-term Exam\n", 8 | "====\n", 9 | "\n", 10 | "This is a **closed book** exam except for the one-page cheatsheet. You can use the help function within Jupyter (e.g. `range?`) and links from the Help Menu, but may not use any other external reference or search engine. " 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Run this to get all necessary imports\n", 18 | "\n", 19 | "Not all the imports need to be used to solve the problems. I am just including everything that I think could be useful."
20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import numpy.random as rng\n", 32 | "import scipy.linalg as la\n", 33 | "import scipy.stats as stats\n", 34 | "import scipy.optimize as opt\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import seaborn as sns\n", 37 | "import pandas as pd\n", 38 | "from pandas import DataFrame, Series\n", 39 | "from sympy import symbols, integrate, exp, oo\n", 40 | "%matplotlib inline" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "#### Set random number seed so that answers are the same" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "rng.seed(123)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "**Question 1 (10 points)**\n", 66 | "\n", 67 | "Using the `iris` dataset, answer the following questions:\n", 68 | "\n", 69 | "- Find the mean, min and max values of all four measurements (sepal.length, sepal.width, petal.length, petal.width) for each species\n", 70 | "- Find the average petal.width for rows where the petal.length is less than the sepal.width" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "iris = sns.load_dataset('iris')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "\n", 93 | "\n", 94 | "\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "**Question 2 (10 points)**\n", 102 | "\n", 103 | "Write a function `peek(df, n)` to display a random selection of $n$ rows of any dataframe (without repetition). Use it to show 5 random rows from the iris data set. The function should take as inputs a dataframe and an integer." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "\n", 115 | "\n", 116 | "\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "**Question 3 (30 points)**\n", 124 | "\n", 125 | "You are given the following set of data\n", 126 | "\n", 127 | "```python\n", 128 | "x = np.arange(10)\n", 129 | "y = np.array([ 1.58873597, 7.55101533, 10.71372171, 7.90123225,\n", 130 | " -2.05877605, -12.40257359, -28.64568712, -46.39822281,\n", 131 | " -68.15488905, -97.16032044])\n", 132 | "```\n", 133 | "\n", 134 | "- Find the least squares solution by solving the normal equations $A^T A \hat{x} = A^T y$ - use `scipy.linalg.solve`. (5 points)\n", 135 | "\n", 136 | "- Write your own **gradient descent** optimization function to find the least squares solution for the coefficients $\beta$ of a quadratic polynomial. Do **not** use a gradient descent algorithm from a package such as `scipy.optimize` or `scikit-learn`. You can use a simple for loop - start with the parameters `beta = np.zeros(3)` with a learning rate $\alpha = 0.0001$ and run for 100000 iterations. (15 points)\n", 137 | "\n", 138 | "- Plot the data together with the fitted polynomial from the first and second solutions in separate subplots. 
(10 points)\n", 139 | "\n" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 3, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "x = np.arange(10)\n", 151 | "y = np.array([ 1.58873597, 7.55101533, 10.71372171, 7.90123225,\n", 152 | " -2.05877605, -12.40257359, -28.64568712, -46.39822281,\n", 153 | " -68.15488905, -97.16032044])" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "\n", 165 | "\n", 166 | "\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "**Question 4 (20 points)**\n", 174 | "\n", 175 | "Consider the following system of equations:\n", 176 | "\n", 177 | "$$\begin{align*}\n", 178 | "2x_1& - x_2& +x_3 &=& 6\\\n", 179 | "-x_1& +2x_2& - x_3 &=& 2\\\n", 180 | " x_1 & -x_2& + x_3 &=& 1\n", 181 | "\end{align*}$$\n", 182 | "\n", 183 | "1. Write the system in matrix form $Ax=b$ and define these in numpy or scipy.\n", 184 | "2. Show that $A$ is positive-definite\n", 185 | "3. Use the appropriate matrix decomposition function in numpy and back-substitution to solve the system. Remember to use the structure of the problem to determine the appropriate decomposition.\n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": true 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "\n", 197 | "\n", 198 | "\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "**Question 5 (10 points)**\n", 206 | "\n", 207 | "Let\n", 208 | "\n", 209 | "$A = \left(\begin{matrix}2 & -1 &1\\-1& 2& -1 \\1&-1& 1\n", 210 | "\end{matrix}\right) \;\;\;\;\;\;\textrm{ and }\;\;\;\;\;\; v = \left(\begin{matrix}1 \\ 1 \\2\end{matrix}\right)$\n", 211 | "\n", 212 | "Find $w$ such that $w$ is conjugate to $v$ under $A$. You may use *basic* linear algebra in scipy or numpy - i.e. matrix products." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "\n", 224 | "\n", 225 | "\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "**Question 6 (20 points)**\n", 233 | "\n", 234 | "- The Cauchy distribution is given by \n", 235 | "$$\n", 236 | "f(x) = \frac{1}{\pi (1 + x^2)}, \ \ -\infty \lt x \lt \infty \n", 237 | "$$\n", 238 | "\n", 239 | "Integrate the tail probability $P(X > 2)$ using Monte Carlo integration with 1 million samples from the uniform distribution using an appropriate change of variables (10 points)\n", 240 | "\n", 241 | "- Estimate the following integral using Monte Carlo integration and 1 million draws. Hint: See figure. 
(10 points)\n", 242 | "\n", 243 | "$$\n", 244 | "\int_{-\infty}^{\infty} x^2 \frac{1}{2}e^{-|x|} dx\n", 245 | "$$\n", 246 | "\n", 247 | "![Hint](./dexp.png)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "\n", 259 | "\n", 260 | "\n" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.5.1" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 0 285 | } 286 | -------------------------------------------------------------------------------- /exams/dexp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/exams/dexp.png -------------------------------------------------------------------------------- /exams/xy.csv: -------------------------------------------------------------------------------- 1 | x,y 2 | 0.0,0.9143693966994388 3 | 0.1,3.597345446583586 4 | 0.2,3.4829784980519922 5 | 0.30000000000000004,2.2937052860819085 6 | 0.4,3.821399748031464 7 | 0.5,66.51436537097152 8 | 0.6000000000000001,3.1733207566069264 9 | 0.7000000000000001,5.771087371143823 10 | 0.8,8.065936258705534 11 | 0.9,6.533259597734899 12 | 1.0,7.321113848377946 13 | -------------------------------------------------------------------------------- /homework/Homework01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Due: 4 PM on Wednesday, 27 Jan 2016**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Instructions\n", 15 | "-----\n", 16 | "\n", 17 | "Write code to solve all 10 problems. Each problem is worth 10 points. The grading rubric includes the following criteria:\n", 18 | "\n", 19 | "- Correctness\n", 20 | "- Readability\n", 21 | "- Efficiency\n", 22 | "\n", 23 | "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n", 24 | "\n", 25 | "**Honor Code: By submitting this assignment, you certify that this is your original work.**" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Overwriting data/animals.txt\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "%%file ../data/animals.txt\n", 45 | "name|species|age|weight\n", 46 | "arun|cat|5|7.3\n", 47 | "bob|bird|2|1.5\n", 48 | "coco|cat|2|5.5\n", 49 | "dumbo|elephant|23|454\n", 50 | "elmo|dog|5|11\n", 51 | "fido|dog|3|24.5\n", 52 | "gumba|bird|2|2.7" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "**Q1.** Using only the Unix shell commands, find only rows showing the 3rd, 4th and 5th heaviest animals in the file animals.txt."
60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "collapsed": true 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "\n", 71 | "\n", 72 | "\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "**Q2.** Using only the Unix shell commands, find all files in the current directory and all its subdirectories that contain the word elephant regardless of case." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "\n", 91 | "\n", 92 | "\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "**Q3.** Using only the Python standard library, find only rows showing the 3rd, 4th and 5th heaviest animals in the file animals.txt." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "\n", 111 | "\n", 112 | "\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "**Q4.** Using only the Python standard library, find all files in the current directory and all its sub-directories that contain the word `elephant` regardless of case." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 9, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "./Homework01.ipynb\n", 134 | "./.ipynb_checkpoints/Homework01-checkpoint.ipynb\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "# Not graded but here is a possible solution.\nimport os\n", 140 | "for dirpath, dirnames, filenames in os.walk('.'):\n", 141 | " for filename in filenames:\n", 142 | " path = os.path.join(dirpath, filename)\n", 143 | " with open(path) as f:\n", 144 | " text = f.read()\n", 145 | " if 'elephant' in text.lower():\n", 146 | " print(path)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "**Q5.** Starting with `range(1, 20)`, make a list of the squares of each odd number in the following ways\n", 154 | "\n", 155 | "- With a for loop\n", 156 | "- Using a list comprehension\n", 157 | "- Using map and filter\n", 158 | "\n", 159 | "The answer should be [1, 9, 25, 49, 81, 121, 169, 225, 289, 361]" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "\n", 171 | "\n", 172 | "\n", 173 | "\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "**Q6.** If we list all the natural numbers below 10 that are multiples of 3 or 5, we get 3, 5, 6 and 9. The sum of these multiples is 23. (Euler problem #1)\n", 181 | "\n", 182 | "Write a program to find the sum of all the multiples of 3 or 5 below 1000.\n", 183 | "\n", 184 | "The answer should be 233168." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "\n", 196 | "\n", 197 | "\n", 198 | "\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "**Q7.** A palindromic number reads the same both ways. 
The largest palindrome made from the product of two 2-digit numbers is 9009 = 91 × 99.\n", 206 | "\n", 207 | "Write a program to find the largest palindrome made from the product of two 3-digit numbers. (Euler problem #4)\n", 208 | "\n", 209 | "The answer should be 906609 = 913 × 993. (Thanks to Ilan Man for catching the error)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "\n", 221 | "\n", 222 | "\n", 223 | "\n" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "**Q8.** The sum of the squares of the first ten natural numbers is,\n", 231 | "$$\n", 232 | "1^2 + 2^2 + ... + 10^2 = 385\n", 233 | "$$\n", 234 | "The square of the sum of the first ten natural numbers is,\n", 235 | "$$\n", 236 | "(1 + 2 + ... + 10)^2 = 55^2 = 3025\n", 237 | "$$\n", 238 | "Hence the difference between the sum of the squares of the first ten natural numbers and the square of the sum is 3025 − 385 = 2640.\n", 239 | "\n", 240 | "Write a program to find the difference between the sum of the squares of the first one hundred natural numbers and the square of the sum. (Euler problem #6)\n", 241 | "\n", 242 | "The answer should be 25164150." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "\n", 254 | "\n", 255 | "\n", 256 | "\n" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "**Q9.** The four adjacent digits in the 1000-digit number below that have the greatest product are 9 × 9 × 8 × 9 = 5832.\n", 264 | "```\n", 265 | "73167176531330624919225119674426574742355349194934\n", 266 | "96983520312774506326239578318016984801869478851843\n", 267 | "85861560789112949495459501737958331952853208805511\n", 268 | "12540698747158523863050715693290963295227443043557\n", 269 | "66896648950445244523161731856403098711121722383113\n", 270 | "62229893423380308135336276614282806444486645238749\n", 271 | "30358907296290491560440772390713810515859307960866\n", 272 | "70172427121883998797908792274921901699720888093776\n", 273 | "65727333001053367881220235421809751254540594752243\n", 274 | "52584907711670556013604839586446706324415722155397\n", 275 | "53697817977846174064955149290862569321978468622482\n", 276 | "83972241375657056057490261407972968652414535100474\n", 277 | "82166370484403199890008895243450658541227588666881\n", 278 | "16427171479924442928230863465674813919123162824586\n", 279 | "17866458359124566529476545682848912883142607690042\n", 280 | "24219022671055626321111109370544217506941658960408\n", 281 | "07198403850962455444362981230987879927244284909188\n", 282 | "84580156166097919133875499200524063689912560717606\n", 283 | "05886116467109405077541002256983155200055935729725\n", 284 | "71636269561882670428252483600823257530420752963450\n", 285 | "```\n", 286 | "Write a program to find the thirteen adjacent digits in the 1000-digit number that have the greatest product. What is the value of this product? (Euler problem #8)\n", 287 | "\n", 288 | "The answer should be 23514624000."
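[Editorial aside] A minimal sketch of the sliding-window idea behind Q9, under stated assumptions: `digits` below holds only the first of the 20 rows above (the full 1000-digit string, newlines stripped, would be pasted in its place), and `reduce`/`mul` are used instead of `math.prod` for compatibility with the Python 3.5 kernel these notebooks declare. This is an illustration, not the graded solution.

```python
from functools import reduce
from operator import mul

# Placeholder: first 50 digits only -- paste the full 1000-digit number here.
digits = "73167176531330624919225119674426574742355349194934"

def greatest_adjacent_product(digits, k):
    """Largest product of k adjacent digits, found by sliding a width-k window."""
    return max(
        reduce(mul, (int(d) for d in digits[i:i + k]))
        for i in range(len(digits) - k + 1)
    )

# On the full 1000-digit number, k=4 should recover the worked example (5832)
# and k=13 the quoted answer, 23514624000.
print(greatest_adjacent_product(digits, 4))
```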
289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "\n", 300 | "\n", 301 | "\n", 302 | "\n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "**Q10.** A Pythagorean triplet is a set of three natural numbers, a < b < c, for which,\n", 310 | "\n", 311 | "$$\n", 312 | "a^2 + b^2 = c^2\n", 313 | "$$\n", 314 | "For example, $3^2 + 4^2 = 9 + 16 = 25 = 5^2$\n", 315 | "\n", 316 | "There exists exactly one Pythagorean triplet for which a + b + c = 1000.\n", 317 | "Write a program to find the product abc. (Euler problem #9)\n", 318 | "\n", 319 | "The answer should be (200, 375, 425, 31875000)." 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": { 326 | "collapsed": true 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "\n", 331 | "\n", 332 | "\n", 333 | "\n" 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.5.1" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 0 358 | } 359 | -------------------------------------------------------------------------------- /homework/Homework03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Due: 4 PM on Wednesday, 10 Feb 2016**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Instructions\n", 15 | "-----\n", 16 | "\n", 17 | "Write code to solve all problems. The grading rubric includes the following criteria:\n", 18 | "\n", 19 | "- Correctness\n", 20 | "- Readability\n", 21 | "- Efficiency\n", 22 | "\n", 23 | "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n", 24 | "\n", 25 | "**Honor Code: By submitting this assignment, you certify that this is your original work.**\n", 26 | "\n", 27 | "**Note**: These exercises will involve quite a bit more code writing than the first 2 homework assignments so start early. They are also intentionally less specific so that you have to come up with your own plan to complete the exercises." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "We will use the following data sets:\n", 35 | "```python\n", 36 | "titanic = sns.load_dataset(\"titanic\")\n", 37 | "iris = sns.load_dataset(\"iris\")\n", 38 | "```" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "**Q1 (20 pts)** Working with `numpy.random`.\n", 46 | "\n", 47 | "**Part 1 (10 pts)** Consider a sequence of $n$ Bernoulli trials with success probability $p$ per trial. A string of consecutive successes is known as a success *run*. 
Write a function that returns the counts for runs of length $k$ for each $k$ observed in a dictionary.\n", 48 | "\n", 49 | "For example: if the trials were [0, 1, 0, 1, 1, 0, 0, 0, 0, 1], the function should return \n", 50 | "```\n", 51 | "{1: 2, 2: 1}\n", 52 | "```" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "\n", 64 | "\n", 65 | "\n", 66 | "\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "**Part 2 (10 pts)** Continuing from Part 1, what is the probability of observing at least one run of length 5 or more when $n=100$ and $p=0.5$? Estimate this from 100,000 simulated experiments. Is this more, less or equally likely than finding runs of length 7 or more when $p=0.7$?" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "**Q2. (30 pts)** \n", 95 | "\n", 96 | "Using `RandomForestClassifier` from `sklearn`, find the 5 most important predictors of survival on the Titanic. Compare the accuracy of prediction using only these 5 predictors and using all non-redundant predictors. Some initial pre-processing code is provided. Hint: check out the `pandas.get_dummies()` function." 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "titanic = sns.load_dataset(\"titanic\")\n", 108 | "titanic.head()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "titanic.drop(['alive', 'embarked', 'class', 'who', 'adult_male'], axis=1, inplace=True)\n", 120 | "titanic.dropna(axis=0, inplace=True)\n", 121 | "titanic.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "\n", 137 | "\n", 138 | "\n", 139 | "\n", 140 | "\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "**Q3. (25 pts)**\n", 148 | "\n", 149 | "Using `sklearn`, perform unsupervised learning of the iris data using 2 different clustering methods. Do NOT assume you know the number of clusters - rather the code should either determine it from the data or compare models with different numbers of components using some appropriate test statistic. Make a pairwise scatter plot of the four predictor variables indicating cluster by color for each unsupervised learning method used." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "\n", 161 | "\n", 162 | "\n", 163 | "\n", 164 | "\n", 165 | "\n", 166 | "\n", 167 | "\n", 168 | "\n", 169 | "\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "**Q4. 
(50 pts)**\n", 177 | "\n", 178 | "Write code to generate a plot similar to the following ![figure](http://mathworld.wolfram.com/images/eps-gif/ElementaryCA_850.gif) using the explanation for generation of 1D Cellular Automata found [here](http://mathworld.wolfram.com/ElementaryCellularAutomaton.html). You should only need to use standard Python, `numpy` and `matplotlib`." 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "To make it simpler, I have provided the code for plotting below. All you need to do is to supply the `make_ca` function (which may of course use as many other custom functions as you deem necessary). As you can see from the code below, the `make_ca` function takes 3 arguments\n", 186 | "```\n", 187 | "rule - an integer e.g. 30\n", 188 | "init - an initial state i.e. the first row of the image\n", 189 | "niter - the number of iterations i.e. the number of rows in the image\n", 190 | "```" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "\n", 202 | "\n", 203 | "\n", 204 | "\n", 205 | "\n", 206 | "\n", 207 | "\n", 208 | "\n", 209 | "\n", 210 | "\n" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "from matplotlib.ticker import NullFormatter, IndexLocator\n", 222 | "\n", 223 | "def plot_grid(rule, grid, ax=None):\n", 224 | " if ax is None:\n", 225 | " ax = plt.subplot(111)\n", 226 | " ax.grid(True, which='major', color='grey', linewidth=0.5)\n", 227 | " ax.imshow(grid, interpolation='none', cmap='Greys', aspect=1, alpha=0.8)\n", 228 | " ax.xaxis.set_major_locator(IndexLocator(1, 0))\n", 229 | " ax.yaxis.set_major_locator(IndexLocator(1, 0))\n", 230 | " ax.xaxis.set_major_formatter( NullFormatter() )\n", 231 | " ax.yaxis.set_major_formatter( NullFormatter() )\n", 232 | " ax.set_title('Rule %d' % rule)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "niter = 15\n", 244 | "width = niter*2+1\n", 245 | "init = np.zeros(width, 'int')\n", 246 | "init[width//2] = 1\n", 247 | "rules = np.array([30, 54, 60, 62, 90, 94, 102, 110, 122, 126, \n", 248 | " 150, 158, 182, 188, 190, 220, 222, 250]).reshape((-1, 3))\n", 249 | "\n", 250 | "nrows, ncols = rules.shape\n", 251 | "fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*3, nrows*2))\n", 252 | "for i in range(nrows):\n", 253 | " for j in range(ncols):\n", 254 | " grid = make_ca(rules[i, j], init, niter)\n", 255 | " plot_grid(rules[i, j], grid, ax=axes[i,j])\n", 256 | "plt.tight_layout()" 257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.5.1" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 0 281 | } 282 | -------------------------------------------------------------------------------- /homework/Homework04.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Due: 4 PM on Wednesday, 17 Feb 2016**\n", 8 | "\n", 9 | "The usual warnings apply - the homework is not officially released until 11 Feb 2016, and we may make changes till then." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Instructions\n", 17 | "-----\n", 18 | "\n", 19 | "Write code to solve all problems. The grading rubric includes the following criteria:\n", 20 | "\n", 21 | "- Correctness\n", 22 | "- Readability\n", 23 | "- Efficiency\n", 24 | "\n", 25 | "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n", 26 | "\n", 27 | "**Honor Code: By submitting this assignment, you certify that this is your original work.**" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "**Question 1 (20 points)**. \n", 35 | "\n", 36 | "Euclid's algorithm for finding the greatest common divisor of two numbers is\n", 37 | "\n", 38 | "```python\n", 39 | "gcd(a, 0) = a\n", 40 | "gcd(a, b) = gcd(b, a modulo b)\n", 41 | "```\n", 42 | "\n", 43 | "- Write a function to find the greatest common divisor in Python (8 points)\n", 44 | "- What is the greatest common divisor of 17384 and 1928? (2 points)\n", 45 | "- Write a function to calculate the least common multiple (8 points)\n", 46 | "- What is the least common multiple of 17384 and 1928? (2 points)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "\n", 58 | "\n", 59 | "\n", 60 | "\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "**Question 2 (20 points)**. \n", 68 | "\n", 69 | "Consider the linear transformation $f(x)$ on $\mathbb{R}^3$ that takes the standard basis $\left\{e_1,e_2,e_3\right\}$ to $\left\{v_1,v_2,v_3\right\}$ where\n", 70 | "\n", 71 | "$$v_1=\left(\begin{matrix}10\\-10\\16\end{matrix}\right), v_2=\left(\begin{matrix}2\\-5\\20\end{matrix}\right) \textrm {and } v_3=\left(\begin{matrix}1\\-4\\13\end{matrix}\right)$$\n", 72 | "\n", 73 | "1. Write a matrix $A$ that represents the same linear transformation. (4 points)\n", 74 | "\n", 75 | "2. Compute the rank of $A$ using two different methods (do not use `matrix_rank`!). (4 points)\n", 76 | "\n", 77 | "3. Find the eigenvalues and eigenvectors of $A$. (4 points)\n", 78 | "\n", 79 | "4. What is the matrix representation of $f$ with respect to the eigenbasis? (8 points)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "\n", 91 | "\n", 92 | "\n", 93 | "\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "**Exercise 3 (20 pts)**. Avoiding catastrophic cancellation.\n", 101 | "\n", 102 | "Read the Wikipedia entry on [loss of significance](https://en.wikipedia.org/wiki/Loss_of_significance). 
Then answer the following problem:\n", 103 | "\n", 104 | "The tail of the standard logistic distribution is given by $1 - F(t) = 1 - (1+e^{-t})^{-1}$.\n", 105 | "\n", 106 | "- Define a function `f1` to calculate the tail probability of the logistic distribution using the formula given above\n", 107 | "- Use [`sympy`](http://docs.sympy.org/latest/index.html) to find the exact value of the tail distribution (using the same symbolic formula) to 20 decimal digits\n", 108 | "- Calculate the *relative error* of `f1` when $t = 25$ (The relative error is given by `abs(exact - approximate)/exact`)\n", 109 | "- Rewrite the expression for the tail of the logistic distribution using simple algebra so that there is no risk of cancellation, and write a function `f2` using this formula. Calculate the *relative error* of `f2` when $t = 25$. \n", 110 | "- How much more accurate is `f2` compared with `f1` in terms of the relative error?" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "\n", 122 | "\n", 123 | "\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "**Exercise 4 (40 pts)**. One of the goals of the course is that you will be able to implement novel algorithms from the literature. \n", 131 | "\n", 132 | "- Implement the mean-shift algorithm in 1D as described [here](http://homepages.inf.ed.ac.uk/rbf/CVonline/LOCAL_COPIES/TUZEL1/MeanShift.pdf). \n", 133 | " - Use the following function signature\n", 134 | " ```python\n", 135 | " def mean_shift(xs, x, kernel, max_iters=100, tol=1e-6):\n", 136 | " ```\n", 137 | " - xs is the data set, x is the starting location, and kernel is a kernel function\n", 138 | " - tol is the difference in $||x||$ across iterations\n", 139 | "- Use the following kernels with bandwidth $h$ (a default value of 1.0 will work fine)\n", 140 | " - Flat - return 1 if $||x|| < h$ and 0 otherwise\n", 141 | " - Gaussian \n", 142 | " $$\frac{1}{\sqrt{2 \pi h}}e^{\frac{-||x||^2}{h^2}}$$\n", 143 | " - Note that $||x||$ is the norm of the data point being evaluated minus the current value of $x$\n", 144 | "- Use both kernels to find all 3 modes of the data set in `x1d.npy`\n", 145 | "- Modify the algorithm and/or kernels so that it now works in an arbitrary number of dimensions.\n", 146 | "- Use both kernels to find all 3 modes of the data set in `x2d.npy`\n", 147 | "- Plot the path of successive intermediate solutions of the mean-shift algorithm starting from `x0 = (-4, 5)` until it converges onto a mode in the 2D data for each kernel. Superimpose the path on top of a contour plot of the data density. Repeat for `x0 = (0, 0)` and `x0 = (10, 10)`."
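[Editorial aside] For orientation, here is one way the 1D Gaussian-kernel case of Exercise 4 could be wired together - a minimal sketch following the exercise's own signature and kernel formula, not a complete or definitive solution (the flat kernel and the multivariate generalization are left as stated above):

```python
import numpy as np

def gaussian_kernel(d, h=1.0):
    # d is (data point - current x); this matches the Gaussian formula given above
    return np.exp(-d**2 / h**2) / np.sqrt(2 * np.pi * h)

def mean_shift(xs, x, kernel, max_iters=100, tol=1e-6):
    """Move x to the kernel-weighted mean of the data until the shift is < tol."""
    for _ in range(max_iters):
        w = kernel(xs - x)
        x_new = np.sum(w * xs) / np.sum(w)
        if np.abs(x_new - x) < tol:
            return x_new
        x = x_new
    return x
```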
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.5.1" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 0 181 | } 182 | -------------------------------------------------------------------------------- /homework/Homework05.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Due: 4 PM on Wednesday, 24 Feb 2016**\n", 8 | "\n", 9 | "The usual warnings apply - the homework is not officially released until 18 Feb 2016, and we may make changes till then." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Instructions\n", 17 | "-----\n", 18 | "\n", 19 | "Write code to solve all problems. The grading rubric includes the following criteria:\n", 20 | "\n", 21 | "- Correctness\n", 22 | "- Readability\n", 23 | "- Efficiency\n", 24 | "\n", 25 | "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n", 26 | "\n", 27 | "**Honor Code: By submitting this assignment, you certify that this is your original work.**" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "**Question 1 (25 points).** Consider the following function on $\mathbb{R}^2$:\n", 35 | "\n", 36 | "$$f(x_1,x_2) = -x_1x_2e^{-\frac{(x_1^2+x_2^2)}{2}}$$\n", 37 | "\n", 38 | "1. Use `sympy` to compute its gradient.\n", 39 | "2. Compute the Hessian matrix. \n", 40 | "3. Find the critical points of $f$.\n", 41 | "4. Characterize the critical points as max/min or neither. \n", 42 | "5. Find the minimum under the constraint \n", 43 | "$$g(x) = x_1^2+x_2^2 \leq 10$$\n", 44 | "and \n", 45 | "$$h(x) = 2x_1 + 3x_2 = 5$$ using `scipy.optimize.minimize`.\n", 46 | "6. Plot the function using `matplotlib`.\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "\n", 58 | "\n", 59 | "\n", 60 | "\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "**Question 2 (15 points).**\n", 68 | "\n", 69 | "A milkmaid is at point A and needs to get to point B. However, she also needs to fill a pail of water from the river en route from A to B. The equation of the river's path is shown in the figure below. What is the minimum distance she has to travel to do this?\n", 70 | "\n", 71 | "1. Solve using `scipy.optimize` and constrained minimization.\n", 72 | "2. Create a plot of the solution using matplotlib (similar to provided figure but with optimal path added).\n", 73 | "\n", 74 | "Note: Beware of local optima. 
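[Editorial aside] The constrained-minimization pattern this question calls for, as a hedged sketch: points `A`, `B` and the river curve below are made-up stand-ins (the real ones are defined in the figure), so only the structure - decision variable, objective, equality constraint, multiple starts - carries over:

```python
import numpy as np
from scipy.optimize import minimize

A = np.array([0.0, 2.0])          # hypothetical start point
B = np.array([4.0, 1.0])          # hypothetical end point
river = lambda x: 1.0 + 0.5 * x   # hypothetical river path y = f(x)

def total_distance(P):
    # Walk A -> P (a point on the river) -> B
    return np.linalg.norm(A - P) + np.linalg.norm(P - B)

on_river = {'type': 'eq', 'fun': lambda P: P[1] - river(P[0])}

# Trying several starting points guards against the local optima warned about above.
results = [minimize(total_distance, x0, constraints=[on_river])
           for x0 in ([0.0, 1.0], [2.0, 2.0], [4.0, 3.0])]
best = min(results, key=lambda res: res.fun)
print(best.x, best.fun)
```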
\n", 75 | "\n", 76 | "![Milkmaid problem](milkmaid.png)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "\n", 88 | "\n", 89 | "\n", 90 | "\n", 91 | "\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Background to Q3 - Q5\n", 99 | "----\n", 100 | "\n", 101 | "Latent Semantic Analysis (LSA) is a method for reducing the dimnesionality of documents treated as a bag of words. It is used for document classification, clustering and retrieval. For example, LSA can be used to search for prior art given a new patent application. In this homework, we will implement a small library for simple latent semantic analysis as a practical example of the application of SVD. The ideas are very similar to PCA.\n", 102 | "\n", 103 | "We will implement a toy example of LSA to get familiar with the ideas. If you want to use LSA or similar methods for statiscal language analyis, the most efficient Python library is probably [gensim](https://radimrehurek.com/gensim/) - this also provides an online algorithm - i.e. the training information can be continuously updated. Other useful functions for processing natural language can be found in the [Natural Lnaguage Toolkit](http://www.nltk.org/).\n", 104 | "\n", 105 | "**Note**: The SVD from scipy.linalg performs a full decomposition, which is inefficient since we only need to decompose until we get the first k singluar values. If the SVD from `scipy.linalg` is too slow, please use the `sparsesvd` function from the [sparsesvd](https://pypi.python.org/pypi/sparsesvd/) package to perform SVD instead. You can install in the usual way with \n", 106 | "```\n", 107 | "!pip install sparsesvd\n", 108 | "```\n", 109 | "\n", 110 | "Then import the following\n", 111 | "```python\n", 112 | "from sparsesvd import sparsesvd \n", 113 | "from scipy.sparse import csc_matrix \n", 114 | "```\n", 115 | "\n", 116 | "and use as follows\n", 117 | "```python\n", 118 | "sparsesvd(csc_matrix(M), k=10)\n", 119 | "```\n", 120 | "\n" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "**Question 3 (20 points)**: Write 3 functions to calculate the term frequency (tf), the inverse document frequency (idf) and the product (tf-idf). Each function should take a single argument `docs`, which is a dictionary of (key=identifier, value=dcoument text) pairs, and return an appropriately sized array. Convert '-' to ' ' (space), remove punctuation, convert text to lowercase and split on whitespace to generate a collection of terms from the dcoument text.\n", 128 | "\n", 129 | "- tf = the number of occurrences of term $i$ in document $j$\n", 130 | "- idf = $\\log \\frac{n}{1 + \\text{df}_i}$ where $n$ is the total number of documents and $\\text{df}_i$ is the number of documents in which term $i$ occurs.\n", 131 | "\n", 132 | "Print the table of tf-idf values for the following document collection\n", 133 | "\n", 134 | "```\n", 135 | "s1 = \"The quick brown fox\"\n", 136 | "s2 = \"Brown fox jumps over the jumps jumps jumps\"\n", 137 | "s3 = \"The the the lazy dog elephant.\"\n", 138 | "s4 = \"The the the the the dog peacock lion tiger elephant\"\n", 139 | "\n", 140 | "docs = {'s1': s1, 's2': s2, 's3': s3, 's4': s4}\n", 141 | "```\n", 142 | "\n", 143 | "Note: You can use either a numpy array or pandas dataframe to store the matrix. 
However, we suggest using a Pandas dataframe since that will allow you to keep track of the row (term) and column (document) names in a single object. Of course, you could also maintain a numpy matrix, a list of terms, and a list of documents separately if you prefer.\n", 144 | "\n", 145 | "\n", 146 | "\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "\n", 158 | "\n", 159 | "\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "**Question 4 (20 points)**\n", 167 | "\n", 168 | "1. Write a function that takes a matrix $M$ and an integer $k$ as arguments, and reconstructs a reduced matrix using only the $k$ largest singular values. Use the `scipy.linalg.svd` function to perform the decomposition. This is the least squares approximation to the matrix $M$ in $k$ dimensions.\n", 169 | "\n", 170 | "2. Apply the function you just wrote to the following term-frequency matrix for a set of $9$ documents using $k=2$ and print the reconstructed matrix $M'$.\n", 171 | "```\n", 172 | "M = np.array([[1, 0, 0, 1, 0, 0, 0, 0, 0],\n", 173 | " [1, 0, 1, 0, 0, 0, 0, 0, 0],\n", 174 | " [1, 1, 0, 0, 0, 0, 0, 0, 0],\n", 175 | " [0, 1, 1, 0, 1, 0, 0, 0, 0],\n", 176 | " [0, 1, 1, 2, 0, 0, 0, 0, 0],\n", 177 | " [0, 1, 0, 0, 1, 0, 0, 0, 0],\n", 178 | " [0, 1, 0, 0, 1, 0, 0, 0, 0],\n", 179 | " [0, 0, 1, 1, 0, 0, 0, 0, 0],\n", 180 | " [0, 1, 0, 0, 0, 0, 0, 0, 1],\n", 181 | " [0, 0, 0, 0, 0, 1, 1, 1, 0],\n", 182 | " [0, 0, 0, 0, 0, 0, 1, 1, 1],\n", 183 | " [0, 0, 0, 0, 0, 0, 0, 1, 1]])\n", 184 | "```\n", 185 | "\n", 186 | "3. Calculate the pairwise correlation matrix for the original matrix M and the reconstructed matrix using $k=2$ singular values (you may use [scipy.stats.spearmanr](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html) to do the calculations). Consider the first 5 sets of documents as one group $G1$ and the last 4 as another group $G2$ (i.e. first 5 and last 4 columns). What is the average within-group correlation for $G1$, $G2$ and the average cross-group correlation for G1-G2 using either $M$ or $M'$? (Do not include self-correlation in the within-group calculations.)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "\n", 198 | "\n", 199 | "\n", 200 | "\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "**Question 5 (20 points)**. Clustering with LSA\n", 208 | "\n", 209 | "1. Begin by loading a pubmed database of selected article titles using:\n", 210 | "```python\n", 211 | "import pickle\n", 212 | "docs = pickle.load(open('pubmed.pic', 'rb'))\n", 213 | "```\n", 214 | " Create a tf-idf matrix for every term that appears at least once in any of the documents. What is the shape of the tf-idf matrix? \n", 215 | "\n", 216 | "2. Perform SVD on the tf-idf matrix to obtain $U \Sigma V^T$ (often written as $T \Sigma D^T$ in this context with $T$ representing the terms and $D$ representing the documents). If we set all but the top $k$ singular values to 0, the reconstructed matrix is essentially $U_k \Sigma_k V_k^T$, where $U_k$ is $m \times k$, $\Sigma_k$ is $k \times k$ and $V_k^T$ is $k \times n$. Terms in this reduced space are represented by $U_k \Sigma_k$ and documents by $\Sigma_k V^T_k$. 
Reconstruct the matrix using the first $k=10$ singular values.\n", 217 | "\n", 218 | "3. Use agglomerative hierarchical clustering with complete linkage to plot a dendrogram and comment on the likely number of document clusters with $k = 100$. Use the dendrogram function from [SciPy](https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.cluster.hierarchy.dendrogram.html).\n", 219 | "\n", 220 | "4. Determine how similar each of the original documents is to the new document `mystery.txt`. Since $A = U \Sigma V^T$, we also have $V = A^T U \Sigma^{-1}$ using orthogonality and the rule for transposing matrix products. This suggests that in order to map the new document to the same concept space, first find the tf-idf vector $v$ for the new document - this must contain all (and only) the terms present in the existing tf-idf matrix. Then the query vector $q$ is given by $v^T U_k \Sigma_k^{-1}$. Find the 10 documents most similar to the new document and the 10 most dissimilar. \n", 221 | "\n", 222 | "5. Many documents often have some boilerplate material such as organization information, Copyright, etc. at the front or back of the document. Does it matter that the front and back matter of each document is essentially identical for either LSA-based clustering (part 3) or information retrieval (part 4)? Why or why not?" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 1, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "<_io.TextIOWrapper name='mystery.txt' mode='r' encoding='UTF-8'>" 236 | ] 237 | }, 238 | "execution_count": 1, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "open('mystery.txt')\n", 245 | "\n", 246 | "\n", 247 | "\n" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.5.1" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 0 281 | } 282 | -------------------------------------------------------------------------------- /homework/Homework07.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Due: 4 PM on Wednesday, 24 Mar 2016**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Instructions\n", 15 | "-----\n", 16 | "\n", 17 | "Write code to solve all problems. The grading rubric includes the following criteria:\n", 18 | "\n", 19 | "- Correctness\n", 20 | "- Readability\n", 21 | "- Efficiency\n", 22 | "\n", 23 | "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc. is OK. Some discussion with friends is fine too - but again, do not just copy their answer. 
\n", 24 | "\n", 25 | "**Honor Code: By submitting this assignment, you certify that this is your original work.**" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "**Exercise 1 (50 points)**\n", 33 | "\n", 34 | "#### Gibbs sampler example from [Robert and Casella, 10.17](http://www.springer.com/statistics/statistical+theory+and+methods/book/978-0-387-21239-5)\n", 35 | "\n", 36 | "Suppose we have data of the number of failures ($y_i$) for each of 10 pumps in a nuclear plant. We also have the times ($t_i$) at which each pump was observed. We want to model the number of failures with a Poisson likelihood, where the expected number of failure $\\lambda_i$ differs for each pump. Since the time which we observed each pump is different, we need to scale each $\\lambda_i$ by its observed time $t_i$. To be explicit, we assume that $y_i$ has a Poisson distribution with rate $\\mu_i = \\lambda_i t_i$.\n", 37 | "\n", 38 | "The likelihood $f$ is \n", 39 | "$$\n", 40 | "\\prod_{i=1}^{10} \\text{Poisson}(\\mu_i)\n", 41 | "$$\n", 42 | "\n", 43 | "We let the prior $g$ for $\\lambda$ be \n", 44 | "\n", 45 | "$$\n", 46 | "\\lambda \\sim \\text{Gamma}(\\alpha_\\mu, \\beta_\\mu)\n", 47 | "$$\n", 48 | "\n", 49 | "and let the hyperprior $h$ for $\\alpha$ to be \n", 50 | "\n", 51 | "$$\n", 52 | "\\alpha \\sim \\text{Gamma}(\\alpha_\\alpha, \\beta_\\alpha)\n", 53 | "$$\n", 54 | "\n", 55 | "with $\\alpha_\\alpha = 1.8$ and $\\beta_\\alpha = 1.0$.\n", 56 | "\n", 57 | "and let the hyperprior $h$ for $\\beta$ to be \n", 58 | "\n", 59 | "$$\n", 60 | "\\beta \\sim \\text{Gamma}(\\alpha_\\beta, \\beta_\\beta)\n", 61 | "$$\n", 62 | "\n", 63 | "with $\\alpha_\\beta = 10.0$ and $\\beta_\\beta = 1.0$.\n", 64 | "\n", 65 | "There are 12 unknown parameters (10 $\\lambda$s, $\\alpha$ and $\\beta$) in this hierarchical model. Do th following using `pymc3` and some plotting package.\n", 66 | "\n", 67 | "- Wrtie the model and run for 10,000 iterations using the No U-Turn Sampler (30 points)\n", 68 | "- plot the traces and distributions of the last 10% for $\\lambda_i$, $\\alpha$ and $\\beta$ (there should be 12 sets of plots) (10 points)\n", 69 | "- Gnnerate 1,000 samples of the number of failures $y_i$ from the prior distribution and plot the histogram or density. That is, for each of the 10 pumps, we want to see the distribtuion of 1,000 draws of the number of failures (5 points).\n", 70 | "- Generate 1,000 posterior predictive samples of the number of failures $y_i$ and plot the histogram or density. 
This is similar to the previous question but using draws from the posterior (5 points)\n", 71 | "\n", 72 | "Use the following data\n", 73 | "```python\n", 74 | "y = np.array([5, 1, 5, 14, 3, 19, 1, 1, 4, 22])\n", 75 | "t = np.array([94.32, 15.72, 62.88, 125.76, 5.24, 31.44, 1.05, 1.05, 2.10, 10.48])\n", 76 | "```" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import numpy as np\n", "import pymc3 as pm" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 23, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "y = np.array([5, 1, 5, 14, 3, 19, 1, 1, 4, 22])\n", 99 | "t = np.array([94.32, 15.72, 62.88, 125.76, 5.24, 31.44, 1.05, 1.05, 2.10, 10.48])" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "\n", 111 | "\n", 112 | "\n", 113 | "\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "**Exercise 2 (50 points)**\n", 121 | "\n", 122 | "Repeat Exercise 1 using `pystan`." 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "\n", 134 | "\n", 135 | "\n", 136 | "\n" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.5.1" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 0 161 | } 162 | -------------------------------------------------------------------------------- /homework/Homework08.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Due: 4 PM on Wednesday, 30 Mar 2016**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Instructions\n", 15 | "-----\n", 16 | "\n", 17 | "Write code to solve all problems. The grading rubric includes the following criteria:\n", 18 | "\n", 19 | "- Correctness\n", 20 | "- Readability\n", 21 | "- Efficiency\n", 22 | "\n", 23 | "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc. is OK. Some discussion with friends is fine too - but again, do not just copy their answer. 
\n", 24 | "\n", 25 | "**Honor Code: By submitting this assignment, you certify that this is your original work.**" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "**Exercise 1 (25 points)**\n", 33 | "\n", 34 | "- Write a function in C that calculates the mean of an array of doubles, putting the function declaration and function definition in separate files (10 points)\n", 35 | "- Write a driver program to call the function with the inputs being an array containing the numbers 1,2,3,4,5 and print the results to standard output (5 pints)\n", 36 | "- Write a `makefile` that compiles the executable upon calling `make` at the command line and removes all generated files upon calling `make clean` (10 points)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "\n", 48 | "\n", 49 | "\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "**Exercise 2 (25 points)**\n", 57 | "\n", 58 | "- Write a function `matrix_multiply` in C with the following function signature \n", 59 | "```\n", 60 | "void matrix_multiply(double** A, double **B, double **C, int m, int n, int p)\n", 61 | "```\n", 62 | "The function multiples an $m \\times n$ matrix $A$ with an $n \\times p$ matrix $B$ and gives the result in the matrix $C$ (10 points)\n", 63 | "- Write a function to pretty print a matrix to standard output, and use it to display $A$ and $B$. The output should look something like this (5 point):\n", 64 | "```\n", 65 | "[[3.0, 0.1, 5.0, 18.1],\n", 66 | " [7.8, 7.9, 3.2, 1.0],\n", 67 | " [6.1, 5.5, 8.9, 4.1]]\n", 68 | "```\n", 69 | "\n", 70 | "- Write a driver program to test it with the following matrices. Matrices should be generated using dynamic memory allocation, freeing up the memory when done (10 points)\n", 71 | "\n", 72 | "$$\n", 73 | "A = \\pmatrix{1 & 2 & 3\\\\4 & 5 & 6}, B = \\pmatrix{1 & 2 & 3 & 4\\\\5 & 6 & 7 & 8\\\\9 & 0 & 1 & 2}\n", 74 | "$$" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "\n", 86 | "\n", 87 | "\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "**Exercise 3 (25 points)**\n", 95 | "\n", 96 | "- Implement the secant method in 1D for root finding in C++. Pass in the function as a generalized function pointer. 
Use the method to find all roots of the polynomial equation $f(x) = x^3 - 7x - 6$ (20 points)\n", 97 | "- Write the roots to a text file that can be read in Python and plot the roots and polynomial using Python (5 points)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "\n", 109 | "\n", 110 | "\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "**Exercise 4 (25 points)**\n", 118 | "\n", 119 | "You are given the following set of data\n", 120 | "\n", 121 | "$$\n", 122 | "x = \\pmatrix{0 \\\\ 1 \\\\ 2 \\\\ 3 \\\\ 4 \\\\ 5 \\\\ 6 \\\\ 7 \\\\ 8 \\\\ 9}, \n", 123 | "y = \\pmatrix{1.5 \\\\ 7.5 \\\\ 10.7 \\\\ 7.9 \\\\ -2.0 \\\\ -12.4 \\\\ -28.6 \\\\ -46.3 \\\\ -68.1 \\\\ -97.1}\n", 124 | "$$\n", 125 | "\n", 126 | "- Write your own **gradient descent** optimization function in C++ to find the least squares solution for the coefficients $\\beta$ of a quadratic polynomial. You may use the `armadillo` library (20 points)\n", 127 | "- Write the solution to a text file that can be read in Python and plot the least squares quadratic fit together with the data points using Python (5 points)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "\n", 139 | "\n", 140 | "\n" 141 | ] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 3", 147 | "language": "python", 148 | "name": "python3" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 3 154 | }, 155 | "file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython3", 160 | "version": "3.5.1" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 0 165 | } 166 | -------------------------------------------------------------------------------- /homework/milkmaid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/homework/milkmaid.png -------------------------------------------------------------------------------- /homework/mystery.txt: -------------------------------------------------------------------------------- 1 | Intensive blood-glucose control with sulphonylureas or insulin compared with 2 | conventional treatment and risk of complications in patients with type 2 diabetes 3 | BACKGROUND: Improved blood-glucose control decreases the progression of diabetic 4 | microvascular disease, but the effect on macrovascular complications is unknown. 5 | There is concern that sulphonylureas may increase cardiovascular mortality in 6 | patients with type 2 diabetes and that high insulin concentrations may enhance 7 | atheroma formation. We compared the effects of intensive blood-glucose control 8 | with either sulphonylurea or insulin and conventional treatment on the risk of 9 | microvascular and macrovascular complications in patients with type 2 diabetes in 10 | a randomised controlled trial. 
METHODS: 3867 newly diagnosed patients with type 2 11 | diabetes, median age 54 years (IQR 48-60 years), who after 3 months' diet 12 | treatment had a mean of two fasting plasma glucose (FPG) concentrations of 13 | 6.1-15.0 mmol/L were randomly assigned intensive policy with a sulphonylurea 14 | (chlorpropamide, glibenclamide, or glipizide) or with insulin, or conventional 15 | policy with diet. The aim in the intensive group was FPG less than 6 mmol/L. In 16 | the conventional group, the aim was the best achievable FPG with diet alone; 17 | drugs were added only if there were hyperglycaemic symptoms or FPG greater than 18 | 15 mmol/L. Three aggregate endpoints were used to assess differences between 19 | conventional and intensive treatment: any diabetes-related endpoint (sudden 20 | death, death from hyperglycaemia or hypoglycaemia, fatal or non-fatal myocardial 21 | infarction, angina, heart failure, stroke, renal failure, amputation [of at least 22 | one digit], vitreous haemorrhage, retinopathy requiring photocoagulation, 23 | blindness in one eye, or cataract extraction); diabetes-related death (death from 24 | myocardial infarction, stroke, peripheral vascular disease, renal disease, 25 | hyperglycaemia or hypoglycaemia, and sudden death); all-cause mortality. Single 26 | clinical endpoints and surrogate subclinical endpoints were also assessed. All 27 | analyses were by intention to treat and frequency of hypoglycaemia was also 28 | analysed by actual therapy. FINDINGS: Over 10 years, haemoglobin A1c (HbA1c) was 29 | 7.0% (6.2-8.2) in the intensive group compared with 7.9% (6.9-8.8) in the 30 | conventional group--an 11% reduction. There was no difference in HbA1c among 31 | agents in the intensive group. Compared with the conventional group, the risk in 32 | the intensive group was 12% lower (95% CI 1-21, p=0.029) for any diabetes-related 33 | endpoint; 10% lower (-11 to 27, p=0.34) for any diabetes-related death; and 6% 34 | lower (-10 to 20, p=0.44) for all-cause mortality. Most of the risk reduction in 35 | the any diabetes-related aggregate endpoint was due to a 25% risk reduction 36 | (7-40, p=0.0099) in microvascular endpoints, including the need for retinal 37 | photocoagulation. There was no difference for any of the three aggregate 38 | endpoints between the three intensive agents (chlorpropamide, glibenclamide, or 39 | insulin). Patients in the intensive group had more hypoglycaemic episodes than 40 | those in the conventional group on both types of analysis (both p<0.0001). The 41 | rates of major hypoglycaemic episodes per year were 0.7% with conventional 42 | treatment, 1.0% with chlorpropamide, 1.4% with glibenclamide, and 1.8% with 43 | insulin. Weight gain was significantly higher in the intensive group (mean 2.9 44 | kg) than in the conventional group (p<0.001), and patients assigned insulin had a 45 | greater gain in weight (4.0 kg) than those assigned chlorpropamide (2.6 kg) or 46 | glibenclamide (1.7 kg). 
INTERPRETATION: Intensive blood-glucose control by either 47 | sulphonylureas or insulin substantially decreases the risk of microvascular 48 | complications, but not macrovascular disease, in patients with type 2 49 | diabetes.(ABSTRACT TRUNCATED) 50 | -------------------------------------------------------------------------------- /homework/x1d.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/homework/x1d.npy -------------------------------------------------------------------------------- /homework/x2d.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/homework/x2d.npy -------------------------------------------------------------------------------- /images/Scraping data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/images/Scraping data.png -------------------------------------------------------------------------------- /images/hw2_q4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/images/hw2_q4.png -------------------------------------------------------------------------------- /lectures/02D_Classes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Classes\n", 8 | "====\n", 9 | "\n", 10 | "As you probably know, Python is an object-oriented language, and so has very strong support for objects. In fact, everything in Python is an object. We will mostly use an imperative or functional rather than object-oriented programming style in this course. \n", 11 | "\n", 12 | "Here is the bare minimum about Python objects. " 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "Defining a new class\n", 20 | "----\n", 21 | "\n", 22 | "We define a class A with 2 'special' double underscore methods and one normal method. This class will have an attribute `x` that is specified at the time of creating new instances of the class.\n", 23 | "\n", 24 | "- The __init__ method initializes properties of any new instance of A\n", 25 | "- The __repr__ method provides an accurate string representation of A. For example, if we print an instance of A, the __repr__ method will be used. If you don't specify a __repr__ (or __str__) special method, the default representation used when printing gives only the class name and its address in memory.\n", 26 | "\n", 27 | "There are many more special methods, as described in the [official documentation](https://docs.python.org/3.5/reference/datamodel.html). We will not go there."
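A quick aside (an addition, not part of the original notebook) on what a good `__repr__` buys you. The contrast below is a minimal, self-contained sketch; the class names are made up for illustration:

```python
class NoRepr:
    pass

class WithRepr:
    def __init__(self, x):
        self.x = x

    def __repr__(self):
        # An accurate repr can often be eval'ed back into an equivalent object
        return 'WithRepr(%r)' % self.x

print(NoRepr())                      # default repr: <__main__.NoRepr object at 0x...>
print(WithRepr(3.14))                # our repr: WithRepr(3.14)
print(eval(repr(WithRepr(3.14))).x)  # round-trips: 3.14
```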
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 62, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "class A:\n", 39 | " \"\"\"Base class.\"\"\"\n", 40 | "\n", 41 | " def __init__(self, x):\n", 42 | " self.x = x\n", 43 | "\n", 44 | " def __repr__(self):\n", 45 | " return '%s(%a)' % (self.__class__.__name__, self.x)\n", 46 | "\n", 47 | " def report(self):\n", 48 | " \"\"\"Report type of contained value.\"\"\"\n", 49 | "\n", 50 | " return 'My value is of type %s' % type(self.x)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Docstrings\n", 58 | "----" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 72, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "'Base class.'" 72 | ] 73 | }, 74 | "execution_count": 72, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "A.__doc__" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 74, 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "Help on class A in module __main__:\n", 95 | "\n", 96 | "class A(builtins.object)\n", 97 | " | Base class.\n", 98 | " | \n", 99 | " | Methods defined here:\n", 100 | " | \n", 101 | " | __init__(self, x)\n", 102 | " | Initialize self. See help(type(self)) for accurate signature.\n", 103 | " | \n", 104 | " | __repr__(self)\n", 105 | " | Return repr(self).\n", 106 | " | \n", 107 | " | report(self)\n", 108 | " | Report type of contained value.\n", 109 | " | \n", 110 | " | ----------------------------------------------------------------------\n", 111 | " | Data descriptors defined here:\n", 112 | " | \n", 113 | " | __dict__\n", 114 | " | dictionary for instance variables (if defined)\n", 115 | " | \n", 116 | " | __weakref__\n", 117 | " | list of weak references to the object (if defined)\n", 118 | "\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "help(A)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 73, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "'Report type of contained value.'" 137 | ] 138 | }, 139 | "execution_count": 73, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "A.report.__doc__" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Making instance of a class\n", 153 | "----" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "#### Example of a class without __repr__." 
161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 64, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "class X:\n", 172 | " \"\"\"Empty class.\"\"\"" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 65, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "<__main__.X object at 0x1115eda20>\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "x = X()\n", 192 | "print(x)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "#### Make new instances of the class A" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 59, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "a0 = A('a')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 67, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "A('a')\n" 225 | ] 226 | } 227 | ], 228 | "source": [ 229 | "print(a0)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 60, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "a1 = A(x = 3.14)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 68, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "A(3.14)\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "print(a1)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "#### Attribute access" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 51, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "('a', 3.14)" 280 | ] 281 | }, 282 | "execution_count": 51, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "a0.x, a1.x" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "#### Method access" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 52, 301 | "metadata": { 302 | "collapsed": false 303 | }, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/plain": [ 308 | "(\"My value is of type <class 'str'>\", \"My value is of type <class 'float'>\")" 309 | ] 310 | }, 311 | "execution_count": 52, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "a0.report(), a1.report()" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "Class inheritance\n", 325 | "----" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 63, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "class B(A):\n", 337 | " \"\"\"Derived class inherits from A.\"\"\"\n", 338 | "\n", 339 | " def report(self):\n", 340 | " \"\"\"Overwrite report() method of A.\"\"\"\n", 341 | " return self.x" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 71, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "'Derived class inherits from 
A.'" 355 | ] 356 | }, 357 | "execution_count": 71, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "B.__doc__" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "#### Make new instances of class B" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 54, 376 | "metadata": { 377 | "collapsed": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "b0 = B(3 + 4j)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 55, 387 | "metadata": { 388 | "collapsed": true 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "b1 = B(x = a1)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "#### Attribute access" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 56, 405 | "metadata": { 406 | "collapsed": false 407 | }, 408 | "outputs": [ 409 | { 410 | "data": { 411 | "text/plain": [ 412 | "(3+4j)" 413 | ] 414 | }, 415 | "execution_count": 56, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "b0.x" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 57, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "A(3.14)" 435 | ] 436 | }, 437 | "execution_count": 57, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "b1.x" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "#### Nested attribute access" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 58, 456 | "metadata": { 457 | "collapsed": false 458 | }, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "\"My value is of type \"" 464 | ] 465 | }, 466 | "execution_count": 58, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "b1.x.report()" 473 | ] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "Python 3", 479 | "language": "python", 480 | "name": "python3" 481 | }, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": "ipython3", 492 | "version": "3.5.1" 493 | } 494 | }, 495 | "nbformat": 4, 496 | "nbformat_minor": 0 497 | } 498 | -------------------------------------------------------------------------------- /lectures/19A_Parallel_Programming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Parallel Programming\n", 8 | "====\n", 9 | "\n", 10 | "The goal is to design parallel programs that are flexible, efficient and simple.\n", 11 | "\n", 12 | "**Step 0**: Start by profiling a serial program to identify bottlenecks\n", 13 | "\n", 14 | "**Step 1**: Are there for opportunities for parallelism?\n", 15 | "\n", 16 | "- Can tasks be performed in parallel?\n", 17 | " - Function calls\n", 18 | " - Loops\n", 19 | "- Can data be split and operated on in parallel?\n", 20 | " - Decomposition of arrays along rows, columns, blocks\n", 21 | " - Decomposition of trees into sub-trees\n", 22 | "- Is there a pipeline with a 
sequence of stages?\n", 23 | " - Data preprocessing and analysis\n", 24 | " - Graphics rendering\n", 25 | "\n", 26 | "**Step 2**: What is the nature of the parallelism?\n", 27 | "\n", 28 | "- Linear\n", 29 | " - Embarrassingly parallel programs\n", 30 | "- Recursive\n", 31 | " - Adaptive partitioning methods\n", 32 | "\n", 33 | "**Step 3**: What is the granularity?\n", 34 | "\n", 35 | "- 10s of jobs\n", 36 | "- 1000s of jobs\n", 37 | "\n", 38 | "**Step 4**: Choose an algorithm\n", 39 | "\n", 40 | "- Organize by tasks\n", 41 | " - Task parallelism\n", 42 | " - Divide and conquer\n", 43 | "\n", 44 | "- Organize by data\n", 45 | " - Geometric decomposition\n", 46 | " - Recursive decomposition\n", 47 | "\n", 48 | "- Organize by flow\n", 49 | " - Pipeline\n", 50 | " - Event-based processing\n", 51 | "\n", 52 | "**Step 5**: Map to program and data structures\n", 53 | "\n", 54 | "- Program structures\n", 55 | " - Single program multiple data (SPMD)\n", 56 | " - Master/worker\n", 57 | " - Loop parallelism\n", 58 | " - Fork/join\n", 59 | "- Data structures \n", 60 | " - Shared data\n", 61 | " - Shared queue\n", 62 | " - Distributed array\n", 63 | "\n", 64 | "**Step 6**: Map to parallel environment\n", 65 | "\n", 66 | "- Multi-core shared memory\n", 67 | " - Cython with OpenMP\n", 68 | " - multiprocessing\n", 69 | " - IPython.cluster\n", 70 | "- Multi-computer\n", 71 | " - IPython.cluster\n", 72 | " - MPI\n", 73 | " - Hadoop / Spark\n", 74 | "- GPU\n", 75 | " - CUDA\n", 76 | " - OpenCL\n", 77 | "\n", 78 | "**Step 7**: Execute, debug, tune in parallel environment" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Embarrassingly parallel programs\n", 86 | "----\n", 87 | "\n", 88 | "Many statistical problems are embarrassingly parallel and can be easily decomposed into independent tasks or data sets. Here are several examples:\n", 89 | "\n", 90 | "- Monte Carlo integration\n", 91 | "- Multiple chains of MCMC\n", 92 | "- Bootstrap for confidence intervals\n", 93 | "- Power calculations by simulation\n", 94 | "- Permutation-resampling tests \n", 95 | "- Fitting same model on multiple data sets\n", 96 | "\n", 97 | "Other problems are serial at small scale, but can be parallelized at large scales. For example, EM and MCMC iterations are inherently serial since there is a dependence on the previous state, but within a single iteration, there can be many thousands of density calculations (one for each data point to calculate the likelihood), and this is an embarrassingly parallel problem within a single iteration. \n", 98 | "\n", 99 | "These \"low hanging fruits\" are great because they offer a path to easy parallelism with minimal complexity." 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "Executing parallel code\n", 107 | "----\n", 108 | "\n", 109 | "**The bigger the problem, the more scope there is for parallelism**\n", 110 | "\n", 111 | "**Amdahl's law** says that the speedup from parallelization is bounded by the ratio of parallelizable to irreducibly serial code in the algorithm. However, for big data analysis, **Gustafson's Law** is more relevant. This says that we are nearly always interested in increasing the size of the parallelizable bits, and the ratio of parallelizable to irreducibly serial code is not a static quantity but depends on data size. For example, Gibbs sampling has an irreducibly serial nature, but for large samples, each iteration may be able to perform PDF evaluations in parallel for zillions of data points.
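To make the Amdahl/Gustafson contrast concrete, here is a small illustrative calculation (an addition, not part of the original notes). It uses the standard formula $S = 1/((1-p) + p/n)$ for the best-case speedup with parallel fraction $p$ on $n$ workers:

```python
def amdahl_speedup(p, n):
    """Best-case speedup for parallel fraction p on n workers."""
    return 1 / ((1 - p) + p / n)

for p in [0.5, 0.9, 0.99]:
    print(p, round(amdahl_speedup(p, 8), 2), round(amdahl_speedup(p, 1024), 2))
# 0.5  1.78  2.0
# 0.9  4.71  9.91
# 0.99 7.48  91.18
# Even with 1024 workers, a 50% serial fraction caps the speedup near 2;
# Gustafson's point is that p itself grows with the size of the data.
```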
" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "Coming highlights\n", 119 | "-----\n", 120 | "\n", 121 | "- Parallelism in pre-built packages \n", 122 | " - `sklearn`\n", 123 | " - `pymc3`\n", 124 | " - `pystan`\n", 125 | "- Parallelism when compiling to native code\n", 126 | " - Using `target=parallel` in `numba.vectorize` and `numba.guvectorize`\n", 127 | " - Using `openmp` with `cython.parallel`, `cython.prange` and `cython.nogil`\n", 128 | "- Parallelism for multi-core computers\n", 129 | " - Using `concurrent.futures`\n", 130 | " - Using `multiprocessing`\n", 131 | " - Using `ipyparallel` within Jupyter\n", 132 | "- Data too big for memory but not for disk\n", 133 | " - `memmap`\n", 134 | " - `HDF5` and `h5py`\n", 135 | " - Using `dask`\n", 136 | " - Using `blaze`\n", 137 | "- Data too big for one computer\n", 138 | " - Distributed storage\n", 139 | " - Data sketches\n", 140 | " - Using `pyspark`" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [] 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "Python 3", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 3 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.5.1" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 0 174 | } 175 | -------------------------------------------------------------------------------- /lectures/21E_Spark_And_Sklearn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.5.1" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 0 34 | } 35 | -------------------------------------------------------------------------------- /lectures/21F_Spark_GraphX.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.5.1" 30 | } 31 | }, 32 | "nbformat": 4, 33 | 
"nbformat_minor": 0 34 | } 35 | -------------------------------------------------------------------------------- /lectures/21G_Spark_Streaming,ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Spark Streaming\n", 8 | "====\n", 9 | "\n", 10 | "The Spark Streaming library takes a stream of data and breaks it up into micro-batches that are then processed, giving the illusion of a continually updated stream of results." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "Resources\n", 18 | "----\n", 19 | "\n", 20 | "[Spark Streaming Programming Guide](http://spark.apache.org/docs/latest/streaming-programming-guide.html)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Streaming using sockets\n", 28 | "----\n", 29 | "\n", 30 | "We will first illustrate the idea of streaming data over TCP/IP with the Python standard library `socket` module. The consumer and producer should be run in separate terminals\n", 31 | "\n", 32 | "Terminal 1\n", 33 | "```bash\n", 34 | "python consumer.py localhost 10000\n", 35 | "```\n", 36 | "\n", 37 | "Terminal 2\n", 38 | "```bash\n", 39 | "python producer.py localhost 10000\n", 40 | "```" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Consumer keeps a running word count" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 1, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Overwriting consumer.py\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "%%file consumer.py\n", 67 | "import sys\n", 68 | "import socket\n", 69 | "from collections import Counter\n", 70 | "\n", 71 | "HOST = sys.argv[1]\n", 72 | "PORT = int(sys.argv[2])\n", 73 | "\n", 74 | "s = socket.socket()\n", 75 | "s.bind((HOST, PORT))\n", 76 | "s.listen(4)\n", 77 | "connection, address = s.accept()\n", 78 | "\n", 79 | "c = Counter()\n", 80 | "\n", 81 | "while True:\n", 82 | " line = connection.recv(64)\n", 83 | " words = line.split()\n", 84 | " if words:\n", 85 | " c.update(words)\n", 86 | " print(c.most_common(5))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### Producer sends data to server for processing" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 2, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Overwriting client.py\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "%%file client.py\n", 113 | "import socket\n", 114 | "import time\n", 115 | "import sys\n", 116 | "\n", 117 | "HOST = sys.argv[1]\n", 118 | "PORT = int(sys.argv[2])\n", 119 | "s = socket.socket()\n", 120 | "s.connect((HOST, PORT))\n", 121 | "while True:\n", 122 | " for line in open('data/Ulysses.txt'):\n", 123 | " s.sendall(str.encode(line))\n", 124 | " time.sleep(1)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Using Spark Streaming\n", 132 | "----\n", 133 | "\n", 134 | "Now we'll replace the consumer with a Spark application. This will work with micro-batches lasting 2 seconds." 
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 1, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "from pyspark import SparkContext\n", 146 | "\n", 147 | "sc = SparkContext('local[*]')" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 5, 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "[('the', 13600), ('of', 8127), ('and', 6542), ('a', 5842), ('to', 4787)]" 161 | ] 162 | }, 163 | "execution_count": 5, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "lines = sc.textFile('data/Ulysses.txt')\n", 170 | "\n", 171 | "counts = (lines.flatMap(lambda line: line.split())\n", 172 | " .map(lambda word: (word, 1))\n", 173 | " .reduceByKey(lambda x,y: x+ y))\n", 174 | "\n", 175 | "counts.takeOrdered(5, key=lambda x: -x[1])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "### Monitor a directory for new or renamed files" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 3, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "Writing file_consumer.py\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "%%file file_consumer.py\n", 202 | "\n", 203 | "import sys\n", 204 | "from pyspark import SparkContext\n", 205 | "from pyspark.streaming import StreamingContext\n", 206 | "\n", 207 | "sc = SparkContext('local[*]')\n", 208 | "sc.setLogLevel(\"WARN\")\n", 209 | "\n", 210 | "ssc = StreamingContext(sc, 2)\n", 211 | "lines = ssc.textFileStream(sys.argv[1])\n", 212 | "\n", 213 | "counts = (lines.flatMap(lambda line: line.split())\n", 214 | " .map(lambda word: (word, 1))\n", 215 | " .reduceByKey(lambda x,y: x+ y))\n", 216 | "\n", 217 | "counts.pprint()\n", 218 | "\n", 219 | "ssc.start()\n", 220 | "ssc.awaitTermination()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "### Usage\n", 228 | "\n", 229 | "Run in terminal\n", 230 | "```bash\n", 231 | "~/anaconda3/share/spark-1.6.0/bin/spark-submit file_consumer.py <directory>\n", 232 | "```\n", 233 | "\n", 234 | "When you copy, move or save a file to `<directory>`, the word counts for that file will be updated.
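One way to exercise the directory monitor (illustrative only; `streamdir` is a made-up scratch directory passed as the argument to `file_consumer.py`, and the final `mv` is used because Spark expects new files to appear atomically in the watched directory):

```bash
mkdir -p streamdir
cp data/Ulysses.txt /tmp/batch1.txt
mv /tmp/batch1.txt streamdir/   # counts should appear in the next micro-batch
```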
" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### Monitor a TCP/IP socket" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 4, 247 | "metadata": { 248 | "collapsed": false 249 | }, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "Overwriting socket_consumer.py\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "%%file socket_consumer.py\n", 261 | "\n", 262 | "import sys\n", 263 | "from pyspark import SparkContext\n", 264 | "from pyspark.streaming import StreamingContext\n", 265 | "\n", 266 | "sc = SparkContext('local[*]')\n", 267 | "sc.setLogLevel(\"WARN\")\n", 268 | "\n", 269 | "ssc = StreamingContext(sc, 2)\n", 270 | "lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))\n", 271 | "\n", 272 | "counts = (lines.flatMap(lambda line: line.split())\n", 273 | " .map(lambda word: (word, 1))\n", 274 | " .reduceByKey(lambda x,y: x+ y))\n", 275 | "\n", 276 | "counts.pprint()\n", 277 | "\n", 278 | "ssc.start()\n", 279 | "ssc.awaitTermination()" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "### Usage\n", 287 | "\n", 288 | "Run in terminal\n", 289 | "```bash\n", 290 | "~/anaconda3/share/spark-1.6.0/bin/spark-submit socket_consumer.py localhost 10000\n", 291 | "```\n", 292 | "\n", 293 | "In a different terminal\n", 294 | "```\n", 295 | "nc -lk 10000\n", 296 | "```\n", 297 | "\n", 298 | "Any text pasted in the `nc` terminal will have its words counted." 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": { 304 | "collapsed": true 305 | }, 306 | "source": [ 307 | "### Keeping state" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 6, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "Overwriting stateful_socket_consumer.py\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "%%file stateful_socket_consumer.py\n", 327 | "\n", 328 | "import sys\n", 329 | "from pyspark import SparkContext\n", 330 | "from pyspark.streaming import StreamingContext\n", 331 | "\n", 332 | "def updateFunc(new, last):\n", 333 | " if last is None:\n", 334 | " last = 0\n", 335 | " return sum(new) + last\n", 336 | "\n", 337 | "sc = SparkContext('local[*]')\n", 338 | "sc.setLogLevel(\"WARN\")\n", 339 | "\n", 340 | "ssc = StreamingContext(sc, 2)\n", 341 | "ssc.checkpoint(\"checkpoint\")\n", 342 | "\n", 343 | "lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))\n", 344 | "\n", 345 | "counts = (lines.flatMap(lambda line: line.split())\n", 346 | " .map(lambda word: (word, 1))\n", 347 | " .updateStateByKey(updateFunc)\n", 348 | " .transform(lambda x: x.sortByKey()))\n", 349 | "\n", 350 | "counts.pprint()\n", 351 | "\n", 352 | "ssc.start()\n", 353 | "ssc.awaitTermination()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "source": [ 362 | "### Usage\n", 363 | "\n", 364 | "Same as above, but the Spark program will now maintain an updated running count." 
365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": true 399 | }, 400 | "outputs": [], 401 | "source": [] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": { 425 | "collapsed": true 426 | }, 427 | "outputs": [], 428 | "source": [] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": true 444 | }, 445 | "outputs": [], 446 | "source": [] 447 | } 448 | ], 449 | "metadata": { 450 | "kernelspec": { 451 | "display_name": "Python 3", 452 | "language": "python", 453 | "name": "python3" 454 | }, 455 | "language_info": { 456 | "codemirror_mode": { 457 | "name": "ipython", 458 | "version": 3 459 | }, 460 | "file_extension": ".py", 461 | "mimetype": "text/x-python", 462 | "name": "python", 463 | "nbconvert_exporter": "python", 464 | "pygments_lexer": "ipython3", 465 | "version": "3.5.1" 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 0 470 | } 471 | -------------------------------------------------------------------------------- /lectures/21H_Spark_Cloud.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Spark on Cloud\n", 8 | "====\n", 9 | "\n", 10 | "How to set up and run Spark on Azure or AWS EC2 clusters." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "Azure\n", 18 | "----\n", 19 | "\n", 20 | "Follow [instructions provided by Microsoft](https://azure.microsoft.com/en-us/documentation/articles/hdinsight-apache-spark-jupyter-spark-sql/).\n", 21 | "\n", 22 | "To terminate the cluster, you have to **delete** it." 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "AWS\n", 30 | "----\n", 31 | "\n", 32 | "AWS setup is more involved. 
We will show how to access `pyspark` via ssh to an `EMR` cluster, as well as how to set up the `Zeppelin` browser-based notebook (similar to Jupyter).\n", 33 | "\n", 34 | "**References**\n", 35 | "\n", 36 | "- [EMR Spark](http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-spark.html)\n", 37 | "- [AWS tutorial](http://docs.aws.amazon.com/ElasticMapReduce/latest/ManagementGuide/emr-gs.html)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Know your AWS public and private [access keys](http://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSGettingStartedGuide/AWSCredentials.html)\n", 45 | "----\n", 46 | "\n", 47 | "These will look something like\n", 48 | "\n", 49 | "- public: `AKIAIOSFODNN7EXAMPLE`\n", 50 | "- private: `wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY`" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Know your AWS EC2 [key-pair](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html)\n", 58 | "----\n", 59 | "\n", 60 | "This is a name that you give - mine is `cliburn-2016` and an associated PEM file - I keep mine at ~/AWS/cliburn-2016.pem.\n", 61 | "\n", 62 | "Set the correct permissions on the PEM file.\n", 63 | "```\n", 64 | "chmod 400 xxx.pem\n", 65 | "```" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "source": [ 74 | "Install AWS command line client\n", 75 | "----\n", 76 | "\n", 77 | "```\n", 78 | "pip install awscli\n", 79 | "```\n", 80 | "\n", 81 | "If you run into problems, see [docs](http://docs.aws.amazon.com/cli/latest/userguide/installing.html)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Configure the AWS command line client\n", 89 | "----\n", 90 | "\n", 91 | "```\n", 92 | "aws configure\n", 93 | "```\n", 94 | "\n", 95 | "```\n", 96 | "AWS Access Key ID: <your access key>\n", 97 | "AWS Secret Access Key: <your secret key>\n", 98 | "Default region name: us-east-1\n", 99 | "Default output format: json\n", 100 | "```" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Create a cluster\n", 108 | "----\n", 109 | "\n", 110 | "**Warning**: You will be charged for this.\n", 111 | "\n", 112 | "```\n", 113 | "aws emr create-cluster --name \"<cluster name>\" --release-label emr-4.5.0 --applications Name=Spark Name=Zeppelin-Sandbox --ec2-attributes KeyName=<your key name> --instance-type m3.xlarge --instance-count 3 --use-default-roles\n", 114 | "```\n", 115 | "\n", 116 | "For example, I start mine with\n", 117 | "```\n", 118 | "aws emr create-cluster --name \"spak-2016-d\" --release-label emr-4.5.0 --applications Name=Spark Name=Zeppelin-Sandbox --ec2-attributes KeyName=\"cliburn-2016\" --instance-type m3.xlarge --instance-count 3 --use-default-roles\n", 119 | "```\n", 120 | "\n", 121 | "A cluster-id should be returned\n", 122 | "```\n", 123 | "{\n", 124 | " \"ClusterId\": \"j-XXXXXXXXXXXXXXX\"\n", 125 | "}\n", 126 | "```" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Get information about the cluster\n", 134 | "-----\n", 135 | "\n", 136 | "```\n", 137 | "aws emr describe-cluster --cluster-id j-XXXXXXXXXXXXXXX\n", 138 | "```\n", 139 | "\n", 140 | "or just inspect the state\n", 141 | "```\n", 142 | "aws emr describe-cluster --cluster-id j-XXXXXXXXXXXXXXX | grep \\\"State\\\"\n", 143 | "```" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | 
"Connect to the cluster via `ssh`\n", 151 | "----\n", 152 | "\n", 153 | "```\n", 154 | "aws emr ssh --cluster-id -XXXXXXXXXXXXXXX --key-pair-file cliburn-2016.pem \n", 155 | "```" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Note the IP address that is returned\n", 163 | "----\n", 164 | "\n", 165 | "It will be something like `ec2-XX-X-XX-XXX.compute-1.amazonaws.com`" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Run `pyspark`\n", 173 | "----\n", 174 | "\n", 175 | "Run\n", 176 | "```\n", 177 | "pyspark\n", 178 | "```\n", 179 | "\n", 180 | "And you will be in a `pyspark` console where you can issue Spark commands.\n", 181 | "\n", 182 | "When you've had enough fun playing in `pyspark` for a while, end the session with `Ctrl-D` and exit to leave the `ssh` session." 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "Run the `Zepellin` [notebook](https://zeppelin.incubator.apache.org)\n", 190 | "----\n", 191 | "\n", 192 | "Create an SSH tunnel to port 8890\n", 193 | "\n", 194 | "```\n", 195 | "ssh -i xxx.pem -L 8192:ec2-xx-xx-xx.compute-1.amazonaws.com:8192 hadoop@ec2-xx-xx-xx-xx.compute-1.amazonaws.com -N -v\n", 196 | "```\n", 197 | "\n", 198 | "Fill in the `xxx` with the locatin of your PEM file, and the appropriate IP address." 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "Connect to `Zeppelin` notebook\n", 206 | "----\n", 207 | "\n", 208 | "Open a browser to http://localhost:8890/ - if it worked you should see this\n", 209 | "\n", 210 | "![Zeppelin screenshot](http://cloudacademy.com/blog/wp-content/uploads/2016/01/Zeppelin-Notebook-1.png)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "Create notebook and run Spark within it\n", 218 | "----\n", 219 | "\n", 220 | "The default cell uses `scala`. For `pyspark` just start a cell with `%pyspark`." 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "Terminate the cluster\n", 228 | "----\n", 229 | "\n", 230 | "When you are done, remember to terminate the cluster!\n", 231 | "\n", 232 | "```\n", 233 | "aws emr terminate-clusters --cluster-id j-XXXXXXXXXXXXXXX\n", 234 | "```\n", 235 | "\n", 236 | "and confirm that it is terminating\n", 237 | "\n", 238 | "```\n", 239 | "aws emr describe-cluster --cluster-id j-XXXXXXXXXXXXXXX | grep \\\"State\\\"\n", 240 | "```\n", 241 | "\n", 242 | "You should see\n", 243 | "\n", 244 | "```\n", 245 | " \"State\": \"TERMINATING\"\n", 246 | " \"State\": \"TERMINATING\"\n", 247 | " \"State\": \"TERMINATING\"\n", 248 | "```\n", 249 | "\n", 250 | "If you are paranoid, log into the [AWS Management Console ](https://aws.amazon.com/console/) and click on `Services | EMR` and check the status of your cluster." 
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.5.1" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 0 284 | } 285 | -------------------------------------------------------------------------------- /lectures/Customizing_Jupyter.ipynb: -------------------------------------------------------------------------------- 1 | ../misc/Customizing_Jupyter.ipynb -------------------------------------------------------------------------------- /lectures/Extra_Packages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Extra Packages\n", 8 | "====\n", 9 | "\n", 10 | "These are packages used in the notebooks that are not part of the standard Anaconda distribution. If you get a package not found error, execute the appropriate installation code cell and restart the kernel in the notebook with the error." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "source": [ 19 | "```bash\n", 20 | "\n", 21 | "! conda update conda\n", 22 | "! conda update -y matplotlib pandas scikit-learn seaborn\n", 23 | "\n", 24 | "! pip install version_information\n", 25 | "! pip install rpy2\n", 26 | "! pip install ggplot\n", 27 | "! pip install qgrid\n", 28 | "! pip install ipython-sql\n", 29 | "! pip install pandasql\n", 30 | "! pip install lshash\n", 31 | "! pip install hat-trie\n", 32 | "! pip install hyperloglog\n", 33 | "! 
pip install git+https://github.com/jaybaird/python-bloomfilter.git\n", 34 | "```" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [] 45 | } 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3", 50 | "language": "python", 51 | "name": "python3" 52 | }, 53 | "language_info": { 54 | "codemirror_mode": { 55 | "name": "ipython", 56 | "version": 3 57 | }, 58 | "file_extension": ".py", 59 | "mimetype": "text/x-python", 60 | "name": "python", 61 | "nbconvert_exporter": "python", 62 | "pygments_lexer": "ipython3", 63 | "version": "3.5.1" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 0 68 | } 69 | -------------------------------------------------------------------------------- /lectures/HtWt.csv: -------------------------------------------------------------------------------- 1 | male,height,weight 2 | 0,63.2,168.7 3 | 0,68.7,169.8 4 | 0,64.8,176.6 5 | 0,67.9,246.8 6 | 1,68.9,151.6 7 | 1,67.8,158.0 8 | 1,68.2,168.6 9 | 0,64.8,137.2 10 | 1,64.3,177.0 11 | 0,64.7,128.0 12 | 1,66.9,168.4 13 | 1,66.9,136.2 14 | 1,67.1,160.3 15 | 1,70.2,233.9 16 | 1,67.4,171.7 17 | 1,71.1,185.5 18 | 0,63.4,177.6 19 | 1,66.9,132.9 20 | 0,71.0,140.1 21 | 1,70.4,151.9 22 | 0,59.5,147.2 23 | 1,70.4,159.0 24 | 0,61.5,113.0 25 | 1,74.5,194.5 26 | 0,65.3,145.1 27 | 1,68.8,196.5 28 | 0,67.2,148.9 29 | 1,68.7,132.9 30 | 0,60.0,168.4 31 | 0,62.5,146.2 32 | 1,72.0,236.4 33 | 1,67.9,140.0 34 | 1,65.1,156.2 35 | 1,63.5,178.7 36 | 1,68.2,147.5 37 | 0,64.6,97.7 38 | 1,68.1,189.6 39 | 0,66.2,221.9 40 | 0,62.8,168.1 41 | 0,65.3,143.1 42 | 0,65.8,217.7 43 | 0,68.7,133.2 44 | 0,63.8,96.5 45 | 1,70.6,270.6 46 | 0,61.5,137.2 47 | 0,61.9,124.2 48 | 0,65.1,128.3 49 | 1,68.7,203.6 50 | 0,57.6,132.4 51 | 1,66.3,189.4 52 | 1,69.0,174.0 53 | 0,63.4,163.3 54 | 1,69.5,183.5 55 | 1,67.8,193.8 56 | 0,61.6,119.7 57 | 1,71.2,157.4 58 | 1,67.4,146.1 59 | 0,66.1,128.3 60 | 1,70.7,179.1 61 | 0,67.0,140.0 62 | 1,66.8,202.2 63 | 1,69.9,169.4 64 | 0,57.7,122.8 65 | 0,62.5,248.5 66 | 1,66.6,154.4 67 | 0,60.6,140.2 68 | 1,70.4,141.6 69 | 0,66.4,144.4 70 | 0,62.3,116.2 71 | 1,73.3,175.0 72 | -------------------------------------------------------------------------------- /lectures/Lagrange_multiplier.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/Lagrange_multiplier.png -------------------------------------------------------------------------------- /lectures/Local_Installation.ipynb: -------------------------------------------------------------------------------- 1 | ../misc/Local_Installation.ipynb -------------------------------------------------------------------------------- /lectures/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. 
If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ComputationalStatisticsinPython.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ComputationalStatisticsinPython.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/ComputationalStatisticsinPython" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ComputationalStatisticsinPython" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
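# Editor's note: every builder rule in this Makefile follows the same
# three-line pattern (.PHONY declaration, target, sphinx-build recipe), so a
# new builder can be wired in by analogy. A hedged sketch, kept commented out
# because recipe lines must begin with a literal tab and because the "dummy"
# builder is only available if the installed Sphinx version ships it:
#
# .PHONY: dummy
# dummy:
# 	$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
# 	@echo "Dummy build finished; no output files are produced."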
180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /lectures/Spark03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Spark Libraries\n", 8 | "====\n", 9 | "\n", 10 | "A tour of the Spark SQL, Streaming, GraphX, and MLlib libraries." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "from pyspark import SparkContext\n", 22 | "sc = SparkContext('local[*]')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.5.1" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 0 56 | } 57 | -------------------------------------------------------------------------------- /lectures/Spark04.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Spark on a Cluster\n", 8 | "====\n", 9 | "\n", 10 | "How to set up and run Spark on an AWS EC2 cluster."
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [] 21 | } 22 | ], 23 | "metadata": { 24 | "kernelspec": { 25 | "display_name": "Python 3", 26 | "language": "python", 27 | "name": "python3" 28 | }, 29 | "language_info": { 30 | "codemirror_mode": { 31 | "name": "ipython", 32 | "version": 3 33 | }, 34 | "file_extension": ".py", 35 | "mimetype": "text/x-python", 36 | "name": "python", 37 | "nbconvert_exporter": "python", 38 | "pygments_lexer": "ipython3", 39 | "version": "3.5.1" 40 | } 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 0 44 | } 45 | -------------------------------------------------------------------------------- /lectures/Template01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Jupyter notebook\n", 8 | "----\n", 9 | "```\n", 10 | "Quick tour\n", 11 | "Keyboard shortcuts\n", 12 | "Markdown cells\n", 13 | "Code cells\n", 14 | "Switching kernels\n", 15 | "```" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "\n", 27 | "\n", 28 | "\n", 29 | "\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Unix shell\n", 37 | "-----\n", 38 | "```\n", 39 | "Getting information: man, info, Google\n", 40 | "File and directory navigation - pwd, ls, cd, mkdir, rm, cp, mv\n", 41 | "Pipes and I/O: |, >, <\n", 42 | "Finding stuff: find, grep, locate\n", 43 | "```" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "\n", 55 | "\n", 56 | "\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Interactive Python\n", 65 | "----\n", 66 | "```\n", 67 | "help, ?, ??\n", 68 | "magic functions\n", 69 | "Calling R from Jupyter\n", 70 | "Simple calculations\n", 71 | "Everything is an object\n", 72 | "```" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Types and Collections\n", 94 | "----\n", 95 | "```\n", 96 | "bool, int, float, complex\n", 97 | "string\n", 98 | "None\n", 99 | "tuple and named_tuple, tuple unpacking\n", 100 | "list \n", 101 | "set\n", 102 | "dictionary, OrderedDict, defaultdict\n", 103 | "numpy.array\n", 104 | "```" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "\n", 116 | "\n", 117 | "\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Operators\n", 125 | "----\n", 126 | "```\n", 127 | "+, -, *, /, //, **, %\n", 128 | "==, != , <, >=\n", 129 | "and, or, not, ~, |, &\n", 130 | "<<, >>\n", 131 | "+=, *= etc\n", 132 | "in\n", 133 | "Operator overloading - list, set, dict operators\n", 134 | "```" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | 
"source": [ 145 | "\n", 146 | "\n", 147 | "\n", 148 | "\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "Names, assignment and identity\n", 156 | "----\n", 157 | "```\n", 158 | "=, ==, is, id\n", 159 | "```" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "\n", 171 | "\n", 172 | "\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Exercise: Word counter\n", 180 | "----\n", 181 | "\n", 182 | "Coutn the number of times each word occurs in the poem Jabberwocky. Ignore case and remove all puncutation." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "jabberwocky = '''\n", 194 | "’Twas brillig, and the slithy toves\n", 195 | " Did gyre and gimble in the wabe:\n", 196 | "All mimsy were the borogoves,\n", 197 | " And the mome raths outgrabe.\n", 198 | "\n", 199 | "“Beware the Jabberwock, my son!\n", 200 | " The jaws that bite, the claws that catch!\n", 201 | "Beware the Jubjub bird, and shun\n", 202 | " The frumious Bandersnatch!”\n", 203 | "\n", 204 | "He took his vorpal sword in hand;\n", 205 | " Long time the manxome foe he sought—\n", 206 | "So rested he by the Tumtum tree\n", 207 | " And stood awhile in thought.\n", 208 | "\n", 209 | "And, as in uffish thought he stood,\n", 210 | " The Jabberwock, with eyes of flame,\n", 211 | "Came whiffling through the tulgey wood,\n", 212 | " And burbled as it came!\n", 213 | "\n", 214 | "One, two! One, two! And through and through\n", 215 | " The vorpal blade went snicker-snack!\n", 216 | "He left it dead, and with its head\n", 217 | " He went galumphing back.\n", 218 | "\n", 219 | "“And hast thou slain the Jabberwock?\n", 220 | " Come to my arms, my beamish boy!\n", 221 | "O frabjous day! Callooh! Callay!”\n", 222 | " He chortled in his joy.\n", 223 | "\n", 224 | "’Twas brillig, and the slithy toves\n", 225 | " Did gyre and gimble in the wabe:\n", 226 | "All mimsy were the borogoves,\n", 227 | " And the mome raths outgrabe.\n", 228 | "'''" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "\n", 240 | "\n", 241 | "\n", 242 | "\n" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "Control Flow\n", 250 | "----\n", 251 | "```\n", 252 | "if, elif, else\n", 253 | "for\n", 254 | "while\n", 255 | "continue\n", 256 | "break\n", 257 | "ternary operator\n", 258 | "```" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "\n", 270 | "\n", 271 | "\n", 272 | "\n" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Exercise: FizzBuzz\n", 280 | "----\n", 281 | "```\n", 282 | "The \"Fizz-Buzz test\" is an interview question designed to help filter out the 99.5% of programming job candidates who can't seem to program their way out of a wet paper bag. The text of the programming assignment is as follows:\n", 283 | "\"Write a program that prints the numbers from 1 to 100. 
But for multiples of three print “Fizz” instead of the number and for the multiples of five print “Buzz”. For numbers which are multiples of both three and five print “FizzBuzz”.\"\n", 284 | "```\n", 285 | "Source: [Using FizzBuzz to Find Developers who Grok Coding](http://tickletux.wordpress.com/2007/01/24/using-fizzbuzz-to-find-developers-who-grok-coding/)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "\n", 297 | "\n", 298 | "\n", 299 | "\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Built-in functions\n", 307 | "----" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 3, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/plain": [ 320 | "['abs',\n", 321 | " 'all',\n", 322 | " 'any',\n", 323 | " 'ascii',\n", 324 | " 'bin',\n", 325 | " 'bool',\n", 326 | " 'bytearray',\n", 327 | " 'bytes',\n", 328 | " 'callable',\n", 329 | " 'chr',\n", 330 | " 'classmethod',\n", 331 | " 'compile',\n", 332 | " 'complex',\n", 333 | " 'copyright',\n", 334 | " 'credits',\n", 335 | " 'delattr',\n", 336 | " 'dict',\n", 337 | " 'dir',\n", 338 | " 'divmod',\n", 339 | " 'dreload',\n", 340 | " 'enumerate',\n", 341 | " 'eval',\n", 342 | " 'exec',\n", 343 | " 'filter',\n", 344 | " 'float',\n", 345 | " 'format',\n", 346 | " 'frozenset',\n", 347 | " 'get_ipython',\n", 348 | " 'getattr',\n", 349 | " 'globals',\n", 350 | " 'hasattr',\n", 351 | " 'hash',\n", 352 | " 'help',\n", 353 | " 'hex',\n", 354 | " 'id',\n", 355 | " 'input',\n", 356 | " 'int',\n", 357 | " 'isinstance',\n", 358 | " 'issubclass',\n", 359 | " 'iter',\n", 360 | " 'len',\n", 361 | " 'license',\n", 362 | " 'list',\n", 363 | " 'locals',\n", 364 | " 'map',\n", 365 | " 'max',\n", 366 | " 'memoryview',\n", 367 | " 'min',\n", 368 | " 'next',\n", 369 | " 'object',\n", 370 | " 'oct',\n", 371 | " 'open',\n", 372 | " 'ord',\n", 373 | " 'pow',\n", 374 | " 'print',\n", 375 | " 'property',\n", 376 | " 'range',\n", 377 | " 'repr',\n", 378 | " 'reversed',\n", 379 | " 'round',\n", 380 | " 'set',\n", 381 | " 'setattr',\n", 382 | " 'slice',\n", 383 | " 'sorted',\n", 384 | " 'staticmethod',\n", 385 | " 'str',\n", 386 | " 'sum',\n", 387 | " 'super',\n", 388 | " 'tuple',\n", 389 | " 'type',\n", 390 | " 'vars',\n", 391 | " 'zip']" 392 | ] 393 | }, 394 | "execution_count": 3, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "([x for x in dir(__builtin__) if x.islower() and not x.startswith('__')])" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "\n", 412 | "\n", 413 | "\n", 414 | "\n" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "User-defined functions\n", 422 | "----\n", 423 | "```\n", 424 | "def\n", 425 | "lambda\n", 426 | "higher order functions\n", 427 | "```" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "\n", 439 | "\n", 440 | "\n", 441 | "\n" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "Functional style\n", 449 | "----\n", 450 | "```\n", 451 | "map, reduce, filter\n", 
452 | "comprehensions - list, set, dictionary\n", 453 | "generator expressions\n", 454 | "```" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": { 461 | "collapsed": true 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "\n", 466 | "\n", 467 | "\n", 468 | "\n" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "Exercise\n", 476 | "----\n", 477 | "\n", 478 | "Write a program to flatten a list of lists into a flat list. For example,\n", 479 | "```\n", 480 | "flatten([[1,2,3],[4,5],[6,7,8]]) should return [1,2,3,4,5,6,7,8].\n", 481 | "```\n", 482 | "\n", 483 | "Do this using\n", 484 | "- a for loop \n", 485 | "- a list comprehension" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": { 492 | "collapsed": true 493 | }, 494 | "outputs": [], 495 | "source": [ 496 | "\n", 497 | "\n", 498 | "\n", 499 | "\n" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "Modules\n", 507 | "----\n", 508 | "```\n", 509 | "Installing new modules\n", 510 | "Writing your own module\n", 511 | "Importing from a module\n", 512 | "```" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": { 519 | "collapsed": true 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "\n", 524 | "\n", 525 | "\n" 526 | ] 527 | } 528 | ], 529 | "metadata": { 530 | "kernelspec": { 531 | "display_name": "Python 3", 532 | "language": "python", 533 | "name": "python3" 534 | }, 535 | "language_info": { 536 | "codemirror_mode": { 537 | "name": "ipython", 538 | "version": 3 539 | }, 540 | "file_extension": ".py", 541 | "mimetype": "text/x-python", 542 | "name": "python", 543 | "nbconvert_exporter": "python", 544 | "pygments_lexer": "ipython3", 545 | "version": "3.5.1" 546 | } 547 | }, 548 | "nbformat": 4, 549 | "nbformat_minor": 0 550 | } 551 | -------------------------------------------------------------------------------- /lectures/commutative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/commutative.png -------------------------------------------------------------------------------- /lectures/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Computational Statistics in Python documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Jan 14 10:45:35 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | import sys 17 | import os 18 | import shlex 19 | import cloud_sptheme as csp 20 | 21 | # If extensions (or modules to document with autodoc) are in another directory, 22 | # add these directories to sys.path here. If the directory is relative to the 23 | # documentation root, use os.path.abspath to make it absolute, like shown here. 24 | #sys.path.insert(0, os.path.abspath('.')) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 
29 | #needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'nbsphinx', 36 | 'sphinx.ext.mathjax', 37 | 'IPython.sphinxext.ipython_console_highlighting', 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ['_templates'] 42 | 43 | # The suffix(es) of source filenames. 44 | # You can specify multiple suffix as a list of string: 45 | # source_suffix = ['.rst', '.md'] 46 | source_suffix = '.rst' 47 | 48 | # The encoding of source files. 49 | #source_encoding = 'utf-8-sig' 50 | 51 | # The master toctree document. 52 | master_doc = 'index' 53 | 54 | # General information about the project. 55 | project = 'Computational Statistics in Python' 56 | copyright = '2016, Cliburn Chan, Janice McCarthy' 57 | author = 'Cliburn Chan, Janice McCarthy' 58 | 59 | # The version info for the project you're documenting, acts as replacement for 60 | # |version| and |release|, also used in various other places throughout the 61 | # built documents. 62 | # 63 | # The short X.Y version. 64 | version = '0.1' 65 | # The full version, including alpha/beta/rc tags. 66 | release = '0.1' 67 | 68 | # The language for content autogenerated by Sphinx. Refer to documentation 69 | # for a list of supported languages. 70 | # 71 | # This is also used if you do content translation via gettext catalogs. 72 | # Usually you set "language" from the command line for these cases. 73 | language = None 74 | 75 | # There are two options for replacing |today|: either, you set today to some 76 | # non-false value, then it is used: 77 | #today = '' 78 | # Else, today_fmt is used as the format for a strftime call. 79 | #today_fmt = '%B %d, %Y' 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 83 | exclude_patterns = ['_build', '**.ipynb_checkpoints'] 84 | 85 | # The reST default role (used for this markup: `text`) to use for all 86 | # documents. 87 | #default_role = None 88 | 89 | # If true, '()' will be appended to :func: etc. cross-reference text. 90 | #add_function_parentheses = True 91 | 92 | # If true, the current module name will be prepended to all description 93 | # unit titles (such as .. function::). 94 | #add_module_names = True 95 | 96 | # If true, sectionauthor and moduleauthor directives will be shown in the 97 | # output. They are ignored by default. 98 | #show_authors = False 99 | 100 | # The name of the Pygments (syntax highlighting) style to use. 101 | pygments_style = 'sphinx' 102 | 103 | # A list of ignored prefixes for module index sorting. 104 | #modindex_common_prefix = [] 105 | 106 | # If true, keep warnings as "system message" paragraphs in the built documents. 107 | #keep_warnings = False 108 | 109 | # If true, `todo` and `todoList` produce output, else they produce nothing. 110 | todo_include_todos = False 111 | 112 | 113 | # -- Options for HTML output ---------------------------------------------- 114 | 115 | # The theme to use for HTML and HTML Help pages. See the documentation for 116 | # a list of builtin themes. 117 | html_theme = 'cloud' 118 | 119 | # Theme options are theme-specific and customize the look and feel of a theme 120 | # further. For a list of options available for each theme, see the 121 | # documentation. 
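# Editor's note -- an illustrative sketch, not part of the original conf.py:
# the cloud_sptheme module imported above as `csp` is never referenced again.
# If Sphinx could not locate the "cloud" theme on its own, the conventional
# fix (assuming csp.get_theme_dir() exists in the installed version) would be
# to set html_theme_path, which is otherwise left commented out below:
#
# html_theme_path = [csp.get_theme_dir()]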
122 | #html_theme_options = {} 123 | 124 | # Add any paths that contain custom themes here, relative to this directory. 125 | #html_theme_path = [] 126 | 127 | # The name for this set of Sphinx documents. If None, it defaults to 128 | # "<project> v<release> documentation". 129 | html_title = "Computational Statistics in Python" 130 | 131 | # A shorter title for the navigation bar. Default is the same as html_title. 132 | #html_short_title = None 133 | 134 | # The name of an image file (relative to this directory) to place at the top 135 | # of the sidebar. 136 | #html_logo = None 137 | 138 | # The name of an image file (within the static path) to use as favicon of the 139 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 140 | # pixels large. 141 | #html_favicon = None 142 | 143 | # Add any paths that contain custom static files (such as style sheets) here, 144 | # relative to this directory. They are copied after the builtin static files, 145 | # so a file named "default.css" will overwrite the builtin "default.css". 146 | html_static_path = ['_static'] 147 | 148 | # Add any extra paths that contain custom files (such as robots.txt or 149 | # .htaccess) here, relative to this directory. These files are copied 150 | # directly to the root of the documentation. 151 | #html_extra_path = [] 152 | 153 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 154 | # using the given strftime format. 155 | #html_last_updated_fmt = '%b %d, %Y' 156 | 157 | # If true, SmartyPants will be used to convert quotes and dashes to 158 | # typographically correct entities. 159 | #html_use_smartypants = True 160 | 161 | # Custom sidebar templates, maps document names to template names. 162 | #html_sidebars = {} 163 | 164 | # Additional templates that should be rendered to pages, maps page names to 165 | # template names. 166 | #html_additional_pages = {} 167 | 168 | # If false, no module index is generated. 169 | #html_domain_indices = True 170 | 171 | # If false, no index is generated. 172 | #html_use_index = True 173 | 174 | # If true, the index is split into individual pages for each letter. 175 | #html_split_index = False 176 | 177 | # If true, links to the reST sources are added to the pages. 178 | #html_show_sourcelink = True 179 | 180 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 181 | #html_show_sphinx = True 182 | 183 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 184 | #html_show_copyright = True 185 | 186 | # If true, an OpenSearch description file will be output, and all pages will 187 | # contain a <link> tag referring to it. The value of this option must be the 188 | # base URL from which the finished HTML is served. 189 | #html_use_opensearch = '' 190 | 191 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 192 | #html_file_suffix = None 193 | 194 | # Language to be used for generating the HTML full-text search index. 195 | # Sphinx supports the following languages: 196 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 197 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 198 | #html_search_language = 'en' 199 | 200 | # A dictionary with options for the search language support, empty by default. 201 | # Now only 'ja' uses this config value 202 | #html_search_options = {'type': 'default'} 203 | 204 | # The name of a javascript file (relative to the configuration directory) that 205 | # implements a search results scorer. If empty, the default will be used. 
206 | #html_search_scorer = 'scorer.js' 207 | 208 | # Output file base name for HTML help builder. 209 | htmlhelp_basename = 'ComputationalStatisticsinPythondoc' 210 | 211 | # -- Options for LaTeX output --------------------------------------------- 212 | 213 | latex_elements = { 214 | # The paper size ('letterpaper' or 'a4paper'). 215 | #'papersize': 'letterpaper', 216 | 217 | # The font size ('10pt', '11pt' or '12pt'). 218 | #'pointsize': '10pt', 219 | 220 | # Additional stuff for the LaTeX preamble. 221 | #'preamble': '', 222 | 223 | # Latex figure (float) alignment 224 | #'figure_align': 'htbp', 225 | } 226 | 227 | # Grouping the document tree into LaTeX files. List of tuples 228 | # (source start file, target name, title, 229 | # author, documentclass [howto, manual, or own class]). 230 | latex_documents = [ 231 | (master_doc, 'ComputationalStatisticsinPython.tex', 'Computational Statistics in Python', 232 | 'Cliburn Chan, Janice McCarthy', 'manual'), 233 | ] 234 | 235 | # The name of an image file (relative to this directory) to place at the top of 236 | # the title page. 237 | #latex_logo = None 238 | 239 | # For "manual" documents, if this is true, then toplevel headings are parts, 240 | # not chapters. 241 | #latex_use_parts = False 242 | 243 | # If true, show page references after internal links. 244 | #latex_show_pagerefs = False 245 | 246 | # If true, show URL addresses after external links. 247 | #latex_show_urls = False 248 | 249 | # Documents to append as an appendix to all manuals. 250 | #latex_appendices = [] 251 | 252 | # If false, no module index is generated. 253 | #latex_domain_indices = True 254 | 255 | 256 | # -- Options for manual page output --------------------------------------- 257 | 258 | # One entry per manual page. List of tuples 259 | # (source start file, name, description, authors, manual section). 260 | man_pages = [ 261 | (master_doc, 'computationalstatisticsinpython', 'Computational Statistics in Python', 262 | [author], 1) 263 | ] 264 | 265 | # If true, show URL addresses after external links. 266 | #man_show_urls = False 267 | 268 | 269 | # -- Options for Texinfo output ------------------------------------------- 270 | 271 | # Grouping the document tree into Texinfo files. List of tuples 272 | # (source start file, target name, title, author, 273 | # dir menu entry, description, category) 274 | texinfo_documents = [ 275 | (master_doc, 'ComputationalStatisticsinPython', 'Computational Statistics in Python', 276 | author, 'ComputationalStatisticsinPython', 'One line description of project.', 277 | 'Miscellaneous'), 278 | ] 279 | 280 | # Documents to append as an appendix to all manuals. 281 | #texinfo_appendices = [] 282 | 283 | # If false, no module index is generated. 284 | #texinfo_domain_indices = True 285 | 286 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 287 | #texinfo_show_urls = 'footnote' 288 | 289 | # If true, do not generate a @detailmenu in the "Top" node's menu. 290 | #texinfo_no_detailmenu = False 291 | -------------------------------------------------------------------------------- /lectures/data/adult.names.txt: -------------------------------------------------------------------------------- 1 | | This data was extracted from the census bureau database found at 2 | | http://www.census.gov/ftp/pub/DES/www/welcome.html 3 | | Donor: Ronny Kohavi and Barry Becker, 4 | | Data Mining and Visualization 5 | | Silicon Graphics. 6 | | e-mail: ronnyk@sgi.com for questions. 
7 | | Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random). 8 | | 48842 instances, mix of continuous and discrete (train=32561, test=16281) 9 | | 45222 if instances with unknown values are removed (train=30162, test=15060) 10 | | Duplicate or conflicting instances : 6 11 | | Class probabilities for adult.all file 12 | | Probability for the label '>50K' : 23.93% / 24.78% (without unknowns) 13 | | Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns) 14 | | 15 | | Extraction was done by Barry Becker from the 1994 Census database. A set of 16 | | reasonably clean records was extracted using the following conditions: 17 | | ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)) 18 | | 19 | | Prediction task is to determine whether a person makes over 50K 20 | | a year. 21 | | 22 | | First cited in: 23 | | @inproceedings{kohavi-nbtree, 24 | | author={Ron Kohavi}, 25 | | title={Scaling Up the Accuracy of Naive-Bayes Classifiers: a 26 | | Decision-Tree Hybrid}, 27 | | booktitle={Proceedings of the Second International Conference on 28 | | Knowledge Discovery and Data Mining}, 29 | | year = 1996, 30 | | pages={to appear}} 31 | | 32 | | Error Accuracy reported as follows, after removal of unknowns from 33 | | train/test sets): 34 | | C4.5 : 84.46+-0.30 35 | | Naive-Bayes: 83.88+-0.30 36 | | NBTree : 85.90+-0.28 37 | | 38 | | 39 | | Following algorithms were later run with the following error rates, 40 | | all after removal of unknowns and using the original train/test split. 41 | | All these numbers are straight runs using MLC++ with default values. 42 | | 43 | | Algorithm Error 44 | | -- ---------------- ----- 45 | | 1 C4.5 15.54 46 | | 2 C4.5-auto 14.46 47 | | 3 C4.5 rules 14.94 48 | | 4 Voted ID3 (0.6) 15.64 49 | | 5 Voted ID3 (0.8) 16.47 50 | | 6 T2 16.84 51 | | 7 1R 19.54 52 | | 8 NBTree 14.10 53 | | 9 CN2 16.00 54 | | 10 HOODG 14.82 55 | | 11 FSS Naive Bayes 14.05 56 | | 12 IDTM (Decision table) 14.46 57 | | 13 Naive-Bayes 16.12 58 | | 14 Nearest-neighbor (1) 21.42 59 | | 15 Nearest-neighbor (3) 20.35 60 | | 16 OC1 15.04 61 | | 17 Pebls Crashed. Unknown why (bounds WERE increased) 62 | | 63 | | Conversion of original data as follows: 64 | | 1. Discretized agrossincome into two ranges with threshold 50,000. 65 | | 2. Convert U.S. to US to avoid periods. 66 | | 3. Convert Unknown to "?" 67 | | 4. Run MLC++ GenCVFiles to generate data,test. 68 | | 69 | | Description of fnlwgt (final weight) 70 | | 71 | | The weights on the CPS files are controlled to independent estimates of the 72 | | civilian noninstitutional population of the US. These are prepared monthly 73 | | for us by Population Division here at the Census Bureau. We use 3 sets of 74 | | controls. 75 | | These are: 76 | | 1. A single cell estimate of the population 16+ for each state. 77 | | 2. Controls for Hispanic Origin by age and sex. 78 | | 3. Controls by Race, age and sex. 79 | | 80 | | We use all three sets of controls in our weighting program and "rake" through 81 | | them 6 times so that by the end we come back to all the controls we used. 82 | | 83 | | The term estimate refers to population totals derived from CPS by creating 84 | | "weighted tallies" of any specified socio-economic characteristics of the 85 | | population. 86 | | 87 | | People with similar demographic characteristics should have 88 | | similar weights. There is one important caveat to remember 89 | | about this statement. 
That is that since the CPS sample is 90 | | actually a collection of 51 state samples, each with its own 91 | | probability of selection, the statement only applies within 92 | | state. 93 | 94 | 95 | >50K, <=50K. 96 | 97 | age: continuous. 98 | workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. 99 | fnlwgt: continuous. 100 | education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. 101 | education-num: continuous. 102 | marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. 103 | occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. 104 | relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 105 | race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 106 | sex: Female, Male. 107 | capital-gain: continuous. 108 | capital-loss: continuous. 109 | hours-per-week: continuous. 110 | native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. 111 | -------------------------------------------------------------------------------- /lectures/data/sonar.names.txt: -------------------------------------------------------------------------------- 1 | NAME: Sonar, Mines vs. Rocks 2 | 3 | SUMMARY: This is the data set used by Gorman and Sejnowski in their study 4 | of the classification of sonar signals using a neural network [1]. The 5 | task is to train a network to discriminate between sonar signals bounced 6 | off a metal cylinder and those bounced off a roughly cylindrical rock. 7 | 8 | SOURCE: The data set was contributed to the benchmark collection by Terry 9 | Sejnowski, now at the Salk Institute and the University of California at 10 | San Deigo. The data set was developed in collaboration with R. Paul 11 | Gorman of Allied-Signal Aerospace Technology Center. 12 | 13 | MAINTAINER: Scott E. Fahlman 14 | 15 | PROBLEM DESCRIPTION: 16 | 17 | The file "sonar.mines" contains 111 patterns obtained by bouncing sonar 18 | signals off a metal cylinder at various angles and under various 19 | conditions. The file "sonar.rocks" contains 97 patterns obtained from 20 | rocks under similar conditions. The transmitted sonar signal is a 21 | frequency-modulated chirp, rising in frequency. The data set contains 22 | signals obtained from a variety of different aspect angles, spanning 90 23 | degrees for the cylinder and 180 degrees for the rock. 24 | 25 | Each pattern is a set of 60 numbers in the range 0.0 to 1.0. Each number 26 | represents the energy within a particular frequency band, integrated over 27 | a certain period of time. The integration aperture for higher frequencies 28 | occur later in time, since these frequencies are transmitted later during 29 | the chirp. 
30 | 31 | The label associated with each record contains the letter "R" if the object 32 | is a rock and "M" if it is a mine (metal cylinder). The numbers in the 33 | labels are in increasing order of aspect angle, but they do not encode the 34 | angle directly. 35 | 36 | METHODOLOGY: 37 | 38 | This data set can be used in a number of different ways to test learning 39 | speed, quality of ultimate learning, ability to generalize, or combinations 40 | of these factors. 41 | 42 | In [1], Gorman and Sejnowski report two series of experiments: an 43 | "aspect-angle independent" series, in which the whole data set is used 44 | without controlling for aspect angle, and an "aspect-angle dependent" 45 | series in which the training and testing sets were carefully controlled to 46 | ensure that each set contained cases from each aspect angle in 47 | appropriate proportions. 48 | 49 | For the aspect-angle independent experiments the combined set of 208 cases 50 | is divided randomly into 13 disjoint sets with 16 cases in each. For each 51 | experiment, 12 of these sets are used as training data, while the 13th is 52 | reserved for testing. The experiment is repeated 13 times so that every 53 | case appears once as part of a test set. The reported performance is an 54 | average over the entire set of 13 different test sets, each run 10 times. 55 | 56 | It was observed that this random division of the sample set led to rather 57 | uneven performance. A few of the splits gave poor results, presumably 58 | because the test set contains some samples from aspect angles that are 59 | under-represented in the corresponding training set. This motivated Gorman 60 | and Sejnowski to devise a different set of experiments in which an attempt 61 | was made to balance the training and test sets so that each would have a 62 | representative number of samples from all aspect angles. Since detailed 63 | aspect angle information was not present in the data base of samples, the 64 | 208 samples were first divided into clusters, using a 60-dimensional 65 | Euclidian metric; each of these clusters was then divided between the 66 | 104-member training set and the 104-member test set. 67 | 68 | The actual training and testing samples used for the "aspect angle 69 | dependent" experiments are marked in the data files. The reported 70 | performance is an average over 10 runs with this single division of the 71 | data set. 72 | 73 | A standard back-propagation network was used for all experiments. The 74 | network had 60 inputs and 2 output units, one indicating a cylinder and the 75 | other a rock. Experiments were run with no hidden units (direct 76 | connections from each input to each output) and with a single hidden layer 77 | with 2, 3, 6, 12, or 24 units. Each network was trained by 300 epochs over 78 | the entire training set. 79 | 80 | The weight-update formulas used in this study were slightly different from 81 | the standard form. A learning rate of 2.0 and momentum of 0.0 was used. 82 | Errors less than 0.2 were treated as zero. Initial weights were uniform 83 | random values in the range -0.3 to +0.3. 84 | 85 | RESULTS: 86 | 87 | For the angle independent experiments, Gorman and Sejnowski report the 88 | following results for networks with different numbers of hidden units: 89 | 90 | Hidden % Right on Std. % Right on Std. 91 | Units Training set Dev. Test Set Dev. 
92 | ------ ------------ ---- ---------- ---- 93 | 0 89.4 2.1 77.1 8.3 94 | 2 96.5 0.7 81.9 6.2 95 | 3 98.8 0.4 82.0 7.3 96 | 6 99.7 0.2 83.5 5.6 97 | 12 99.8 0.1 84.7 5.7 98 | 24 99.8 0.1 84.5 5.7 99 | 100 | For the angle-dependent experiments Gorman and Sejnowski report the 101 | following results: 102 | 103 | Hidden % Right on Std. % Right on Std. 104 | Units Training set Dev. Test Set Dev. 105 | ------ ------------ ---- ---------- ---- 106 | 0 79.3 3.4 73.1 4.8 107 | 2 96.2 2.2 85.7 6.3 108 | 3 98.1 1.5 87.6 3.0 109 | 6 99.4 0.9 89.3 2.4 110 | 12 99.8 0.6 90.4 1.8 111 | 24 100.0 0.0 89.2 1.4 112 | 113 | Not surprisingly, the network's performance on the test set was somewhat 114 | better when the aspect angles in the training and test sets were balanced. 115 | 116 | Gorman and Sejnowski further report that a nearest neighbor classifier on 117 | the same data gave an 82.7% probability of correct classification. 118 | 119 | Three trained human subjects were each tested on 100 signals, chosen at 120 | random from the set of 208 returns used to create this data set. Their 121 | responses ranged between 88% and 97% correct. However, they may have been 122 | using information from the raw sonar signal that is not preserved in the 123 | processed data sets presented here. 124 | 125 | REFERENCES: 126 | 127 | 1. Gorman, R. P., and Sejnowski, T. J. (1988). "Analysis of Hidden Units 128 | in a Layered Network Trained to Classify Sonar Targets" in Neural Networks, 129 | Vol. 1, pp. 75-89. 130 | -------------------------------------------------------------------------------- /lectures/em.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/em.png -------------------------------------------------------------------------------- /lectures/index.rst: -------------------------------------------------------------------------------- 1 | .. Computational Statistics in Python documentation master file, created by 2 | sphinx-quickstart on Thu Jan 14 10:45:35 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Computational Statistics in Python 7 | ============================================================== 8 | 9 | `Notebooks and problem sets from GitHub repository <https://github.com/cliburn/sta-663-2016>`_ 10 | 11 | Topics: 12 | -------- 13 | 14 | .. 
toctree:: 15 | :maxdepth: 2 16 | 17 | 01_Introduction_To_Python 18 | 02A_Functions 19 | 02B_Strings 20 | 02C_IO 21 | 02D_Classes 22 | 03A_Numbers 23 | 03B_Graphics 24 | 04A_Data 25 | 04B_SQL 26 | 05_Machine_Learning 27 | 06_LinearAlgebra1 28 | 07_LinearAlgebra2 29 | 08_LinearAlgebraExamples 30 | 09_PCA 31 | 10_SymbolicAlgebra 32 | 11_OptimizationOneDimension 33 | 12_MultivariateOptimizationAlgorithms 34 | 13_Optimization 35 | 14_ExpectationMaximization 36 | 15A_RandomNumbers 37 | 15B_ResamplingAndSimulation 38 | 15C_MonteCarloIntegration 39 | 16A_MCMC 40 | 16B_AuxiliaryVariableMCMC 41 | 16C_PyMC3 42 | 16D_PyStan 43 | 17A_C_Crash_Course 44 | 17B_C_InOneLecture 45 | 17C_C++_Primer_Solutions 46 | 17D_Review_C_C++ 47 | 18A_CodeOptimization 48 | 18B_Foreing_Language_Interface 49 | 18C_Numba 50 | 18D_Cython 51 | 18E_Benchmarks 52 | 18F_Optimization_Bakeoff 53 | 19A_Parallel_Programming 54 | 19B_Threads_Processses_Concurrency 55 | 19C_IPyParallel 56 | 20A_Intermediate_Sized_Data 57 | 20B_Big_Data_Structures 58 | 21A_Introduction_To_Spark 59 | 21B_Efficiency_In_Spark 60 | 21C_Spark_SQL 61 | 21D_Spark_MLib 62 | 63 | Setup 64 | ----------- 65 | 66 | - :doc:`Local_Installation` 67 | - :doc:`Customizing_Jupyter` 68 | 69 | Homework 70 | ---------- 71 | 72 | - :doc:`homework/Homework01` 73 | - :doc:`homework/Homework01_Solutions` 74 | - :doc:`homework/Homework02` 75 | - :doc:`homework/Homework02_Solutions` 76 | - :doc:`homework/Homework03` 77 | - :doc:`homework/Homework03_Solutions` 78 | - :doc:`homework/Homework04` 79 | - :doc:`homework/Homework04_Solutions` 80 | - :doc:`homework/Homework05` 81 | - :doc:`homework/Homework05_Solutions` 82 | - :doc:`homework/Homework06` 83 | - :doc:`homework/Homework06_Solutions` 84 | - :doc:`homework/Homework07` 85 | - :doc:`homework/Homework07_Solutions` 86 | - :doc:`homework/Homework08` 87 | - :doc:`homework/Homework08_Solutions` 88 | 89 | 90 | Indices and tables 91 | ================== 92 | 93 | * :ref:`genindex` 94 | * :ref:`modindex` 95 | * :ref:`search` 96 | -------------------------------------------------------------------------------- /lectures/jensen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/jensen.png -------------------------------------------------------------------------------- /lectures/julia_benchmarks.pic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/julia_benchmarks.pic -------------------------------------------------------------------------------- /lectures/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. 
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 
128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\ComputationalStatisticsinPython.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\ComputationalStatisticsinPython.qhc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt.
244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /lectures/mcmc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/mcmc.png -------------------------------------------------------------------------------- /lectures/my_module.py: -------------------------------------------------------------------------------- 1 | 2 | PI = 3.14 3 | 4 | def my_f(x): 5 | return PI*x -------------------------------------------------------------------------------- /lectures/spectral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/spectral.png -------------------------------------------------------------------------------- /lectures/sphinx-readme: -------------------------------------------------------------------------------- 1 | To build sphinx docs: 2 | ``` 3 | pip install nbsphinx 4 | pip install cloud_sptheme 5 | cd lectures 6 | make html 7 | make latexpdf 8 | ``` 9 | 10 | For HTML, open _build/html/index.html 11 | For PDF, open _build/latex/ComputationalStatisticsinPython.pdf 12 | -------------------------------------------------------------------------------- /misc/Customizing_Jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Customizing the Jupyter notebook\n", 8 | "====\n", 9 | "\n", 10 | "These are strictly optional." 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "1. Creating a startup script\n", 18 | "----" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "It is convenient to have a bunch of default imports and set up inline plotting automatically. Here's how to do it." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Create an ipython profile" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "! ipython profile create" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Edit the next cell to set your defaults, then execute it." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "%%file ~/.ipython/profile_default/startup/start.ipy\n", 62 | "\n", 63 | "import os\n", 64 | "import sys\n", 65 | "import glob\n", 66 | "import operator as op\n", 67 | "import itertools as it\n", 68 | "from functools import reduce, partial\n", 69 | "import numpy as np\n", 70 | "import pandas as pd\n", 71 | "from pandas import DataFrame, Series\n", 72 | "import matplotlib.pyplot as plt\n", 73 | "import seaborn as sns\n", 74 | "sns.set_context(\"notebook\", font_scale=1.5)\n", 75 | "%matplotlib inline" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Stop and restart your Jupyter kernel" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "2. Change keybindings to emacs \n", 90 | "----" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "! pip install jupyter-emacskeys" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "3. Install slide mode (RISE)\n", 109 | "----" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "%%bash\n", 121 | "\n", 122 | "git clone https://github.com/damianavila/RISE.git\n", 123 | "cd RISE\n", 124 | "python setup.py install" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "4. Install Calico extensions (see videos for what they do)\n", 132 | "----\n", 133 | "\n", 134 | "This will not work in the Docker container as you do not have the appropriate permissions. However, you can do this for your local installation if you wish." 
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "import IPython" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Spell-check" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": true 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "IPython.display.YouTubeVideo(\"Km3AtRynWFQ\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Document tools" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "IPython.display.YouTubeVideo(\"YbM8rrj-Bms\")" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Cell tools" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "IPython.display.YouTubeVideo(\"WwoTzvOkEJQ\")" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Execute the next 2 cells to install spell-check, document and cell tools" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "%%bash\n", 218 | "\n", 219 | "ipython install-nbextension https://bitbucket.org/ipre/calico/downloads/calico-spell-check-1.0.zip\n", 220 | "ipython install-nbextension https://bitbucket.org/ipre/calico/downloads/calico-document-tools-1.0.zip\n", 221 | "ipython install-nbextension https://bitbucket.org/ipre/calico/downloads/calico-cell-tools-1.0.zip" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "%%file ~/.jupyter/custom/custom.js\n", 233 | "\n", 234 | "require(['base/js/utils'],\n", 235 | " function(utils) {\n", 236 | " utils.load_extensions('calico-spell-check',\n", 237 | " 'calico-document-tools',\n", 238 | " 'calico-cell-tools');\n", 239 | " });" 240 | ] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.5.1" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 0 264 | } 265 | -------------------------------------------------------------------------------- /misc/Local_Installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Setting up a local install of Jupyter with multiple kernels (Python 3.5, Python 2.7, R, Julia)\n", 8 | "====\n", 9 | "\n", 10 | "The only installation you are recommended to do is to install Anaconda 3.5, so that you have a backup when the OIT version is flaky.
The other kernels and the Docker version are **not required** and you should only install them if you are comfortable with command line installs. Even the Anaconda 3.5 installation is optional if the OIT version works well for you.\n", 11 | "\n", 12 | "Note: I have only done this on OSX 10.11.2 (El Capitan) with XCode and command line compilers installed.\n", 13 | "\n", 14 | "To install Anaconda for Python 3.5\n", 15 | "---- \n", 16 | "\n", 17 | "Download and install Anaconda Python 3.5 from https://www.continuum.io/downloads\n", 18 | "\n", 19 | "Open a terminal\n", 20 | "```bash\n", 21 | "conda update conda\n", 22 | "conda update anaconda\n", 23 | "```\n", 24 | "\n", 25 | "(OPTIONAL) To install Python 2.7 as well\n", 26 | "----\n", 27 | "\n", 28 | "Open a terminal\n", 29 | "```bash\n", 30 | "conda create -n py27 python=2.7 anaconda\n", 31 | "source activate py27\n", 32 | "ipython kernel install\n", 33 | "source deactivate\n", 34 | "```\n", 35 | "\n", 36 | "(OPTIONAL) To install R\n", 37 | "----\n", 38 | "\n", 39 | "- If you want `conda` to manage your R packages\n", 40 | "\n", 41 | "```bash\n", 42 | "conda install -y -c r r-irkernel r-recommended r-essentials\n", 43 | "```\n", 44 | "\n", 45 | "> Note: The bug that required this appears to have been fixed\n", 46 | "```\n", 47 | "wget https://anaconda.org/r/ncurses/5.9/download/osx-64/ncurses-5.9-1.tar.bz2 \\\n", 48 | " https://anaconda.org/r/nlopt/2.4.2/download/osx-64/nlopt-2.4.2-1.tar.bz2 \\\n", 49 | " && conda install --yes ncurses-5.9-1.tar.bz2 nlopt-2.4.2-1.tar.bz2\n", 50 | "```\n", 51 | "\n", 52 | "- If you have an existing R installation that you want to use\n", 53 | "\n", 54 | "Start R\n", 55 | "```R\n", 56 | "install.packages(c('rzmq','repr','IRkernel','IRdisplay'),\n", 57 | " repos = c('http://irkernel.github.io/', getOption('repos')))\n", 58 | "IRkernel::installspec()\n", 59 | "```\n", 60 | "\n", 61 | "(OPTIONAL) To install Julia\n", 62 | "----\n", 63 | "\n", 64 | "Download and install Julia from http://julialang.org/downloads/\n", 65 | "\n", 66 | "Start Julia\n", 67 | "```julia\n", 68 | "Pkg.add(\"IJulia\")\n", 69 | "Pkg.build(\"IJulia\")\n", 70 | "```\n", 71 | "\n", 72 | "(OPTIONAL) To install `pyspark`\n", 73 | "----\n", 74 | "\n", 75 | "Open a terminal\n", 76 | "```bash\n", 77 | "conda install -y -c anaconda-cluster spark\n", 78 | "```\n", 79 | "\n", 80 | "Check\n", 81 | "----\n", 82 | "Open terminal\n", 83 | "```\n", 84 | "jupyter notebook\n", 85 | "```\n", 86 | "\n", 87 | "See if the installed kernels are found in the drop-down menu.\n",
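"\n",
"As an extra check (a minimal sketch, assuming the `jupyter` executable from the Anaconda install is on your PATH), you can also list the registered kernels from a terminal:\n",
"```bash\n",
"# Show the name and install location of every kernelspec Jupyter knows about\n",
"jupyter kernelspec list\n",
"```\n",
"Each kernel installed above should appear in this list as well as in the drop-down menu."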
88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "(OPTIONAL) Installing Spark via Docker\n", 95 | "----\n", 96 | "\n", 97 | "- Install Docker (https://docs.docker.com/engine/installation/)\n", 98 | "- Launch the Docker Quickstart Terminal\n", 99 | "\n", 100 | "Be patient - this can take a while the first time you do it.\n", 101 | "\n", 102 | "When done, it should show something like this\n", 103 | "```\n", 104 | " ## .\n", 105 | " ## ## ## ==\n", 106 | " ## ## ## ## ## ===\n", 107 | " /\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\\___/ ===\n", 108 | " ~~~ {~~ ~~~~ ~~~ ~~~~ ~~~ ~ / ===- ~~~\n", 109 | " \\______ o __/\n", 110 | " \\ \\ __/\n", 111 | " \\____\\_______/\n", 112 | "\n", 113 | "\n", 114 | "docker is configured to use the default machine with IP 192.168.99.100\n", 115 | "For help getting started, check out the docs at https://docs.docker.com\n", 116 | "```\n", 117 | "\n", 118 | "**Note the IP address given - you will need this to access the notebook.**\n", 119 | "\n", 120 | "In the Docker terminal\n", 121 | "```\n", 122 | "docker run -d -p 8888:8888 jupyter/all-spark-notebook\n", 123 | "```\n", 124 | "\n", 125 | "Check by typing in the Docker terminal\n", 126 | "```\n", 127 | "docker ps\n", 128 | "```\n", 129 | "\n", 130 | "Be patient - this can take a while the first time you do it.\n", 131 | "\n", 132 | "It should show something like\n", 133 | "```bash\n", 134 | "CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES\n", 135 | "965a6a80bf44 jupyter/all-spark-notebook \"tini -- start-notebo\" 4 minutes ago Up 4 minutes 0.0.0.0:8888->8888/tcp big_kilby\n", 136 | "```\n", 137 | "\n", 138 | "**Note the machine name (mine is big_kilby, yours will likely be different).**\n", 139 | "\n", 140 | "Open your browser at the following URL http://192.168.99.100:8888 (Use the IP given above)\n", 141 | "\n",
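"If you lose track of the IP, you can print it again from the Docker terminal (a minimal sketch; `default` is the machine name Docker Toolbox typically creates, so substitute yours if it differs):\n",
"```\n",
"docker-machine ip default\n",
"```\n",
"\n",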
142 | "This should bring you to a Jupyter notebook. Open a Python 3 notebook from the drop-down menu and test:\n", 143 | "\n", 144 | "```python\n", 145 | "import pyspark\n", 146 | "sc = pyspark.SparkContext('local[*]')\n", 147 | "\n", 148 | "# do something to prove it works\n", 149 | "rdd = sc.parallelize(range(1000))\n", 150 | "rdd.takeSample(False, 5)\n", 151 | "```\n", 152 | "\n", 153 | "If successful, you should get a list of 5 integers after a short delay.\n", 154 | "\n", 155 | "Save and exit the notebook.\n", 156 | "\n", 157 | "Clean up in the Docker terminal\n", 158 | "```\n", 159 | "docker stop big_kilby\n", 160 | "exit\n", 161 | "```\n", 162 | "\n", 163 | "Use the machine name found with `docker ps` in place of `big_kilby`.\n", 164 | "\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.5.1" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 0 198 | } 199 | -------------------------------------------------------------------------------- /misc/Recommended_Books.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Further Reading\n", 8 | "====\n", 9 | "\n", 10 | "Much of the material for learning Python for statistics and data science can be found online, but some of us still enjoy reading books ... For those of us diehard bibliophiles, these are books that I referred to for the course and enjoyed reading. They are listed in roughly the same order as the course lectures.\n", 11 | "\n", 12 | "- Python in a Nutshell by Steve Holden, Anna Ravenscroft and Alex Martelli (3rd edition)\n", 13 | "\n", 14 | "> A really nice reference for Python 3. \n", 15 | "\n", 16 | "- Python Cookbook: Recipes for Mastering Python 3 by David Beazley, Brian K. Jones (3rd Edition)\n", 17 | "\n", 18 | "> When you are stuck on a specific task and Stack Overflow is not working for you.\n", 19 | "\n", 20 | "- Learning IPython for Interactive Computing and Data Visualization by Cyrille Rossant (2nd edition)\n", 21 | "\n", 22 | "> If you want to master Jupyter.\n", 23 | "\n", 24 | "- Fluent Python by Luciano Ramalho\n", 25 | "\n", 26 | "> Awesome resource for learning how to code in idiomatic Python like a Pythonista.\n", 27 | "\n", 28 | "- Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython by Wes McKinney\n", 29 | "\n", 30 | "> Pandas by the developer.
A little dated but you can supplement with the online material.\n", 31 | "\n", 32 | "- Python Data Science Handbook by Jake VanderPlas\n", 33 | "\n", 34 | "> Still a work in progress, but it looks like the single best book for this course.\n", 35 | "\n", 36 | "- Bayesian Methods for Hackers: Probabilistic Programming and Bayesian Inference by Cameron Davidson-Pilon\n", 37 | "\n", 38 | "> Examples of how to use PyMC3.\n", 39 | "\n", 40 | "- High Performance Python by Micha Gorelick and Ian Ozsvald\n", 41 | "\n", 42 | "> Make your Python code faster.\n", 43 | "\n", 44 | "- Cython: A Guide for Python Programmers by Kurt W Smith\n", 45 | "\n", 46 | "> If you want to master Cython, this book is your guide.\n", 47 | "\n", 48 | "- 21st Century C: C Tips from the New School by Ben Klemens (2nd edition)\n", 49 | "\n", 50 | "> Modern C for statisticians.\n", 51 | "\n", 52 | "- Discovering Modern C++: An Intensive Course for Scientists, Engineers, and Programmers by Peter Gottschling\n", 53 | "\n", 54 | "> Awesome introduction to modern C++ (C++11 and C++14) for numerical work. Possibly too dense if you don't already have some familiarity with C/C++. \n", 55 | "\n", 56 | "- Learning Spark: Lightning-Fast Big Data Analysis by Holden Karau, Andy Konwinski, Patrick Wendell, Matei Zaharia (unfortunately Spark books tend to be outdated the moment they are printed - this edition covers Spark 1.3 and we are already at Spark 1.6)\n", 57 | "\n", 58 | "> Introduction to Spark with Java, Scala and Python examples.\n", 59 | "\n", 60 | "- Data Algorithms: Recipes for Scaling Up with Hadoop and Spark by Mahmoud Parsian\n", 61 | "\n", 62 | "> Sort of a cookbook with examples in Hadoop and Spark. Emphasis on biomedical applications. \n", 63 | "\n", 64 | "- Data Visualization with Python and JavaScript: Scrape, Clean, Explore & Transform Your Data by Kyran Dale\n", 65 | "\n", 66 | "> Still in early stages, but looks very promising. If I ever include lectures on data visualization, I suspect this book will be my reference." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [] 77 | } 78 | ], 79 | "metadata": { 80 | "kernelspec": { 81 | "display_name": "Python 3", 82 | "language": "python", 83 | "name": "python3" 84 | }, 85 | "language_info": { 86 | "codemirror_mode": { 87 | "name": "ipython", 88 | "version": 3 89 | }, 90 | "file_extension": ".py", 91 | "mimetype": "text/x-python", 92 | "name": "python", 93 | "nbconvert_exporter": "python", 94 | "pygments_lexer": "ipython3", 95 | "version": "3.5.1" 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 0 100 | } 101 | -------------------------------------------------------------------------------- /misc/Spark_Test_Drive.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pyspark\n", 12 | "sc = pyspark.SparkContext('local[*]')" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "4" 26 | ] 27 | }, 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "sc.defaultParallelism" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "[452, 10, 725, 642, 670]" 48 | ] 49 | }, 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "rdd = sc.parallelize(range(1000))\n", 57 | "rdd.takeSample(False, 5)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "squares = rdd.map(lambda x: x**2)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 7, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "332833500" 82 | ] 83 | }, 84 | "execution_count": 7, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "squares.reduce(lambda x, y: x + y)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 11, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "332833500" 104 | ] 105 | }, 106 | "execution_count": 11, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "# check\n", 113 | "sum(n**2 for n in range(1000))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | } 125 | ], 126 | "metadata": { 127 | "kernelspec": { 128 | "display_name": "Python 3", 129 | "language": "python", 130 | "name": "python3" 131 | }, 132 | "language_info": { 133 | "codemirror_mode": { 134 | "name": "ipython", 135 | "version": 3 136 | }, 137 | "file_extension": ".py", 138 | "mimetype": "text/x-python", 139 | "name": "python", 140 | "nbconvert_exporter": "python", 141 | "pygments_lexer": "ipython3", 142 | "version": "3.5.1" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 0 
147 | } 148 | -------------------------------------------------------------------------------- /misc/TopicCoverageForMidterm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Basic Python\n", 8 | "----\n", 9 | "\n", 10 | "- Appropriate use of data structures\n", 11 | "- Write a function\n", 12 | "- Higher order functions\n", 13 | "- Equivalents of for-loops\n", 14 | "- Basic string processing\n", 15 | "- Importing packages\n", 16 | "- Read and write to file\n", 17 | "\n", 18 | "Scientific computation\n", 19 | "----\n", 20 | "\n", 21 | "- Symbolic algebra with `sympy` e.g. perform integration\n", 22 | "- Working with `numpy` vectors and arrays\n", 23 | "- Using `scipy.linalg` routines\n", 24 | "- Graphing in `matplotlib` and `seaborn`\n", 25 | "- Split-apply-combine with `pandas` Series and DataFrames\n", 26 | "- Finding roots and minima with `scipy.optimize`\n", 27 | "- Pre-processing, unsupervised and supervised learning with `sklearn`\n", 28 | "\n", 29 | "Linear Algebra\n", 30 | "----\n", 31 | "\n", 32 | "- Linear combinations and independence\n", 33 | "- Change of basis and similar matrices\n", 34 | "- $A = LU$ (Gaussian elimination)\n", 35 | "- $A = LL^T$ (Symmetry)\n", 36 | "- $A = QR$ (Orthogonality)\n", 37 | "- $A = S\\Lambda S^{-1}$ (Eigenvalues and eigenvectors)\n", 38 | "- $A = Q\\Lambda Q^T$ (Diagonalization of symmetric matrix)\n", 39 | "- $A = U\\Sigma V^T$ (Singular values)\n", 40 | "- Positive definite matrices\n", 41 | "- $A^TA\\hat{x} = A^Tb$ (Projection and least squares)\n", 42 | "\n", 43 | "Calculus\n", 44 | "----\n", 45 | "\n", 46 | "- Evaluating critical points\n", 47 | "- Taylor series\n", 48 | "- Gradient, Jacobian and Hessian\n", 49 | "- Calculating conjugate gradients\n", 50 | "- Newton method (univariate and multivariate)\n", 51 | "- Gradient descent (batch and stochastic)\n", 52 | "- Lagrange multipliers for constrained problems\n", 53 | "\n", 54 | "EM\n", 55 | "----\n", 56 | "\n", 57 | "- Likelihood and log-likelihood\n", 58 | "- Jensen's inequality\n", 59 | "- Basic concept of the EM algorithm\n", 60 | "- K-means algorithm\n", 61 | "\n", 62 | "Random numbers\n", 63 | "----\n", 64 | "\n", 65 | "- Inverse transform method\n", 66 | "- Familiarity with probability distributions from `numpy.random` and `scipy.stats`\n", 67 | "- How to calculate quantiles, PDF and CDF and other statistics of a random variable\n", 68 | "\n", 69 | "Resampling and Monte Carlo simulations\n", 70 | "----\n", 71 | "\n", 72 | "- Bootstrap\n", 73 | "- Leave-one-out calculations (including LOOCV)\n", 74 | "- Empirical CDF and kernel density estimation\n", 75 | "\n", 76 | "Numerical integration\n", 77 | "----\n", 78 | "\n", 79 | "- Quadrature with `scipy.integrate` methods\n", 80 | "- Why Monte Carlo integration works\n", 81 | "- Change of variables\n", 82 | "- Importance sampler\n", 83 | "- Other Monte Carlo swindles\n", 84 | "\n", 85 | "Markov Chain Monte Carlo\n", 86 | "----\n", 87 | "\n", 88 | "- Working with Markov matrices (linear algebra revisited)\n", 89 | "- Conditions for convergence to equilibrium distribution\n", 90 | "- Detailed balance\n", 91 | "- How they work\n", 92 | " - Random walk sampler\n", 93 | " - Gibbs sampler\n", 94 | " - Slice sampler\n", 95 | " - HMC sampler\n", 96 | "- Use of PyMC3 and PyStan to fit simple hierarchical models\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 |
"collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.5.1" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 0 130 | } 131 | -------------------------------------------------------------------------------- /misc/old-exams/Midterm-Sample-Revised.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Instructions\n", 8 | "----\n", 9 | "\n", 10 | "This is a \"closed book\" examination - in particular, you are not to use any resources outside of this notebook (except possibly pen and paper). You may consult help from within the notebook using ? but not any online references. You should turn wireless off or set your laptop in \"Airplane\" mode prior to taking the exam. \n", 11 | "\n", 12 | "You have 1 hour and 45 minutes to complete the exam." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import sys\n", 25 | "import glob\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "%matplotlib inline\n", 30 | "plt.style.use('ggplot')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "np.set_printoptions(formatter={'float': '{: 0.3f}'.format})" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "%load_ext rpy2.ipython" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "**Question 1 (10 points)**. \n", 60 | "\n", 61 | "Euclid's algorithm for finding the greatest common divisor of two numbers is\n", 62 | "\n", 63 | "```python\n", 64 | "gcd(a, 0) = a\n", 65 | "gcd(a, b) = gcd(b, a modulo b)\n", 66 | "```\n", 67 | "\n", 68 | "1. Write a function to find the greatest common divisor in Python (4 poinst)\n", 69 | "2. What is the greatest common divisor of 17384 and 1928? (1 point)\n", 70 | "3. Write a function to calculate the least common multiple (4 points)\n", 71 | "4. What is the least common multiple of 17384 and 1928? 
(1 point)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": false 88 | }, 89 | "outputs": [], 90 | "source": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "**Question 2 (10 points)**. \n", 124 | "\n", 125 | "Consider the following matrix $A$ with dimensions (4,6), to be interpreted as 4 rows of the measurements of 6 features.\n", 126 | "```python\n", 127 | "np.array([[5, 5, 2, 6, 2, 0],\n", 128 | " [8, 6, 7, 8, 9, 7],\n", 129 | " [9, 5, 0, 4, 6, 8],\n", 130 | " [8, 7, 9, 3, 6, 1]])\n", 131 | "```\n", 132 | "\n", 133 | "1. Add 1 to the first row, 2 to the second row, 3 to the third row and 4 to the fourth row using a vector `v = np.array([1,2,3,4])` and broadcasting. (2 points)\n", 134 | "2. Normalize A so that its row means are all 0 and call it A1. (2 points)\n", 135 | "3. What are the singular values of A1? (2 points)\n", 136 | "4. What are the eigenvalues of the covariance matrix of A1? (2 points)\n", 137 | "5. Find the least squares solution vector $x$ if $Ax = y$ where `y = np.array([1,2,3,4]).T` (2 points)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false 163 | }, 164 | "outputs": [], 165 | "source": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [], 183 | "source": [] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "**Question 3 (10 points)**.\n", 190 | "\n", 191 | "1. Prove that $e^{x^2 + y^2}$ is a convex function. (5 points)\n", 192 | "2. Using `scipy.optimize`, find the values of $x$ and $y$ that minimize $e^{x^2 + y^2}$ in the unconstrained case and in the presence of the constraint that $x + y = 3$. 
Use (1,1) as a starting guess (5 points)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "**Question 4 (10 points)**.\n", 218 | "\n", 219 | "A milkmaid is at point A and needs to get to point B. However, she also needs to fill a pail of water from the river en route from A to B. The equation of the river's path is shown in the figure below. What is the minimum distance she has to travel to do this?\n", 220 | "\n", 221 | "1. Solve using `scipy.optimize` and constrained minimization.\n", 222 | "2. Solve without using `scipy.optimize`. Hint: Use Lagrange multipliers.\n", 223 | "\n", 224 | "![Milkmaid problem](milkmaid.png)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [], 234 | "source": [] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "**Question 5 (10 points)**. \n", 250 | "\n", 251 | "Find the minimum of the following quadratic function on $\\mathbb{R}^4$ \n", 252 | "\n", 253 | "$$f(x) = x^TAx +b^Tx +c$$\n", 254 | "where\n", 255 | "$$A = \\left(\\begin{matrix}13&5&0&0\\\\5&7&0&0\\\\0&0&20&-7\\\\0&0&-7&12\\end{matrix}\\right), b = \\left(\\begin{matrix}1\\\\1\\\\1\\\\1\\end{matrix}\\right) \\textrm {and } c = 2$$\n", 256 | "\n", 257 | "and $x$ is a column vector.\n", 258 | "\n", 259 | "a. Using scipy.optimize (4 points)\n", 260 | "\n", 261 | "b. Using a matrix decomposition method (library functions - no need to code your own). Note: for full credit you should exploit matrix structure. (4 points)\n", 262 | "\n", 263 | "c. Find the minimum under the constraint $||x||^2 = 1$ (i.e. on the unit sphere in $\\mathbb{R}^4$). (2 points)\n", 264 | "\n", 265 | "**Note: Do not be overly concerned if your values for $x$ at the minimum do not match exactly.**\n" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [], 284 | "source": [] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [], 293 | "source": [] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "**Question 6 (10 points)**.\n", 300 | "\n", 301 | "Given the set of vectors\n", 302 | "\n", 303 | "```\n", 304 | "v1 = np.array([1,2,3])\n", 305 | "v2 = np.array([2,4,7])\n", 306 | "v3 = np.array([1,0,1])\n", 307 | "```\n", 308 | "\n", 309 | "1. Calculate the pairwise Euclidean distance matrix using nested for loops. (2 points)\n", 310 | "2. Calculate the pairwise Euclidean distance matrix using numpy broadcasting. (3 points)\n", 311 |
"3. Find an orthogonal basis for the space spanned by these vectors without using any functions from `numpy.linalg` or `scipy.linalg` (5 points)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [], 321 | "source": [] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [], 330 | "source": [] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [] 340 | } 341 | ], 342 | "metadata": { 343 | "kernelspec": { 344 | "display_name": "Python 2", 345 | "language": "python", 346 | "name": "python2" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 2 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython2", 358 | "version": "2.7.11" 359 | } 360 | }, 361 | "nbformat": 4, 362 | "nbformat_minor": 0 363 | } 364 | -------------------------------------------------------------------------------- /misc/old-exams/milkmaid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/misc/old-exams/milkmaid.png -------------------------------------------------------------------------------- /projects/FinalProject.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Instructions for Final Project (25%)\n", 8 | "----\n", 9 | "\n", 10 | "**Note**: As usual, this work should be your own original effort. Please do not just copy and paste someone else's code from the web - that would be a serious violation of academic integrity. However, if there is an existing implementation, it would make sense to do a comparative analysis for accuracy, features and execution speed.\n", 11 | "\n", 12 | "**Important**: Papers can be found in Sakai/Resources/FinalProjectPapers. If you intend to use a paper not in the folder, please remember that it must be approved by Cliburn/Janice first!\n", 13 | "\n", 14 | "- Deadline for submission of progress report 1: 6 April 2016 (10 points)\n", 15 | " - Choice of paper\n", 16 | " - Set up Github repository for final project\n", 17 | " - Preliminary section and subsection headings\n", 18 | " - Abstract of project (150-250 words)\n", 19 | "- Deadline for submission of progress report 2: 22 April 2016 (10 points)\n", 20 | " - Code written and working on simulated data set\n", 21 | " - Background written\n", 22 | " - Flesh out sections and subsections as appropriate\n", 23 | "- Deadline for submission of final report: 30 April 2016 (80 points)\n", 24 | " - See rubric at bottom of notebook\n", 25 | "\n", 26 | "For the final project, you will need to implement a \"new\" statistical algorithm in Python from the research literature and write a report on it (in a Jupyter notebook, of course!). As a guide, the report should include most or all of the following sections, although it is up to you which aspects you choose to focus on (e.g.
if the algorithm is highly complex, just developing and testing the Python code might be the main emphasis; alternatively, if the algorithm is quite straightforward, the emphasis can be on making a Spark version and benchmarking it):\n", 27 | "\n", 28 | "Note that for this project, you will be working with IPython notebooks, python scripts (e.g. test code), possibly code in other languages (e.g. Cython, C), data files and a Makefile.\n", 29 | "\n", 30 | "### Background\n", 31 | "\n", 32 | "State the research paper you are using. Describe the concept of the algorithm and why it is interesting and/or useful. If appropriate, describe the mathematical basis of the algorithm. Some potential topics for the background include:\n", 33 | "\n", 34 | "- What problem does it address? \n", 35 | "- What are known and possible applications of the algorithm? \n", 36 | "- What are its advantages and disadvantages relative to other algorithms?\n", 37 | "- How will you use it in your research?\n", 38 | "\n", 39 | "### Implementation\n", 40 | "\n", 41 | "Implement the algorithm as a Python function or family of functions as clearly as possible. Can your code be understood by someone with a similar quantitative background as yours? Does it follow good coding conventions?\n", 42 | "\n", 43 | "### Testing\n", 44 | "\n", 45 | "Write unit tests for your functions and make sure they pass. These should include both common and edge cases (e.g. test at boundaries - what happens if the input is an empty vector etc.).\n", 46 | "\n", 47 | "### Optimization\n", 48 | "\n", 49 | "Profile the performance of the algorithm and identify bottlenecks. Eliminate these bottlenecks where possible using vectorization, Cython and/or just-in-time compiling. Document what you did and the resulting performance improvement.\n", 50 | "\n", 51 | "### High performance computing\n", 52 | "\n", 53 | "If you can identify tasks that can be performed in parallel, parallelize the algorithm. Depending on the algorithm, this may involve the use of parallel or distributed computing (or some combination of them). If the algorithm cannot be parallelized, more weight will be given to how effectively you perform optimization using Cython or just-in-time compilation. Document what you did and the resulting performance improvement.\n", 54 | "\n", 55 | "### Application and comparison\n", 56 | "\n", 57 | "Apply the algorithm to a \"real\" problem (this can be simulated data) to show how it works and to compare it with at least one other algorithm that addresses the same problem from some available Python or R package. \n", 58 | "\n", 59 | "### Reproducible analysis\n", 60 | "\n", 61 | "You are expected to perform the entire development process using a new GitHub repository with regular commits. The GitHub repository should contain a Makefile that can run all tests, as well as perform profiling and benchmarking.\n", 62 | "\n", 63 | "Rough Rubric \n", 64 | "---\n", 65 | "\n", 66 | "Interim submissions will count towards 20% of the grade and the final submission will make up the remaining 80%.
The rubric is only meant as a guide - for example, if the algorithm is highly complex, we will take that into account and give extra credit for a correct algorithm, even if it is not fully optimized.\n", 67 | "\n", 68 | "- Review of algorithm and the problems it solves (10 points)\n", 69 | " - Literature review\n", 70 | " - Clear explanation of the main ideas behind the algorithm\n", 71 | " - Describe why the algorithm is useful\n", 72 | "- Good programming practices (10 points)\n", 73 | " - Use of Github\n", 74 | " - Use of literate programming in Jupyter\n", 75 | " - Generate final report as PDF via script (e.g. Makefile)\n", 76 | " - Commenting and use of docstrings\n", 77 | "- Implementation of a working program (30 points)\n", 78 | " - Does it run?\n", 79 | " - Is it correct? How do you know? Use of tests\n", 80 | " - Is it written cleanly and efficiently?\n", 81 | "- Optimization efforts (15 points)\n", 82 | " - Opportunities for vectorization\n", 83 | " - Opportunities to use JIT or Cython\n", 84 | " - Opportunities to use multi-core machines\n", 85 | " - Opportunities to use Spark for distributed programming\n", 86 | "- Applications (15 points)\n", 87 | " - Application to simulated or toy datasets\n", 88 | " - Application to one real data set\n", 89 | " - Discussion of results" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 0, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [] 100 | } 101 | ], 102 | "metadata": { 103 | "kernelspec": { 104 | "display_name": "Python 2", 105 | "language": "python", 106 | "name": "python2" 107 | }, 108 | "language_info": { 109 | "codemirror_mode": { 110 | "name": "ipython", 111 | "version": 2 112 | }, 113 | "file_extension": ".py", 114 | "mimetype": "text/x-python", 115 | "name": "python", 116 | "nbconvert_exporter": "python", 117 | "pygments_lexer": "ipython2", 118 | "version": "2.7.11" 119 | } 120 | }, 121 | "nbformat": 4, 122 | "nbformat_minor": 0 123 | } 124 | -------------------------------------------------------------------------------- /projects/FinalProjectGuide.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Final Project Workflow\n", 8 | "====\n", 9 | "\n", 10 | "There is a lot to do, and these are simply suggestions - nobody is expected to do *all* the suggested steps, and there may be alternative strategies you choose to implement. I suggest that you do most of your development in a Jupyter notebook, supplemented by a code editor if necessary (especially if you are writing C/C++), making use of Markdown cells to document what you are doing. That way, you only have to clean up and refine the notebook and you have a final project ready for submission.
\n", 11 | "\n", 12 | "### Week 0\n", 13 | "\n", 14 | "* Choose paper\n", 15 | "* Identify algorithm to implement\n", 16 | "* Write abstract and outline of approach\n", 17 | "\n", 18 | "### Week 1\n", 19 | "\n", 20 | "* Code algorithm in Python\n", 21 | " * Write modular code\n", 22 | " * Functional core - use pure functions where possible\n", 23 | " * Imperative shell - minimize stateful code to interactions and I/O\n", 24 | "* Write tests to check correctness\n", 25 | " * Check boundary conditions\n", 26 | " * Are there known analytic/asymptotic solutions to compare against?\n", 27 | " * Are there other packages implementing the algorithm to compare against?\n", 28 | " * Are there alternative algorithms that should give the same answer?\n", 29 | "\n", 30 | "**Deadline for 1st progress report: 6th April 2016**\n", 31 | "\n", 32 | "### Week 2\n", 33 | "\n", 34 | "* Profile for speed\n", 35 | " * Use cProfile and the `prun` magic\n", 36 | " * Identify performance bottlenecks\n", 37 | "* Optimize slow functions \n", 38 | " * Consider using `line_profiler` if necessary\n", 39 | " * Consider the following strategies:\n", 40 | " * More idiomatic Python\n", 41 | " * Cache results (e.g. `lru_cache` decorator)?\n", 42 | " * Better data structure?\n", 43 | " * Better algorithm?\n", 44 | " * Vectorize with `numpy` or `pandas`\n", 45 | " * Use a JIT compiler (e.g. `numba`)\n", 46 | " * USe `Cython` to recode function\n", 47 | " * Write C/C++ function and wrap for use in Python\n", 48 | "\n", 49 | "### Week 3\n", 50 | "\n", 51 | "* Write parallel code\n", 52 | " * Using Cython `prange` and `openmp`\n", 53 | " * Using threads\n", 54 | " * Using processes\n", 55 | "* Scaling for massive data sets\n", 56 | " * Using appropriate data storage (e.g. HDF5, databases)\n", 57 | " * Using `pyspark` for distributed computing\n", 58 | "* Re-run tests after optimization to check that output has not changed \n", 59 | "* Comparative analysis for each new version with `time` and `timeit` magic \n", 60 | "* Applications \n", 61 | " * Apply to simulated data sets\n", 62 | " * Apply to real data sets \n", 63 | "\n", 64 | "**Deadline for 2nd progress report: 22nd April 2016**\n", 65 | "\n", 66 | "### Week 4\n", 67 | "\n", 68 | "* Packaging\n", 69 | " * Bundle code into a package for distribution\n", 70 | " * Provide instructions for installation on GitHub\n", 71 | " * Upload to Python Package Index if appropriate\n", 72 | "* Clean up work and documentation\n", 73 | "\n", 74 | "### Submission\n", 75 | "\n", 76 | "* Submit final project \n", 77 | " * As a `Jupyter` notebook or series of notebooks using literate programming\n", 78 | " * Generate PDF if possible\n", 79 | " * Use `nbsphinx` to convert to HTML if appropriate\n", 80 | " * As a LaTeX file, using `make` to automate document generation\n", 81 | "\n", 82 | "**Deadline for final report: 22nd April 2016**" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.5.1" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 0 116 
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.5.1" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 0 116 | } 117 | --------------------------------------------------------------------------------