├── .gitignore
├── data
    ├── Chinook_Sqlite.sqlite
    ├── Ulysses.txt
    ├── animals.txt
    ├── animals2.txt
    ├── class.txt
    ├── crash.json
    ├── major.txt
    ├── north_carolina_bicycle_crash_data_heatmap_.json
    ├── reed.xml
    ├── student.txt
    └── student_class.txt
├── exams
    ├── ExtrraCredit-Solutions.ipynb
    ├── ExtrraCredit.ipynb
    ├── HtWt.csv
    ├── Midterm-Revised-Solutions-Final.ipynb
    ├── Midterm-Revised.ipynb
    ├── dexp.png
    └── xy.csv
├── homework
    ├── Homework01.ipynb
    ├── Homework01_Solutions.ipynb
    ├── Homework02.ipynb
    ├── Homework02_Solutions.ipynb
    ├── Homework03.ipynb
    ├── Homework03_Solutions.ipynb
    ├── Homework04.ipynb
    ├── Homework04_Solutions.ipynb
    ├── Homework05.ipynb
    ├── Homework05_Solutions.ipynb
    ├── Homework06.ipynb
    ├── Homework06_Solutions.ipynb
    ├── Homework07.ipynb
    ├── Homework07_Solutions.ipynb
    ├── Homework08.ipynb
    ├── Homework08_Solutions.ipynb
    ├── Homework09.ipynb
    ├── Homework09_Solutions.ipynb
    ├── Homework10.ipynb
    ├── Homework10_Solutions.ipynb
    ├── milkmaid.png
    ├── mystery.txt
    ├── pubmed.pic
    ├── x1d.npy
    └── x2d.npy
├── images
    ├── Scraping data.png
    └── hw2_q4.png
├── lectures
    ├── 01_Introduction_To_Python.ipynb
    ├── 02A_Functions.ipynb
    ├── 02B_Strings.ipynb
    ├── 02C_IO.ipynb
    ├── 02D_Classes.ipynb
    ├── 03A_Numbers.ipynb
    ├── 03B_Graphics.ipynb
    ├── 04A_Data.ipynb
    ├── 04B_SQL.ipynb
    ├── 05_Machine_Learning.ipynb
    ├── 06_LinearAlgebra1.ipynb
    ├── 07_LinearAlgebra2.ipynb
    ├── 08_LinearAlgebraExamples.ipynb
    ├── 09_PCA.ipynb
    ├── 10_SymbolicAlgebra.ipynb
    ├── 11_OptimizationOneDimension.ipynb
    ├── 12_MultivariateOptimizationAlgorithms.ipynb
    ├── 13_Optimization.ipynb
    ├── 14_ExpectationMaximization.ipynb
    ├── 15A_RandomNumbers.ipynb
    ├── 15B_ResamplingAndSimulation.ipynb
    ├── 15C_MonteCarloIntegration.ipynb
    ├── 16A_MCMC.ipynb
    ├── 16B_AuxiliaryVariableMCMC.ipynb
    ├── 16C_PyMC3.ipynb
    ├── 16D_PyStan.ipynb
    ├── 17A_C_Crash_Course.ipynb
    ├── 17B_C_InOneLecture.ipynb
    ├── 17C_C++_Primer_Solutions.ipynb
    ├── 17D_Review_C_C++.ipynb
    ├── 18A_CodeOptimization.ipynb
    ├── 18B_Foreing_Language_Interface.ipynb
    ├── 18C_Numba.ipynb
    ├── 18D_Cython.ipynb
    ├── 18E_Benchmarks.ipynb
    ├── 18F_Optimization_Bakeoff.ipynb
    ├── 19A_Parallel_Programming.ipynb
    ├── 19B_Threads_Processses_Concurrency.ipynb
    ├── 19C_IPyParallel.ipynb
    ├── 20A_Intermediate_Sized_Data.ipynb
    ├── 20B_Big_Data_Structures.ipynb
    ├── 21A_Introduction_To_Spark.ipynb
    ├── 21B_Efficiency_In_Spark.ipynb
    ├── 21C_Spark_SQL.ipynb
    ├── 21D_Spark_MLib.ipynb
    ├── 21E_Spark_And_Sklearn.ipynb
    ├── 21F_Spark_GraphX.ipynb
    ├── 21G_Spark_Streaming,ipynb
    ├── 21H_Spark_Cloud.ipynb
    ├── Customizing_Jupyter.ipynb
    ├── ExercisesForLab01-Solutions.ipynb
    ├── ExercisesForLab01.ipynb
    ├── Extra_Packages.ipynb
    ├── HtWt.csv
    ├── Lagrange_multiplier.png
    ├── Local_Installation.ipynb
    ├── Makefile
    ├── Spark01.ipynb
    ├── Spark02.ipynb
    ├── Spark03.ipynb
    ├── Spark04.ipynb
    ├── Stuff.ipynb
    ├── Template01.ipynb
    ├── commutative.png
    ├── conf.py
    ├── data
    │   ├── Portrait.txt
    │   ├── Ulysses.txt
    │   ├── adult.data.txt
    │   ├── adult.names.txt
    │   ├── adult.test.txt
    │   ├── sonar.all-data.txt
    │   ├── sonar.mines.txt
    │   ├── sonar.names.txt
    │   └── sonar.rocks.txt
    ├── em.png
    ├── index.rst
    ├── jensen.png
    ├── julia_benchmarks.pic
    ├── looking_glass.txt
    ├── make.bat
    ├── mcmc.png
    ├── my_module.py
    ├── radon.csv
    ├── spectral.png
    └── sphinx-readme
├── misc
    ├── Customizing_Jupyter.ipynb
    ├── Local_Installation.ipynb
    ├── Recommended_Books.ipynb
    ├── Spark_Test_Drive.ipynb
    ├── TopicCoverageForMidterm.ipynb
    └── old-exams
    │   ├── Midterm-Revised.ipynb
    │   ├── Midterm-Sample-Revised.ipynb
    │   └── milkmaid.png
├── projects
    ├── FinalProject.ipynb
    └── FinalProjectGuide.ipynb
└── syllabus.md


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.npy
 2 | *.npyz
 3 | *main*
 4 | *.so
 5 | *.dylib
 6 | *.json
 7 | *.txt
 8 | ex[0-9]*
 9 | *.csv
10 | Untitled*
11 | *.o
12 | *.c
13 | *.cpp
14 | *.h
15 | *.hpp
16 | *.pyx
17 | *pxd
18 | *.pic
19 | trace*
20 | .DS_Store
21 | RISE/
22 | *ipynb_checkpoints*
23 | *mplstyle*
24 | lectures/_build/
25 | data/
26 | web/
27 | 
28 | # Byte-compiled / optimized / DLL files
29 | __pycache__/
30 | *.py[cod]
31 | 
32 | # C extensions
33 | *.so
34 | 
35 | # Distribution / packaging
36 | .Python
37 | env/
38 | build/
39 | develop-eggs/
40 | dist/
41 | downloads/
42 | eggs/
43 | .eggs/
44 | lib/
45 | lib64/
46 | parts/
47 | sdist/
48 | var/
49 | *.egg-info/
50 | .installed.cfg
51 | *.egg
52 | 
53 | # PyInstaller
54 | #  Usually these files are written by a python script from a template
55 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
56 | *.manifest
57 | *.spec
58 | 
59 | # Installer logs
60 | pip-log.txt
61 | pip-delete-this-directory.txt
62 | 
63 | # Unit test / coverage reports
64 | htmlcov/
65 | .tox/
66 | .coverage
67 | .coverage.*
68 | .cache
69 | nosetests.xml
70 | coverage.xml
71 | *,cover
72 | 
73 | # Translations
74 | *.mo
75 | *.pot
76 | 
77 | # Django stuff:
78 | *.log
79 | 
80 | # Sphinx documentation
81 | docs/_build/
82 | 
83 | # PyBuilder
84 | target/
85 | 


--------------------------------------------------------------------------------
/data/Chinook_Sqlite.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/data/Chinook_Sqlite.sqlite


--------------------------------------------------------------------------------
/data/animals.txt:
--------------------------------------------------------------------------------
1 | name|species|age|weight
2 | arun|cat|5|7.3
3 | bob|bird|2|1.5
4 | coco|cat|2|5.5
5 | dumbo|elephant|23|454
6 | elmo|dog|5|11
7 | fido|dog|3|24.5
8 | gumba|bird|2|2.7


--------------------------------------------------------------------------------
/data/animals2.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | name|species|age|weight
 3 | arun|cat|5|7.3
 4 | bob|bird|2|1.5
 5 | coco|cat|2|5.5
 6 | dumbo|elephant|23|454
 7 | elmo|dog|5|11
 8 | fido|dog|3|24.5
 9 | gumba|bird|2|2.7
10 | 


--------------------------------------------------------------------------------
/data/class.txt:
--------------------------------------------------------------------------------
1 | class_id,code,name,credits
2 | 1,ANT01,Introduction to Hobbits,4
3 | 2,MAT802,Abstrct Nonsense,8
4 | 3,ENG234,Jabberwocky,2
5 | 4,STA007,Statistics for Secret Agens,4
6 | 5,PHY211,Physics of Star Wars,4


--------------------------------------------------------------------------------
/data/major.txt:
--------------------------------------------------------------------------------
1 | major_id,name
2 | 1,Computer Science
3 | 2,Physics
4 | 3,Statisitcs
5 | 4,English
6 | 5,History


--------------------------------------------------------------------------------
/data/student.txt:
--------------------------------------------------------------------------------
1 | 
2 | student_id,first,last,email,major_id
3 | 1,frodo,baggins,frodo.baggins@duke.edu,1
4 | 2,bilbo,baggins,b_baggins@duke.edu,3
5 | 3,golum,golum,golum.golum@duke.edu,2
6 | 4,gandalf,white,g.white@duke.edu,5
7 | 5,gandalf,grey,g.grey@duke.edu,6
8 | 6,saruman,wise,s.wise@duke.edu,2


--------------------------------------------------------------------------------
/data/student_class.txt:
--------------------------------------------------------------------------------
 1 | student_id,class_id
 2 | 1,3
 3 | 1,4
 4 | 2,1
 5 | 2,4
 6 | 3,1
 7 | 3,2
 8 | 3,3
 9 | 3,5
10 | 4,2
11 | 4,5


--------------------------------------------------------------------------------
/exams/HtWt.csv:
--------------------------------------------------------------------------------
 1 | male,height,weight
 2 | 0,63.2,168.7
 3 | 0,68.7,169.8
 4 | 0,64.8,176.6
 5 | 0,67.9,246.8
 6 | 1,68.9,151.6
 7 | 1,67.8,158.0
 8 | 1,68.2,168.6
 9 | 0,64.8,137.2
10 | 1,64.3,177.0
11 | 0,64.7,128.0
12 | 1,66.9,168.4
13 | 1,66.9,136.2
14 | 1,67.1,160.3
15 | 1,70.2,233.9
16 | 1,67.4,171.7
17 | 1,71.1,185.5
18 | 0,63.4,177.6
19 | 1,66.9,132.9
20 | 0,71.0,140.1
21 | 1,70.4,151.9
22 | 0,59.5,147.2
23 | 1,70.4,159.0
24 | 0,61.5,113.0
25 | 1,74.5,194.5
26 | 0,65.3,145.1
27 | 1,68.8,196.5
28 | 0,67.2,148.9
29 | 1,68.7,132.9
30 | 0,60.0,168.4
31 | 0,62.5,146.2
32 | 1,72.0,236.4
33 | 1,67.9,140.0
34 | 1,65.1,156.2
35 | 1,63.5,178.7
36 | 1,68.2,147.5
37 | 0,64.6,97.7
38 | 1,68.1,189.6
39 | 0,66.2,221.9
40 | 0,62.8,168.1
41 | 0,65.3,143.1
42 | 0,65.8,217.7
43 | 0,68.7,133.2
44 | 0,63.8,96.5
45 | 1,70.6,270.6
46 | 0,61.5,137.2
47 | 0,61.9,124.2
48 | 0,65.1,128.3
49 | 1,68.7,203.6
50 | 0,57.6,132.4
51 | 1,66.3,189.4
52 | 1,69.0,174.0
53 | 0,63.4,163.3
54 | 1,69.5,183.5
55 | 1,67.8,193.8
56 | 0,61.6,119.7
57 | 1,71.2,157.4
58 | 1,67.4,146.1
59 | 0,66.1,128.3
60 | 1,70.7,179.1
61 | 0,67.0,140.0
62 | 1,66.8,202.2
63 | 1,69.9,169.4
64 | 0,57.7,122.8
65 | 0,62.5,248.5
66 | 1,66.6,154.4
67 | 0,60.6,140.2
68 | 1,70.4,141.6
69 | 0,66.4,144.4
70 | 0,62.3,116.2
71 | 1,73.3,175.0
72 | 


--------------------------------------------------------------------------------
/exams/Midterm-Revised.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Mid-term Exam\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "This is a **closed book** exam except for the 1 page cheatsheet. You can use the help function within Jupyer (e.g. `range?`) and links from the Help Menu, but not use any other external reference or search engine. "
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Run this  to get all necessary imports\n",
 18 |     "\n",
 19 |     "Not all the imports need to be used to solve the problems. I am just including everything that I think could be useful."
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 2,
 25 |    "metadata": {
 26 |     "collapsed": true
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import numpy as np\n",
 31 |     "import numpy.random as rng\n",
 32 |     "import scipy.linalg as la\n",
 33 |     "import scipy.stats as stats\n",
 34 |     "import scipy.optimize as opt\n",
 35 |     "import matplotlib.pyplot as plt\n",
 36 |     "import seaborn as sns\n",
 37 |     "import pandas as pd\n",
 38 |     "from pandas import DataFrame, Series\n",
 39 |     "from sympy import symbols, integrate, exp, oo\n",
 40 |     "%matplotlib inline"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "#### Set random number seed so that answers are the same"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "metadata": {
 54 |     "collapsed": false
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "rng.seed(123)"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "**Question 1 (10 points)**\n",
 66 |     "\n",
 67 |     "Using the `iris` dataset, answer the following questions:\n",
 68 |     "\n",
 69 |     "- Find the mean, min and max values of all four measurements (sepal.length, sepal.width, petal.length, petal.width) for each species\n",
 70 |     "- Find the average petal.width for rows where the petal.length is less than the sepal.width"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 2,
 76 |    "metadata": {
 77 |     "collapsed": true
 78 |    },
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "iris = sns.load_dataset('iris')"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {
 88 |     "collapsed": true
 89 |    },
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "\n",
 93 |     "\n",
 94 |     "\n"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "metadata": {},
100 |    "source": [
101 |     "**Question 2 (10 points)**\n",
102 |     "\n",
103 |     "Write a function `peek(df, n)` to display a random selection of $n$ rows of any dataframe (without repitition). Use it to show 5 random rows from the iris data set. The function should take as inputs a dataframe and an integer."
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {
110 |     "collapsed": true
111 |    },
112 |    "outputs": [],
113 |    "source": [
114 |     "\n",
115 |     "\n",
116 |     "\n"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "**Question 3 (30 points)**\n",
124 |     "\n",
125 |     "You are given the following set of data\n",
126 |     "\n",
127 |     "```python\n",
128 |     "x = np.arange(10)\n",
129 |     "y = np.array([  1.58873597,   7.55101533,  10.71372171,   7.90123225,\n",
130 |     "                -2.05877605, -12.40257359, -28.64568712, -46.39822281,\n",
131 |     "                -68.15488905, -97.16032044])\n",
132 |     "```\n",
133 |     "\n",
134 |     "- Find the least squares solution by solving the normal equations $A^T A \\hat{x} = A^T y$ - use `scipy.linalg.solve`. (5 points)\n",
135 |     "\n",
136 |     "- Write your own **gradient descent** optimization function to find the least squares solution for the coefficients $\\beta$ of a quadratic polynomial. Do **not** use a gradient descent algorithm from a package such as `scipy-optimize` or `scikit-learn`.  You can use a simple for loop - start with the parameters `beta = np.zeros(3)` with a learning rate $\\alpha = 0.0001$ and run for 100000 iterations. (15 points)\n",
137 |     "\n",
138 |     "- Plot the data together with the fitted polynomial from the first and second solutions in separate subplots. (10 points)\n",
139 |     "\n"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 3,
145 |    "metadata": {
146 |     "collapsed": true
147 |    },
148 |    "outputs": [],
149 |    "source": [
150 |     "x = np.arange(10)\n",
151 |     "y = np.array([  1.58873597,   7.55101533,  10.71372171,   7.90123225,\n",
152 |     "                -2.05877605, -12.40257359, -28.64568712, -46.39822281,\n",
153 |     "                -68.15488905, -97.16032044])"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {
160 |     "collapsed": true
161 |    },
162 |    "outputs": [],
163 |    "source": [
164 |     "\n",
165 |     "\n",
166 |     "\n"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "**Question 4 (20 points)**\n",
174 |     "\n",
175 |     "Consider the following system of equations:\n",
176 |     "\n",
177 |     "$$\\begin{align*}\n",
178 |     "2x_1& - x_2&    +x_1    &=& 6\\\\\n",
179 |     "-x_1& +2x_2& -  x_3 &=& 2\\\\\n",
180 |     " x_1   &  -x_2& + x_3 &=& 1\n",
181 |     "\\end{align*}$$\n",
182 |     "\n",
183 |     "1. Write the system in matrix form $Ax=b$ and define these in numpy or scipy.\n",
184 |     "2. Show that $A$ is positive-definite\n",
185 |     "3. Use the appropriate matrix decomposition function in numpy and back-substitution to solve the system. Remember to use the structure of the problem to determine the appropriate decomposition.\n"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {
192 |     "collapsed": true
193 |    },
194 |    "outputs": [],
195 |    "source": [
196 |     "\n",
197 |     "\n",
198 |     "\n"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "**Question 5 (10 points)**\n",
206 |     "\n",
207 |     "Let\n",
208 |     "\n",
209 |     "$A = \\left(\\begin{matrix}2 & -1 &1\\\\-1& 2& -1 \\\\1&-1& 1\n",
210 |     "\\end{matrix}\\right) \\;\\;\\;\\;\\;\\;\\textrm{ and }\\;\\;\\;\\;\\;\\; v = \\left(\\begin{matrix}1 \\\\ 1 \\\\2\\end{matrix}\\right)$\n",
211 |     "\n",
212 |     "Find $w$ such that $w$ is conjugate to $v$ under $A$. You may use *basic* linear algebra in scipy or numpy - i.e. matrix products."
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {
219 |     "collapsed": false
220 |    },
221 |    "outputs": [],
222 |    "source": [
223 |     "\n",
224 |     "\n",
225 |     "\n"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "markdown",
230 |    "metadata": {},
231 |    "source": [
232 |     "**Question 6 (20 points)**\n",
233 |     "\n",
234 |     "- The Cauchy distribution is given by \n",
235 |     "$$\n",
236 |     "f(x) = \\frac{1}{\\pi (1 + x^2)}, \\ \\ -\\infty \\lt x \\lt \\infty \n",
237 |     "$$\n",
238 |     "\n",
239 |     "Integrate the tail probability $P(X > 2)$ using Monte Carlo integration with 1 million samples from the uniform distribution using an appropriate change of variables (10 points)\n",
240 |     "\n",
241 |     "- Estimate the following integral using Monte Carlo integration and 1 million draws.  Hint: See figure. (10 points)\n",
242 |     "\n",
243 |     "$$\n",
244 |     "\\int_{-\\infty}^{\\infty} x^2 \\frac{1}{2}e^{-|x|} dx\n",
245 |     "$$\n",
246 |     "\n",
247 |     "![Hint](./dexp.png)"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {
254 |     "collapsed": true
255 |    },
256 |    "outputs": [],
257 |    "source": [
258 |     "\n",
259 |     "\n",
260 |     "\n"
261 |    ]
262 |   }
263 |  ],
264 |  "metadata": {
265 |   "kernelspec": {
266 |    "display_name": "Python 3",
267 |    "language": "python",
268 |    "name": "python3"
269 |   },
270 |   "language_info": {
271 |    "codemirror_mode": {
272 |     "name": "ipython",
273 |     "version": 3
274 |    },
275 |    "file_extension": ".py",
276 |    "mimetype": "text/x-python",
277 |    "name": "python",
278 |    "nbconvert_exporter": "python",
279 |    "pygments_lexer": "ipython3",
280 |    "version": "3.5.1"
281 |   }
282 |  },
283 |  "nbformat": 4,
284 |  "nbformat_minor": 0
285 | }
286 | 


--------------------------------------------------------------------------------
/exams/dexp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/exams/dexp.png


--------------------------------------------------------------------------------
/exams/xy.csv:
--------------------------------------------------------------------------------
 1 | x,y
 2 | 0.0,0.9143693966994388
 3 | 0.1,3.597345446583586
 4 | 0.2,3.4829784980519922
 5 | 0.30000000000000004,2.2937052860819085
 6 | 0.4,3.821399748031464
 7 | 0.5,66.51436537097152
 8 | 0.6000000000000001,3.1733207566069264
 9 | 0.7000000000000001,5.771087371143823
10 | 0.8,8.065936258705534
11 | 0.9,6.533259597734899
12 | 1.0,7.321113848377946
13 | 


--------------------------------------------------------------------------------
/homework/Homework01.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "**Due: 4 PM on Wednesday, 27 Jan 2016**"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Instructions\n",
 15 |     "-----\n",
 16 |     "\n",
 17 |     "Write code to solve all 10 problems. Each problem is worth 10 points. The grading rubric includes the following criteria:\n",
 18 |     "\n",
 19 |     "- Correctness\n",
 20 |     "- Readability\n",
 21 |     "- Efficiency\n",
 22 |     "\n",
 23 |     "Please do not copy answwrs found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy thier answer. \n",
 24 |     "\n",
 25 |     "**Honor Code: By submitting this assignment, you certify that this is your origianl work.**"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 2,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [
 35 |     {
 36 |      "name": "stdout",
 37 |      "output_type": "stream",
 38 |      "text": [
 39 |       "Overwriting data/animals.txt\n"
 40 |      ]
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "%%file ../data/animals.txt\n",
 45 |     "name|species|age|weight\n",
 46 |     "arun|cat|5|7.3\n",
 47 |     "bob|bird|2|1.5\n",
 48 |     "coco|cat|2|5.5\n",
 49 |     "dumbo|elephant|23|454\n",
 50 |     "elmo|dog|5|11\n",
 51 |     "fido|dog|3|24.5\n",
 52 |     "gumba|bird|2|2.7"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "**Q1.** Using only the Unix shell commands, find only rows showing the 3rd, 4th and 5th heaviest animals in the file animals.txt."
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "metadata": {
 66 |     "collapsed": true
 67 |    },
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "\n",
 71 |     "\n",
 72 |     "\n"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "**Q2.** Using only the Unix shell commands, find all files in the current directory and all its subdirecotries that contain the word elephant regardless of case."
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {
 86 |     "collapsed": true
 87 |    },
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "\n",
 91 |     "\n",
 92 |     "\n"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "**Q3.** Using only the Python standard library, find only rows showing the 3rd, 4th and 5th heaviest animals in the file animals.txt"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {
106 |     "collapsed": true
107 |    },
108 |    "outputs": [],
109 |    "source": [
110 |     "\n",
111 |     "\n",
112 |     "\n"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "metadata": {},
118 |    "source": [
119 |     "**Q4.** Using only the Python standard library, find all files in the current directory and all its sub-directories that contain the word `elephant` regardless of case."
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 9,
125 |    "metadata": {
126 |     "collapsed": false
127 |    },
128 |    "outputs": [
129 |     {
130 |      "name": "stdout",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "./Homework01.ipynb\n",
134 |       "./.ipynb_checkpoints/Homework01-checkpoint.ipynb\n"
135 |      ]
136 |     }
137 |    ],
138 |    "source": [
139 |     "# Not graded but here is a possile solution.\n",
140 |     "for dirpath, dirnames, filenames in os.walk('.'):\n",
141 |     "    for filename in filenames:\n",
142 |     "        path = os.path.join(dirpath, filename)\n",
143 |     "        with open(path) as f:\n",
144 |     "            text = f.read()\n",
145 |     "            if 'elephant' in text.lower():\n",
146 |     "                print(path)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "**Q5.** Starting with `range(1, 20)`, make a list of the squares of each odd number in the following ways\n",
154 |     "\n",
155 |     "- With a for loop\n",
156 |     "- Using a list comprehension\n",
157 |     "- Using map and filter\n",
158 |     "\n",
159 |     "The answer should be [1, 9, 25, 49, 81, 121, 169, 225, 289, 361]"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {
166 |     "collapsed": true
167 |    },
168 |    "outputs": [],
169 |    "source": [
170 |     "\n",
171 |     "\n",
172 |     "\n",
173 |     "\n"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "**Q6.** If we list all the natural numbers below 10 that are multiples of 3 or 5, we get 3, 5, 6 and 9. The sum of these multiples is 23. (Euler problem #1)\n",
181 |     "\n",
182 |     "Write a program to find the sum of all the multiples of 3 or 5 below 1000.\n",
183 |     "\n",
184 |     "The answer sould be 233168."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {
191 |     "collapsed": true
192 |    },
193 |    "outputs": [],
194 |    "source": [
195 |     "\n",
196 |     "\n",
197 |     "\n",
198 |     "\n"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "**Q7**. A palindromic number reads the same both ways. The largest palindrome made from the product of two 2-digit numbers is 9009 = 91 × 99.\n",
206 |     "\n",
207 |     "Write a program to find the largest palindrome made from the product of two 3-digit numbers. (Euler problem #4)\n",
208 |     "\n",
209 |     "The answer should be 906609 = 913 × 993. (Thanks to Ilan Man for catching the error)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {
216 |     "collapsed": true
217 |    },
218 |    "outputs": [],
219 |    "source": [
220 |     "\n",
221 |     "\n",
222 |     "\n",
223 |     "\n"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "**Q8.** The sum of the squares of the first ten natural numbers is,\n",
231 |     "$$\n",
232 |     "1^2 + 2^2 + ... + 10^2 = 385\n",
233 |     "$$\n",
234 |     "The square of the sum of the first ten natural numbers is,\n",
235 |     "$$\n",
236 |     "(1 + 2 + ... + 10)^2 = 55^2 = 3025\n",
237 |     "$$\n",
238 |     "Hence the difference between the sum of the squares of the first ten natural numbers and the square of the sum is 3025 − 385 = 2640.\n",
239 |     "\n",
240 |     "Write a program to find the difference between the sum of the squares of the first one hundred natural numbers and the square of the sum. (Euler problem #6)\n",
241 |     "\n",
242 |     "The answer should be 25164150."
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {
249 |     "collapsed": true
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "\n",
254 |     "\n",
255 |     "\n",
256 |     "\n"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "markdown",
261 |    "metadata": {},
262 |    "source": [
263 |     "**Q9.** Problem 8: The four adjacent digits in the 1000-digit number that have the greatest product are 9 × 9 × 8 × 9 = 5832.\n",
264 |     "```\n",
265 |     "73167176531330624919225119674426574742355349194934\n",
266 |     "96983520312774506326239578318016984801869478851843\n",
267 |     "85861560789112949495459501737958331952853208805511\n",
268 |     "12540698747158523863050715693290963295227443043557\n",
269 |     "66896648950445244523161731856403098711121722383113\n",
270 |     "62229893423380308135336276614282806444486645238749\n",
271 |     "30358907296290491560440772390713810515859307960866\n",
272 |     "70172427121883998797908792274921901699720888093776\n",
273 |     "65727333001053367881220235421809751254540594752243\n",
274 |     "52584907711670556013604839586446706324415722155397\n",
275 |     "53697817977846174064955149290862569321978468622482\n",
276 |     "83972241375657056057490261407972968652414535100474\n",
277 |     "82166370484403199890008895243450658541227588666881\n",
278 |     "16427171479924442928230863465674813919123162824586\n",
279 |     "17866458359124566529476545682848912883142607690042\n",
280 |     "24219022671055626321111109370544217506941658960408\n",
281 |     "07198403850962455444362981230987879927244284909188\n",
282 |     "84580156166097919133875499200524063689912560717606\n",
283 |     "05886116467109405077541002256983155200055935729725\n",
284 |     "71636269561882670428252483600823257530420752963450\n",
285 |     "```\n",
286 |     "Write a program to find the thirteen adjacent digits in the 1000-digit number that have the greatest product. What is the value of this product? (Euler problem #8)\n",
287 |     "\n",
288 |     "The answer shoud be 23514624000."
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": null,
294 |    "metadata": {
295 |     "collapsed": true
296 |    },
297 |    "outputs": [],
298 |    "source": [
299 |     "\n",
300 |     "\n",
301 |     "\n",
302 |     "\n"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "metadata": {},
308 |    "source": [
309 |     "**Q10.** A Pythagorean triplet is a set of three natural numbers, a < b < c, for which,\n",
310 |     "\n",
311 |     "$$\n",
312 |     "a^2 + b^2 = c^2\n",
313 |     "$$\n",
314 |     "For example, $3^2 + 4^2 = 9 + 16 = 25 = 5^2$\n",
315 |     "\n",
316 |     "There exists exactly one Pythagorean triplet for which a + b + c = 1000.\n",
317 |     "Write a program to find the product abc. (Euler problem #9)\n",
318 |     "\n",
319 |     "The answer should be (200, 375, 425, 31875000)."
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": null,
325 |    "metadata": {
326 |     "collapsed": true
327 |    },
328 |    "outputs": [],
329 |    "source": [
330 |     "\n",
331 |     "\n",
332 |     "\n",
333 |     "\n"
334 |    ]
335 |   }
336 |  ],
337 |  "metadata": {
338 |   "kernelspec": {
339 |    "display_name": "Python 3",
340 |    "language": "python",
341 |    "name": "python3"
342 |   },
343 |   "language_info": {
344 |    "codemirror_mode": {
345 |     "name": "ipython",
346 |     "version": 3
347 |    },
348 |    "file_extension": ".py",
349 |    "mimetype": "text/x-python",
350 |    "name": "python",
351 |    "nbconvert_exporter": "python",
352 |    "pygments_lexer": "ipython3",
353 |    "version": "3.5.1"
354 |   }
355 |  },
356 |  "nbformat": 4,
357 |  "nbformat_minor": 0
358 | }
359 | 


--------------------------------------------------------------------------------
/homework/Homework03.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "**Due: 4 PM on Wednesday, 10 Feb 2016**"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Instructions\n",
 15 |     "-----\n",
 16 |     "\n",
 17 |     "Write code to solve all problems. The grading rubric includes the following criteria:\n",
 18 |     "\n",
 19 |     "- Correctness\n",
 20 |     "- Readability\n",
 21 |     "- Efficiency\n",
 22 |     "\n",
 23 |     "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n",
 24 |     "\n",
 25 |     "**Honor Code: By submitting this assignment, you certify that this is your original work.**\n",
 26 |     "\n",
 27 |     "**Note**: These exercises will involve quite a bit more code writing than the first 2 homework assignments so start early. They are also intentionally less specific so that you have to come up with your own plan to complete the exercises."
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "We will use the following data sets:\n",
 35 |     "```python\n",
 36 |     "titanic = sns.load_dataset(\"titanic\")\n",
 37 |     "iris = sns.load_dataset(\"iris\")\n",
 38 |     "```"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "**Q1 (20 pts)** Working with `numpy.random`.\n",
 46 |     "\n",
 47 |     "**Part 1 (10 pts)** Consider a sequence of $n$ Bernoulli trials with success probabilty $p$ per trial. A string of consecutive successes is known as a success *run*. Write a function that returns the counts for runs of length $k$ for each $k$ observed in a dictionary.\n",
 48 |     "\n",
 49 |     "For example: if the trials were [0, 1, 0, 1, 1, 0, 0, 0, 0, 1], the function should return \n",
 50 |     "```\n",
 51 |     "{1: 2, 2: 1})\n",
 52 |     "```"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "metadata": {
 59 |     "collapsed": true
 60 |    },
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "\n",
 64 |     "\n",
 65 |     "\n",
 66 |     "\n"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "**Part 2 (10 pts)** Continuing from Part 1, what is the probability of observing at least one run of length 5 or more when $n=100$ and $p=0.5$?. Estimate this from 100,000 simulated experiments. Is this more, less or equally likely than finding runs of length 7 or more when $p=0.7$?"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {
 80 |     "collapsed": true
 81 |    },
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "\n"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "**Q2. (30 pts)** \n",
 95 |     "\n",
 96 |     "Using `RandomForestClassifier` from `sklearn`, find the 5 most important predictors of survival on the Titanic. Compare the accuracy of prediction using only these 5 predictors and using all non-redundant predictors. Some intial pre-processing code is provided. Hint: check out the `pandas.get_dummies()` function."
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {
103 |     "collapsed": false
104 |    },
105 |    "outputs": [],
106 |    "source": [
107 |     "titanic = sns.load_dataset(\"titanic\")\n",
108 |     "titanic.head()"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "titanic.drop(['alive', 'embarked', 'class', 'who', 'adult_male'], axis=1, inplace=True)\n",
120 |     "titanic.dropna(axis=0, inplace=True)\n",
121 |     "titanic.head()"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {
128 |     "collapsed": true
129 |    },
130 |    "outputs": [],
131 |    "source": [
132 |     "\n",
133 |     "\n",
134 |     "\n",
135 |     "\n",
136 |     "\n",
137 |     "\n",
138 |     "\n",
139 |     "\n",
140 |     "\n"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "**Q2. (25 pts)**\n",
148 |     "\n",
149 |     "Using `sklearn`, perform unsupervised learning of the iris data using 2 different clustering methods. Do NOT assume you know the number of clusters - rather the code should either determine it from the data or compare models with different numbers of components using some appropriate test statistic. Make a pairwise scatter  plot of the four predictor variables indicating cluster by color for each unsupervised learning method used."
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "collapsed": true
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "\n",
161 |     "\n",
162 |     "\n",
163 |     "\n",
164 |     "\n",
165 |     "\n",
166 |     "\n",
167 |     "\n",
168 |     "\n",
169 |     "\n"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "markdown",
174 |    "metadata": {},
175 |    "source": [
176 |     "**Q3. (50 pts)**\n",
177 |     "\n",
178 |     "Write code to generate a plot similar to the following ![figure](http://mathworld.wolfram.com/images/eps-gif/ElementaryCA_850.gif) using the explanation for generation of 1D Cellular Automata found [here](http://mathworld.wolfram.com/ElementaryCellularAutomaton.html). You should only need to use standard Python, `numpy` and `matplotllib`."
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "To make it simpler, I have provided the code for plotting below. All you need to do is to supply the `make_ca` function (which may of course use as many ohter custom functons as you deem necessary). As you can see from the code below, the `make_ca` function takes 3 arguments\n",
186 |     "```\n",
187 |     "rule - an integer e.g. 30\n",
188 |     "init - an initial state i.e. the first row of the image\n",
189 |     "niter - the number of iterations i.e. the number of rows in the image\n",
190 |     "```"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {
197 |     "collapsed": true
198 |    },
199 |    "outputs": [],
200 |    "source": [
201 |     "\n",
202 |     "\n",
203 |     "\n",
204 |     "\n",
205 |     "\n",
206 |     "\n",
207 |     "\n",
208 |     "\n",
209 |     "\n",
210 |     "\n"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {
217 |     "collapsed": true
218 |    },
219 |    "outputs": [],
220 |    "source": [
221 |     "from matplotlib.ticker import NullFormatter, IndexLocator\n",
222 |     "\n",
223 |     "def plot_grid(rule, grid, ax=None):\n",
224 |     "    if ax is None:\n",
225 |     "        ax = plt.subplot(111)\n",
226 |     "    ax.grid(True, which='major', color='grey', linewidth=0.5)\n",
227 |     "    ax.imshow(grid, interpolation='none', cmap='Greys', aspect=1, alpha=0.8)\n",
228 |     "    ax.xaxis.set_major_locator(IndexLocator(1, 0))\n",
229 |     "    ax.yaxis.set_major_locator(IndexLocator(1, 0))\n",
230 |     "    ax.xaxis.set_major_formatter( NullFormatter() )\n",
231 |     "    ax.yaxis.set_major_formatter( NullFormatter() )\n",
232 |     "    ax.set_title('Rule %d' % rule)"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {
239 |     "collapsed": false
240 |    },
241 |    "outputs": [],
242 |    "source": [
243 |     "niter = 15\n",
244 |     "width = niter*2+1\n",
245 |     "init = np.zeros(width, 'int')\n",
246 |     "init[width//2] = 1\n",
247 |     "rules = np.array([30, 54, 60, 62, 90, 94, 102, 110, 122, 126, \n",
248 |     "                  150, 158, 182, 188, 190, 220, 222, 250]).reshape((-1, 3))\n",
249 |     "\n",
250 |     "nrows, ncols = rules.shape\n",
251 |     "fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*3, nrows*2))\n",
252 |     "for i in range(nrows):\n",
253 |     "    for j in range(ncols):\n",
254 |     "        grid = make_ca(rules[i, j], init, niter)\n",
255 |     "        plot_grid(rules[i, j], grid, ax=axes[i,j])\n",
256 |     "plt.tight_layout()"
257 |    ]
258 |   }
259 |  ],
260 |  "metadata": {
261 |   "kernelspec": {
262 |    "display_name": "Python 3",
263 |    "language": "python",
264 |    "name": "python3"
265 |   },
266 |   "language_info": {
267 |    "codemirror_mode": {
268 |     "name": "ipython",
269 |     "version": 3
270 |    },
271 |    "file_extension": ".py",
272 |    "mimetype": "text/x-python",
273 |    "name": "python",
274 |    "nbconvert_exporter": "python",
275 |    "pygments_lexer": "ipython3",
276 |    "version": "3.5.1"
277 |   }
278 |  },
279 |  "nbformat": 4,
280 |  "nbformat_minor": 0
281 | }
282 | 


--------------------------------------------------------------------------------
/homework/Homework04.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "**Due: 4 PM on Wednesday, 17 Feb 2016**\n",
  8 |     "\n",
  9 |     "The usual warnings apply - the homework is not officially released until 11 Feb 2016, and we may make changes till then."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Instructions\n",
 17 |     "-----\n",
 18 |     "\n",
 19 |     "Write code to solve all problems. The grading rubric includes the following criteria:\n",
 20 |     "\n",
 21 |     "- Correctness\n",
 22 |     "- Readability\n",
 23 |     "- Efficiency\n",
 24 |     "\n",
 25 |     "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n",
 26 |     "\n",
 27 |     "**Honor Code: By submitting this assignment, you certify that this is your original work.**"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "**Question 1 (20 points)**. \n",
 35 |     "\n",
 36 |     "Euclid's algorithm for finding the greatest common divisor of two numbers is\n",
 37 |     "\n",
 38 |     "```python\n",
 39 |     "gcd(a, 0) = a\n",
 40 |     "gcd(a, b) = gcd(b, a modulo b)\n",
 41 |     "```\n",
 42 |     "\n",
 43 |     "- Write a function to find the greatest common divisor in Python (8 points)\n",
 44 |     "- What is the greatest common divisor of 17384 and 1928? (2 point)\n",
 45 |     "- Write a function to calculate the least common multiple (8 points)\n",
 46 |     "- What is the least common multiple of 17384 and 1928? (2 point)"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {
 53 |     "collapsed": true
 54 |    },
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "\n",
 58 |     "\n",
 59 |     "\n",
 60 |     "\n"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "**Question 2 (20 points)**. \n",
 68 |     "\n",
 69 |     "Consider the linear transformation $f(x)$ on $\\mathbb{R}^3$ that takes the standard basis $\\left\\{e_1,e_2,e_3\\right\\}$ to $\\left\\{v_1,v_2,v_3\\right\\}$ where\n",
 70 |     "\n",
 71 |     "$$v_1=\\left(\\begin{matrix}10\\\\-10\\\\16\\end{matrix}\\right), v_2=\\left(\\begin{matrix}2\\\\-5\\\\20\\end{matrix}\\right) \\textrm {and } v_3=\\left(\\begin{matrix}1\\\\-4\\\\13\\end{matrix}\\right)$$\n",
 72 |     "\n",
 73 |     "1. Write a matrix $A$ that represents the same linear transformaton. (4 points)\n",
 74 |     "\n",
 75 |     "2. Compute the rank of $A$ using two different methods (do not use `matrix_rank`!). (4 points)\n",
 76 |     "\n",
 77 |     "3. Find the eigenvalues and eigenvectors of $A$. (4 points)\n",
 78 |     "\n",
 79 |     "4. What is the matrix representation of $f$ with respect to the eigenbasis? (48 points)"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "metadata": {
 86 |     "collapsed": true
 87 |    },
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "\n",
 91 |     "\n",
 92 |     "\n",
 93 |     "\n"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "**Exercise 3 (20 pts)**. Avodiing catastrophic cancellation.\n",
101 |     "\n",
102 |     "Read the Wikipedia entry on [loss of significance](https://en.wikipedia.org/wiki/Loss_of_significance). Then answer the following problem:\n",
103 |     "\n",
104 |     "The tail of the standard logistic distributon is given by $1 - F(t) = 1 - (1+e^{-t})^{-1}$.\n",
105 |     "\n",
106 |     "- Define a function `f1` to calculate the tail probability of the logistic distribution using the formula given above\n",
107 |     "- Use [`sympy`](http://docs.sympy.org/latest/index.html) to find the exact value of the tail distribution (using the same symbolic formula) to 20 decimal digits\n",
108 |     "- Calculate the *relative error* of `f1` when $t = 25$ (The relative error is given by `abs(exact - approximate)/exact`)\n",
109 |     "- Rewrite the expression for the tail of the logistic distribution using simple algebra so that there is no risk of cancellation, and write a function `f2` using this formula. Calculate the *relative error* of `f2` when $t = 25$. \n",
110 |     "- How much more accurate is `f2` compared with `f1` in terms of the relative error?"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {
117 |     "collapsed": true
118 |    },
119 |    "outputs": [],
120 |    "source": [
121 |     "\n",
122 |     "\n",
123 |     "\n"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {},
129 |    "source": [
130 |     "**Exercise 4 (40 pts)**. One of the goals of the course it that you will be able to implement novel algorihtms from the literature. \n",
131 |     "\n",
132 |     "- Implement the mean-shift algorithm in 1D as described [here](http://homepages.inf.ed.ac.uk/rbf/CVonline/LOCAL_COPIES/TUZEL1/MeanShift.pdf). \n",
133 |     "    - Use the following function signature\n",
134 |     "    ```python\n",
135 |     "    def mean_shift(xs, x, kernel, max_iters=100, tol=1e-6):\n",
136 |     "    ```\n",
137 |     "    - xs is the data set, x is the starting location, and kernel is a kernel function\n",
138 |     "    - tol is the difference in $||x||$ across iterations\n",
139 |     "- Use the following kernels with bandwidth $h$ (a default value of 1.0 will work fine)\n",
140 |     "    - Flat - return 1 if $||x|| < h$ and 0 otherwise\n",
141 |     "    - Gaussian \n",
142 |     "    $$\\frac{1}{\\sqrt{2 \\pi h}}e^{\\frac{-||x||^2}{h^2}}$$\n",
143 |     "    - Note that $||x||$ is the norm of the data point being evaluated minus the current value of $x$\n",
144 |     "- Use both kernels to find all 3 modes of the data set in `x1d.npy`\n",
145 |     "- Modify the algorihtm abd/or kernels so that it now works in an arbitrary number of dimensions.\n",
146 |     "- Uset both kernels to find all 3 modes of the data set in `x2d.npy`\n",
147 |     "- Plot the path of successive intermeidate solutions of the mean-shift algorithm starting from `x0 = (-4, 5)` until it converges onto a mode in the 2D data for each kernel. Superimposet the path on top of a contour plot of the data density. Repeat for `x0 = (0, 0)` and `x0 = (10, 10)` ."
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {
154 |     "collapsed": true
155 |    },
156 |    "outputs": [],
157 |    "source": []
158 |   }
159 |  ],
160 |  "metadata": {
161 |   "kernelspec": {
162 |    "display_name": "Python 3",
163 |    "language": "python",
164 |    "name": "python3"
165 |   },
166 |   "language_info": {
167 |    "codemirror_mode": {
168 |     "name": "ipython",
169 |     "version": 3
170 |    },
171 |    "file_extension": ".py",
172 |    "mimetype": "text/x-python",
173 |    "name": "python",
174 |    "nbconvert_exporter": "python",
175 |    "pygments_lexer": "ipython3",
176 |    "version": "3.5.1"
177 |   }
178 |  },
179 |  "nbformat": 4,
180 |  "nbformat_minor": 0
181 | }
182 | 


--------------------------------------------------------------------------------
/homework/Homework05.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "**Due: 4 PM on Wednesday, 24 Feb 2016**\n",
  8 |     "\n",
  9 |     "The usual warnings apply - the homework is not officially released until 18 Feb 2016, and we may make changes till then."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Instructions\n",
 17 |     "-----\n",
 18 |     "\n",
 19 |     "Write code to solve all problems. The grading rubric includes the following criteria:\n",
 20 |     "\n",
 21 |     "- Correctness\n",
 22 |     "- Readability\n",
 23 |     "- Efficiency\n",
 24 |     "\n",
 25 |     "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n",
 26 |     "\n",
 27 |     "**Honor Code: By submitting this assignment, you certify that this is your original work.**"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "**Question 1 (25 points).** Consider the following function on $\\mathbb{R}^2$:\n",
 35 |     "\n",
 36 |     "$$f(x_1,x_2) = -x_1x_2e^{-\\frac{(x_1^2+x_2^2)}{2}}$$\n",
 37 |     "\n",
 38 |     "1. Use `sympy` to compute its gradient.\n",
 39 |     "2. Compute the Hessian matrix. \n",
 40 |     "3. Find the critical points of $f$.\n",
 41 |     "4. Characterize the critical points as max/min or neither. \n",
 42 |     "5. Find the minimum under the constraint \n",
 43 |     "$$g(x) = x_1^2+x_2^2 \\leq 10$$\n",
 44 |     "and \n",
 45 |     "$$h(x) = 2x_1 + 3x_2 = 5$$ using `scipy.optimize.minimize`.\n",
 46 |     "5. Plot the function using `matplotlib`.\n"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {
 53 |     "collapsed": true
 54 |    },
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "\n",
 58 |     "\n",
 59 |     "\n",
 60 |     "\n"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "**Question 2 (15 points).**\n",
 68 |     "\n",
 69 |     "A milkmaid is at point A and needs to get to point B. However, she also needs to fill a pail of water from the river en route from A to B. The equation of the river's path is shown in the figure below. What is the minimum distance she has to travel to do this?\n",
 70 |     "\n",
 71 |     "1. Solve using `scipy.optimize` and constrained minimization.\n",
 72 |     "2. 2. Create a plot of the solution using matplotlib (similar to provided figure but with optimal path added).\n",
 73 |     "\n",
 74 |     "Note: Beware of local optima. \n",
 75 |     "\n",
 76 |     "![Milkmaid problem](milkmaid.png)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {
 83 |     "collapsed": true
 84 |    },
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "\n",
 88 |     "\n",
 89 |     "\n",
 90 |     "\n",
 91 |     "\n"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "Background to Q3 - Q5\n",
 99 |     "----\n",
100 |     "\n",
101 |     "Latent Semantic Analysis (LSA) is a method for reducing the dimnesionality of documents treated as a bag of words. It is used for document classification, clustering and retrieval. For example, LSA can be used to search for prior art given a new patent application. In this homework, we will implement a small library for simple latent semantic analysis as a practical example of the application of SVD. The ideas are very similar to PCA.\n",
102 |     "\n",
103 |     "We will implement a toy example of LSA to get familiar with the ideas. If you want to use LSA or similar methods for statiscal language analyis, the most efficient Python library is probably [gensim](https://radimrehurek.com/gensim/) - this also provides an online algorithm - i.e. the training information can be continuously updated. Other useful functions for processing natural language can be found in the [Natural Lnaguage Toolkit](http://www.nltk.org/).\n",
104 |     "\n",
105 |     "**Note**: The SVD from scipy.linalg performs a full decomposition, which is inefficient since we only need to decompose until we get the first k singluar values. If the SVD from `scipy.linalg` is too slow, please use the `sparsesvd` function from the [sparsesvd](https://pypi.python.org/pypi/sparsesvd/) package to perform SVD instead.  You can install in the usual way with \n",
106 |     "```\n",
107 |     "!pip install sparsesvd\n",
108 |     "```\n",
109 |     "\n",
110 |     "Then import the following\n",
111 |     "```python\n",
112 |     "from sparsesvd import sparsesvd \n",
113 |     "from scipy.sparse import csc_matrix \n",
114 |     "```\n",
115 |     "\n",
116 |     "and use as follows\n",
117 |     "```python\n",
118 |     "sparsesvd(csc_matrix(M), k=10)\n",
119 |     "```\n",
120 |     "\n"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "**Question 3 (20 points)**: Write 3 functions to calculate the term frequency (tf), the inverse document frequency (idf) and the product (tf-idf). Each function should take a single argument `docs`, which is a dictionary of (key=identifier, value=dcoument text) pairs, and return an appropriately sized array. Convert '-' to ' ' (space), remove punctuation, convert text to lowercase and split on whitespace to generate a collection of terms from the dcoument text.\n",
128 |     "\n",
129 |     "- tf = the number of occurrences of term $i$ in document $j$\n",
130 |     "- idf = $\\log \\frac{n}{1 + \\text{df}_i}$ where $n$ is the total number of documents and $\\text{df}_i$ is the number of documents in which term $i$ occurs.\n",
131 |     "\n",
132 |     "Print the table of tf-idf values for the following document collection\n",
133 |     "\n",
134 |     "```\n",
135 |     "s1 = \"The quick brown fox\"\n",
136 |     "s2 = \"Brown fox jumps over the jumps jumps jumps\"\n",
137 |     "s3 = \"The the the lazy dog elephant.\"\n",
138 |     "s4 = \"The the the the the dog peacock lion tiger elephant\"\n",
139 |     "\n",
140 |     "docs = {'s1': s1, 's2': s2, 's3': s3, 's4': s4}\n",
141 |     "```\n",
142 |     "\n",
143 |     "Note: You can use either a numpy array or pandas dataframe to store the matrix. However, we suggest using a Pnadas dataframe since that will allow you to keep track of the row (term) and column (document) names in a single object. Of course, you could also maintain a numpy matrix, a list of terms, and a list of documents separately if you prefer.\n",
144 |     "\n",
145 |     "\n",
146 |     "\n"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {
153 |     "collapsed": true
154 |    },
155 |    "outputs": [],
156 |    "source": [
157 |     "\n",
158 |     "\n",
159 |     "\n"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "markdown",
164 |    "metadata": {},
165 |    "source": [
166 |     "**Question 4 (20 points)**\n",
167 |     "\n",
168 |     "1. Write a function that takes a matrix $M$ and an integer $k$ as arguments, and reconstructs a reduced matrix using only the $k$ largest singular values. Use the `scipy.linagl.svd` function to perform the decomposition. This is the least squares approximation to the matrix $M$ in $k$ dimensions.\n",
169 |     "\n",
170 |     "2. Apply the function you just wrote to the following term-frequency matrix for a set of $9$ documents using $k=2$ and print the reconstructed matrix $M'$.\n",
171 |     "```\n",
172 |     "M = np.array([[1, 0, 0, 1, 0, 0, 0, 0, 0],\n",
173 |     "       [1, 0, 1, 0, 0, 0, 0, 0, 0],\n",
174 |     "       [1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
175 |     "       [0, 1, 1, 0, 1, 0, 0, 0, 0],\n",
176 |     "       [0, 1, 1, 2, 0, 0, 0, 0, 0],\n",
177 |     "       [0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
178 |     "       [0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
179 |     "       [0, 0, 1, 1, 0, 0, 0, 0, 0],\n",
180 |     "       [0, 1, 0, 0, 0, 0, 0, 0, 1],\n",
181 |     "       [0, 0, 0, 0, 0, 1, 1, 1, 0],\n",
182 |     "       [0, 0, 0, 0, 0, 0, 1, 1, 1],\n",
183 |     "       [0, 0, 0, 0, 0, 0, 0, 1, 1]])\n",
184 |     "```\n",
185 |     "\n",
186 |     "3. Calculate the pairwise correlation matrix for the original matrix M and the reconstructed matrix using $k=2$ singular values (you may use [scipy.stats.spearmanr](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html) to do the calculations). Consider the fist 5 sets of documents as one group $G1$ and the last 4 as another group $G2$ (i.e. first 5 and last 4 columns). What is the average within group correlation for $G1$, $G2$ and the average cross-group correlation for G1-G2 using either $M$ or $M'$. (Do not include self-correlation in the within-group calculations.)."
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {
193 |     "collapsed": true
194 |    },
195 |    "outputs": [],
196 |    "source": [
197 |     "\n",
198 |     "\n",
199 |     "\n",
200 |     "\n"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "**Question 5 (20 points)**. Clustering with LSA\n",
208 |     "\n",
209 |     "1. Begin by loading a pubmed database of selected article titles using:\n",
210 |     "```python\n",
211 |     "import pickle\n",
212 |     "docs = pickle.load(open('pubmed.pic', 'rb'))\n",
213 |     "```\n",
214 |     "    Create a tf-idf matrix for every term that appears at least once in any of the documents. What is the shape of the tf-idf matrix? \n",
215 |     "\n",
216 |     "2. Perform SVD on the tf-idf matrix to obtain $U \\Sigma V^T$ (often written as $T \\Sigma D^T$ in this context with $T$ representing the terms and $D$ representing the documents). If we set all but the top $k$ singular values to 0, the reconstructed matrix is essentially $U_k \\Sigma_k V_k^T$, where $U_k$ is $m \\times k$, $\\Sigma_k$ is $k \\times k$ and $V_k^T$ is $k \\times n$. Terms in this reduced space are represented by $U_k \\Sigma_k$ and documents by $\\Sigma_k V^T_k$. Reconstruct the matrix using the first $k=10$ singular values.\n",
217 |     "\n",
218 |     "3. Use agglomerative hierachical clustering with complete linkage to plot a dendrogram and comment on the likely number of  document clusters with $k = 100$. Use the dendrogram function from [SciPy ](https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.cluster.hierarchy.dendrogram.html).\n",
219 |     "\n",
220 |     "4. Determine how similar each of the original documents is to the new document `mystery.txt`. Since $A = U \\Sigma V^T$, we also have $V = A^T U S^{-1}$ using orthogonality and the rule for transposing matrix products. This suggests that in order to map the new document to the same concept space, first find the tf-idf vector $v$ for the new document - this must contain all (and only) the terms present in the existing tf-idx matrix. Then the query vector $q$ is given by $v^T U_k \\Sigma_k^{-1}$. Find the 10 documents most similar to the new document and the 10 most dissimilar. \n",
221 |     "\n",
222 |     "5. Many documents often have some boilerplate material such as organization information, Copyright, etc. at the front or back of the document. Does it matter that the front and back matter of each document is essentially identical for either LSA-based clustering (part 3) or information retrieval (part 4)? Why or why not?"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 1,
228 |    "metadata": {
229 |     "collapsed": false
230 |    },
231 |    "outputs": [
232 |     {
233 |      "data": {
234 |       "text/plain": [
235 |        "<_io.TextIOWrapper name='mystery.txt' mode='r' encoding='UTF-8'>"
236 |       ]
237 |      },
238 |      "execution_count": 1,
239 |      "metadata": {},
240 |      "output_type": "execute_result"
241 |     }
242 |    ],
243 |    "source": [
244 |     "open('mystery.txt')\n",
245 |     "\n",
246 |     "\n",
247 |     "\n"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {
254 |     "collapsed": true
255 |    },
256 |    "outputs": [],
257 |    "source": []
258 |   }
259 |  ],
260 |  "metadata": {
261 |   "kernelspec": {
262 |    "display_name": "Python 3",
263 |    "language": "python",
264 |    "name": "python3"
265 |   },
266 |   "language_info": {
267 |    "codemirror_mode": {
268 |     "name": "ipython",
269 |     "version": 3
270 |    },
271 |    "file_extension": ".py",
272 |    "mimetype": "text/x-python",
273 |    "name": "python",
274 |    "nbconvert_exporter": "python",
275 |    "pygments_lexer": "ipython3",
276 |    "version": "3.5.1"
277 |   }
278 |  },
279 |  "nbformat": 4,
280 |  "nbformat_minor": 0
281 | }
282 | 


--------------------------------------------------------------------------------
/homework/Homework07.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "**Due: 4 PM on Wednesday, 24 Mar 2016**"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Instructions\n",
 15 |     "-----\n",
 16 |     "\n",
 17 |     "Write code to solve all problems. The grading rubric includes the following criteria:\n",
 18 |     "\n",
 19 |     "- Correctness\n",
 20 |     "- Readability\n",
 21 |     "- Efficiency\n",
 22 |     "\n",
 23 |     "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc. is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n",
 24 |     "\n",
 25 |     "**Honor Code: By submitting this assignment, you certify that this is your original work.**"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "**Exercise 1 (50 points)**\n",
 33 |     "\n",
 34 |     "#### Gibbs sampler example from [Robert and Casella, 10.17](http://www.springer.com/statistics/statistical+theory+and+methods/book/978-0-387-21239-5)\n",
 35 |     "\n",
 36 |     "Suppose we have data of the number of failures ($y_i$) for each of 10 pumps in a nuclear plant. We also have the times ($t_i$) at which each pump was observed. We want to model the number of failures with a Poisson likelihood, where the expected number of failure $\\lambda_i$ differs for each pump. Since the time which we observed each pump is different, we need to scale each $\\lambda_i$ by its observed time $t_i$. To be explicit, we assume that $y_i$ has a Poisson distribution with rate $\\mu_i = \\lambda_i t_i$.\n",
 37 |     "\n",
 38 |     "The likelihood $f$ is \n",
 39 |     "$$\n",
 40 |     "\\prod_{i=1}^{10} \\text{Poisson}(\\mu_i)\n",
 41 |     "$$\n",
 42 |     "\n",
 43 |     "We let the prior $g$ for $\\lambda$ be \n",
 44 |     "\n",
 45 |     "$$\n",
 46 |     "\\lambda \\sim \\text{Gamma}(\\alpha_\\mu, \\beta_\\mu)\n",
 47 |     "$$\n",
 48 |     "\n",
 49 |     "and let the hyperprior $h$ for $\\alpha$ to be \n",
 50 |     "\n",
 51 |     "$$\n",
 52 |     "\\alpha \\sim \\text{Gamma}(\\alpha_\\alpha, \\beta_\\alpha)\n",
 53 |     "$$\n",
 54 |     "\n",
 55 |     "with $\\alpha_\\alpha = 1.8$ and $\\beta_\\alpha = 1.0$.\n",
 56 |     "\n",
 57 |     "and let the hyperprior $h$ for $\\beta$ to be \n",
 58 |     "\n",
 59 |     "$$\n",
 60 |     "\\beta \\sim \\text{Gamma}(\\alpha_\\beta, \\beta_\\beta)\n",
 61 |     "$$\n",
 62 |     "\n",
 63 |     "with $\\alpha_\\beta = 10.0$ and $\\beta_\\beta = 1.0$.\n",
 64 |     "\n",
 65 |     "There are 12 unknown parameters (10 $\\lambda$s, $\\alpha$ and $\\beta$) in this hierarchical model. Do th following using `pymc3` and some plotting package.\n",
 66 |     "\n",
 67 |     "- Wrtie the model and run for 10,000 iterations using the No U-Turn Sampler (30 points)\n",
 68 |     "- plot the traces and distributions of the last 10% for $\\lambda_i$, $\\alpha$ and $\\beta$ (there should be 12 sets of plots) (10 points)\n",
 69 |     "- Gnnerate 1,000 samples of the number of failures $y_i$ from the prior distribution and plot the histogram or density. That is, for each of the 10 pumps, we want to see the distribtuion of 1,000 draws of the number of failures (5 points).\n",
 70 |     "- Generate 1,000 posterior predictive samples of the number of failures $y_i$ and plot the histogram or density. This is similar to the previous question but using draws from the posterior  (5 points)\n",
 71 |     "\n",
 72 |     "Use the following data\n",
 73 |     "```python\n",
 74 |     "y = np.array([5, 1, 5, 14, 3, 19, 1, 1, 4, 22])\n",
 75 |     "t = np.array([94.32, 15.72, 62.88, 125.76, 5.24, 31.44, 1.05, 1.05, 2.10, 10.48])\n",
 76 |     "```"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 2,
 82 |    "metadata": {
 83 |     "collapsed": true
 84 |    },
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "import pymc3 as pm"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 23,
 93 |    "metadata": {
 94 |     "collapsed": true
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "y = np.array([5, 1, 5, 14, 3, 19, 1, 1, 4, 22])\n",
 99 |     "t = np.array([94.32, 15.72, 62.88, 125.76, 5.24, 31.44, 1.05, 1.05, 2.10, 10.48])"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {
106 |     "collapsed": true
107 |    },
108 |    "outputs": [],
109 |    "source": [
110 |     "\n",
111 |     "\n",
112 |     "\n",
113 |     "\n"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {},
119 |    "source": [
120 |     "**Exercise 2 (50 points)**\n",
121 |     "\n",
122 |     "Repeat Exercise 1 using `pystan`."
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {
129 |     "collapsed": true
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "\n",
134 |     "\n",
135 |     "\n",
136 |     "\n"
137 |    ]
138 |   }
139 |  ],
140 |  "metadata": {
141 |   "kernelspec": {
142 |    "display_name": "Python 3",
143 |    "language": "python",
144 |    "name": "python3"
145 |   },
146 |   "language_info": {
147 |    "codemirror_mode": {
148 |     "name": "ipython",
149 |     "version": 3
150 |    },
151 |    "file_extension": ".py",
152 |    "mimetype": "text/x-python",
153 |    "name": "python",
154 |    "nbconvert_exporter": "python",
155 |    "pygments_lexer": "ipython3",
156 |    "version": "3.5.1"
157 |   }
158 |  },
159 |  "nbformat": 4,
160 |  "nbformat_minor": 0
161 | }
162 | 


--------------------------------------------------------------------------------
/homework/Homework08.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "**Due: 4 PM on Wednesday, 30 Mar 2016**"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Instructions\n",
 15 |     "-----\n",
 16 |     "\n",
 17 |     "Write code to solve all problems. The grading rubric includes the following criteria:\n",
 18 |     "\n",
 19 |     "- Correctness\n",
 20 |     "- Readability\n",
 21 |     "- Efficiency\n",
 22 |     "\n",
 23 |     "Please do not copy answers found on the web or elsewhere as it will not benefit your learning. Searching the web for general references etc. is OK. Some discussion with friends is fine too - but again, do not just copy their answer. \n",
 24 |     "\n",
 25 |     "**Honor Code: By submitting this assignment, you certify that this is your original work.**"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "**Exercise 1 (25 points)**\n",
 33 |     "\n",
 34 |     "- Write a function in C that calculates the mean of an array of doubles, putting the function declaration and function definition in separate files (10 points)\n",
 35 |     "- Write a driver program to call the function with the inputs being an array containing the numbers 1,2,3,4,5 and print the results to standard output (5 pints)\n",
 36 |     "- Write a `makefile` that compiles the executable upon calling `make` at the command line and removes all generated files upon calling `make clean`  (10 points)"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "\n",
 48 |     "\n",
 49 |     "\n"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "**Exercise 2 (25 points)**\n",
 57 |     "\n",
 58 |     "- Write a function `matrix_multiply` in C with the following function signature \n",
 59 |     "```\n",
 60 |     "void matrix_multiply(double** A, double **B, double **C, int m, int n, int p)\n",
 61 |     "```\n",
 62 |     "The function multiples an $m \\times n$ matrix $A$ with an $n \\times p$ matrix $B$ and gives the result in the matrix $C$ (10 points)\n",
 63 |     "- Write a function to pretty print a matrix to standard output, and use it to display $A$ and $B$. The output should look something like this (5 point):\n",
 64 |     "```\n",
 65 |     "[[3.0, 0.1, 5.0, 18.1],\n",
 66 |     "   [7.8, 7.9, 3.2, 1.0],\n",
 67 |     "   [6.1, 5.5, 8.9, 4.1]]\n",
 68 |     "```\n",
 69 |     "\n",
 70 |     "- Write a driver program to test it with the following matrices. Matrices should be generated using dynamic memory allocation, freeing up the memory when done (10 points)\n",
 71 |     "\n",
 72 |     "$$\n",
 73 |     "A = \\pmatrix{1 & 2 & 3\\\\4 & 5 & 6}, B = \\pmatrix{1 & 2 & 3 & 4\\\\5 & 6 & 7 & 8\\\\9 & 0 & 1 & 2}\n",
 74 |     "$$"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {
 81 |     "collapsed": true
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "\n",
 86 |     "\n",
 87 |     "\n"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "**Exercise 3 (25 points)**\n",
 95 |     "\n",
 96 |     "- Implement the secant method in 1D for root finding in C++. Pass in the function as a generalized function pointer. Use the method to find all roots of the polynomial equation $f(x) = x^3 - 7x - 6$ (20 points)\n",
 97 |     "- Write the roots to a text file that can be read in Python and plot the roots and polynomial using Python (5 points)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {
104 |     "collapsed": true
105 |    },
106 |    "outputs": [],
107 |    "source": [
108 |     "\n",
109 |     "\n",
110 |     "\n"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "**Exercise 4 (25 points)**\n",
118 |     "\n",
119 |     "You are given the following set of data\n",
120 |     "\n",
121 |     "$$\n",
122 |     "x = \\pmatrix{0 \\\\ 1 \\\\ 2 \\\\ 3 \\\\ 4 \\\\ 5 \\\\ 6 \\\\ 7 \\\\ 8 \\\\ 9}, \n",
123 |     "y = \\pmatrix{1.5 \\\\ 7.5 \\\\ 10.7 \\\\ 7.9 \\\\ -2.0 \\\\ -12.4 \\\\ -28.6 \\\\ -46.3 \\\\ -68.1 \\\\ -97.1}\n",
124 |     "$$\n",
125 |     "\n",
126 |     "- Write your own **gradient descent** optimization function in C++ to find the least squares solution for the coefficients $\\beta$ of a quadratic polynomial. You may use the `armadillo` library (20 points)\n",
127 |     "- Write the solution to a text file that can be read in Python and plot the least squares quadratic fit together with the data points using Python (5 points)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {
134 |     "collapsed": true
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "\n",
139 |     "\n",
140 |     "\n"
141 |    ]
142 |   }
143 |  ],
144 |  "metadata": {
145 |   "kernelspec": {
146 |    "display_name": "Python 3",
147 |    "language": "python",
148 |    "name": "python3"
149 |   },
150 |   "language_info": {
151 |    "codemirror_mode": {
152 |     "name": "ipython",
153 |     "version": 3
154 |    },
155 |    "file_extension": ".py",
156 |    "mimetype": "text/x-python",
157 |    "name": "python",
158 |    "nbconvert_exporter": "python",
159 |    "pygments_lexer": "ipython3",
160 |    "version": "3.5.1"
161 |   }
162 |  },
163 |  "nbformat": 4,
164 |  "nbformat_minor": 0
165 | }
166 | 


--------------------------------------------------------------------------------
/homework/milkmaid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/homework/milkmaid.png


--------------------------------------------------------------------------------
/homework/mystery.txt:
--------------------------------------------------------------------------------
 1 | ﻿Intensive blood-glucose control with sulphonylureas or insulin compared with
 2 |       conventional treatment and risk of complications in patients with type 2 diabetes
 3 | BACKGROUND: Improved blood-glucose control decreases the progression of diabetic 
 4 |       microvascular disease, but the effect on macrovascular complications is unknown. 
 5 |       There is concern that sulphonylureas may increase cardiovascular mortality in
 6 |       patients with type 2 diabetes and that high insulin concentrations may enhance
 7 |       atheroma formation. We compared the effects of intensive blood-glucose control
 8 |       with either sulphonylurea or insulin and conventional treatment on the risk of
 9 |       microvascular and macrovascular complications in patients with type 2 diabetes in
10 |       a randomised controlled trial. METHODS: 3867 newly diagnosed patients with type 2
11 |       diabetes, median age 54 years (IQR 48-60 years), who after 3 months' diet
12 |       treatment had a mean of two fasting plasma glucose (FPG) concentrations of
13 |       6.1-15.0 mmol/L were randomly assigned intensive policy with a sulphonylurea
14 |       (chlorpropamide, glibenclamide, or glipizide) or with insulin, or conventional
15 |       policy with diet. The aim in the intensive group was FPG less than 6 mmol/L. In
16 |       the conventional group, the aim was the best achievable FPG with diet alone;
17 |       drugs were added only if there were hyperglycaemic symptoms or FPG greater than
18 |       15 mmol/L. Three aggregate endpoints were used to assess differences between
19 |       conventional and intensive treatment: any diabetes-related endpoint (sudden
20 |       death, death from hyperglycaemia or hypoglycaemia, fatal or non-fatal myocardial 
21 |       infarction, angina, heart failure, stroke, renal failure, amputation [of at least
22 |       one digit], vitreous haemorrhage, retinopathy requiring photocoagulation,
23 |       blindness in one eye, or cataract extraction); diabetes-related death (death from
24 |       myocardial infarction, stroke, peripheral vascular disease, renal disease,
25 |       hyperglycaemia or hypoglycaemia, and sudden death); all-cause mortality. Single
26 |       clinical endpoints and surrogate subclinical endpoints were also assessed. All
27 |       analyses were by intention to treat and frequency of hypoglycaemia was also
28 |       analysed by actual therapy. FINDINGS: Over 10 years, haemoglobin A1c (HbA1c) was 
29 |       7.0% (6.2-8.2) in the intensive group compared with 7.9% (6.9-8.8) in the
30 |       conventional group--an 11% reduction. There was no difference in HbA1c among
31 |       agents in the intensive group. Compared with the conventional group, the risk in 
32 |       the intensive group was 12% lower (95% CI 1-21, p=0.029) for any diabetes-related
33 |       endpoint; 10% lower (-11 to 27, p=0.34) for any diabetes-related death; and 6%
34 |       lower (-10 to 20, p=0.44) for all-cause mortality. Most of the risk reduction in 
35 |       the any diabetes-related aggregate endpoint was due to a 25% risk reduction
36 |       (7-40, p=0.0099) in microvascular endpoints, including the need for retinal
37 |       photocoagulation. There was no difference for any of the three aggregate
38 |       endpoints between the three intensive agents (chlorpropamide, glibenclamide, or
39 |       insulin). Patients in the intensive group had more hypoglycaemic episodes than
40 |       those in the conventional group on both types of analysis (both p<0.0001). The
41 |       rates of major hypoglycaemic episodes per year were 0.7% with conventional
42 |       treatment, 1.0% with chlorpropamide, 1.4% with glibenclamide, and 1.8% with
43 |       insulin. Weight gain was significantly higher in the intensive group (mean 2.9
44 |       kg) than in the conventional group (p<0.001), and patients assigned insulin had a
45 |       greater gain in weight (4.0 kg) than those assigned chlorpropamide (2.6 kg) or
46 |       glibenclamide (1.7 kg). INTERPRETATION: Intensive blood-glucose control by either
47 |       sulphonylureas or insulin substantially decreases the risk of microvascular
48 |       complications, but not macrovascular disease, in patients with type 2
49 |       diabetes.(ABSTRACT TRUNCATED)
50 | 


--------------------------------------------------------------------------------
/homework/x1d.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/homework/x1d.npy


--------------------------------------------------------------------------------
/homework/x2d.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/homework/x2d.npy


--------------------------------------------------------------------------------
/images/Scraping data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/images/Scraping data.png


--------------------------------------------------------------------------------
/images/hw2_q4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/images/hw2_q4.png


--------------------------------------------------------------------------------
/lectures/02D_Classes.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Classes\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "As you probably know, Python is an object-oriented language, and so has very strong support for objects. In fact, everything in Python is an object. We will mostly use an imperative or functional rather than object-oriented programming style in this course. \n",
 11 |     "\n",
 12 |     "Here is the bare minimum about Python objects. "
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "markdown",
 17 |    "metadata": {},
 18 |    "source": [
 19 |     "Defining a new class\n",
 20 |     "----\n",
 21 |     "\n",
 22 |     "We define a class A with 2 'special' double underscore methods and one normal method. This class will have an attribute `x` that is specified at the time of creating new instances of the class.\n",
 23 |     "\n",
 24 |     "- The __init__ method initializes properties of any new instance of A\n",
 25 |     "- The __repr__ method provides an accurate string representation of A. For example, if we print an instance of A, the __repr__ method will be used. If you don't specify a __repr__ (or __str__) special method, the default name when printing only gives the address in memory.\n",
 26 |     "\n",
 27 |     "There are many more special method, as described in the [official documentation](https://docs.python.org/3.5/reference/datamodel.html). We will not go there."
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 62,
 33 |    "metadata": {
 34 |     "collapsed": true
 35 |    },
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "class A:\n",
 39 |     "    \"\"\"Base class.\"\"\"\n",
 40 |     "\n",
 41 |     "    def __init__(self, x):\n",
 42 |     "        self.x = x\n",
 43 |     "\n",
 44 |     "    def __repr__(self):\n",
 45 |     "        return '%s(%a)' % (self.__class__.__name__, self.x)\n",
 46 |     "\n",
 47 |     "    def report(self):\n",
 48 |     "        \"\"\"Report type of contained value.\"\"\"\n",
 49 |     "\n",
 50 |     "        return 'My value is of type %s' % type(self.x)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "Docstrings\n",
 58 |     "----"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 72,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/plain": [
 71 |        "'Base class.'"
 72 |       ]
 73 |      },
 74 |      "execution_count": 72,
 75 |      "metadata": {},
 76 |      "output_type": "execute_result"
 77 |     }
 78 |    ],
 79 |    "source": [
 80 |     "A.__doc__"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 74,
 86 |    "metadata": {
 87 |     "collapsed": false
 88 |    },
 89 |    "outputs": [
 90 |     {
 91 |      "name": "stdout",
 92 |      "output_type": "stream",
 93 |      "text": [
 94 |       "Help on class A in module __main__:\n",
 95 |       "\n",
 96 |       "class A(builtins.object)\n",
 97 |       " |  Base class.\n",
 98 |       " |  \n",
 99 |       " |  Methods defined here:\n",
100 |       " |  \n",
101 |       " |  __init__(self, x)\n",
102 |       " |      Initialize self.  See help(type(self)) for accurate signature.\n",
103 |       " |  \n",
104 |       " |  __repr__(self)\n",
105 |       " |      Return repr(self).\n",
106 |       " |  \n",
107 |       " |  report(self)\n",
108 |       " |      Report type of contained value.\n",
109 |       " |  \n",
110 |       " |  ----------------------------------------------------------------------\n",
111 |       " |  Data descriptors defined here:\n",
112 |       " |  \n",
113 |       " |  __dict__\n",
114 |       " |      dictionary for instance variables (if defined)\n",
115 |       " |  \n",
116 |       " |  __weakref__\n",
117 |       " |      list of weak references to the object (if defined)\n",
118 |       "\n"
119 |      ]
120 |     }
121 |    ],
122 |    "source": [
123 |     "help(A)"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 73,
129 |    "metadata": {
130 |     "collapsed": false
131 |    },
132 |    "outputs": [
133 |     {
134 |      "data": {
135 |       "text/plain": [
136 |        "'Report type of contained value.'"
137 |       ]
138 |      },
139 |      "execution_count": 73,
140 |      "metadata": {},
141 |      "output_type": "execute_result"
142 |     }
143 |    ],
144 |    "source": [
145 |     "A.report.__doc__"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "Making instance of a class\n",
153 |     "----"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "markdown",
158 |    "metadata": {},
159 |    "source": [
160 |     "#### Example of a class without __repr__."
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 64,
166 |    "metadata": {
167 |     "collapsed": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "class X:\n",
172 |     "    \"\"\"Empty class.\"\"\""
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 65,
178 |    "metadata": {
179 |     "collapsed": false
180 |    },
181 |    "outputs": [
182 |     {
183 |      "name": "stdout",
184 |      "output_type": "stream",
185 |      "text": [
186 |       "<__main__.X object at 0x1115eda20>\n"
187 |      ]
188 |     }
189 |    ],
190 |    "source": [
191 |     "x = X()\n",
192 |     "print(x)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "#### Make new instances of the class A"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 59,
205 |    "metadata": {
206 |     "collapsed": false
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "a0 = A('a')"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": 67,
216 |    "metadata": {
217 |     "collapsed": false
218 |    },
219 |    "outputs": [
220 |     {
221 |      "name": "stdout",
222 |      "output_type": "stream",
223 |      "text": [
224 |       "A('a')\n"
225 |      ]
226 |     }
227 |    ],
228 |    "source": [
229 |     "print(a0)"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 60,
235 |    "metadata": {
236 |     "collapsed": false
237 |    },
238 |    "outputs": [],
239 |    "source": [
240 |     "a1 = A(x = 3.14)"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 68,
246 |    "metadata": {
247 |     "collapsed": false
248 |    },
249 |    "outputs": [
250 |     {
251 |      "name": "stdout",
252 |      "output_type": "stream",
253 |      "text": [
254 |       "A(3.14)\n"
255 |      ]
256 |     }
257 |    ],
258 |    "source": [
259 |     "print(a1)"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {},
265 |    "source": [
266 |     "#### Attribute access"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 51,
272 |    "metadata": {
273 |     "collapsed": false
274 |    },
275 |    "outputs": [
276 |     {
277 |      "data": {
278 |       "text/plain": [
279 |        "('a', 3.14)"
280 |       ]
281 |      },
282 |      "execution_count": 51,
283 |      "metadata": {},
284 |      "output_type": "execute_result"
285 |     }
286 |    ],
287 |    "source": [
288 |     "a0.x, a1.x"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "markdown",
293 |    "metadata": {},
294 |    "source": [
295 |     "#### Method access"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "code",
300 |    "execution_count": 52,
301 |    "metadata": {
302 |     "collapsed": false
303 |    },
304 |    "outputs": [
305 |     {
306 |      "data": {
307 |       "text/plain": [
308 |        "(\"My value is of type <class 'str'>\", \"My value is of type <class 'float'>\")"
309 |       ]
310 |      },
311 |      "execution_count": 52,
312 |      "metadata": {},
313 |      "output_type": "execute_result"
314 |     }
315 |    ],
316 |    "source": [
317 |     "a0.report(), a1.report()"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "markdown",
322 |    "metadata": {},
323 |    "source": [
324 |     "Class inheritance\n",
325 |     "----"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 63,
331 |    "metadata": {
332 |     "collapsed": true
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "class B(A):\n",
337 |     "    \"\"\"Derived class inherits from A.\"\"\"\n",
338 |     "\n",
339 |     "    def report(self):\n",
340 |     "        \"\"\"Overwrite report() method of A.\"\"\"\n",
341 |     "        return self.x"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 71,
347 |    "metadata": {
348 |     "collapsed": false
349 |    },
350 |    "outputs": [
351 |     {
352 |      "data": {
353 |       "text/plain": [
354 |        "'Derived class inherits from A.'"
355 |       ]
356 |      },
357 |      "execution_count": 71,
358 |      "metadata": {},
359 |      "output_type": "execute_result"
360 |     }
361 |    ],
362 |    "source": [
363 |     "B.__doc__"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "markdown",
368 |    "metadata": {},
369 |    "source": [
370 |     "#### Make new instances of class B"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": 54,
376 |    "metadata": {
377 |     "collapsed": true
378 |    },
379 |    "outputs": [],
380 |    "source": [
381 |     "b0 = B(3 + 4j)"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": 55,
387 |    "metadata": {
388 |     "collapsed": true
389 |    },
390 |    "outputs": [],
391 |    "source": [
392 |     "b1 = B(x = a1)"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "markdown",
397 |    "metadata": {},
398 |    "source": [
399 |     "#### Attribute access"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": 56,
405 |    "metadata": {
406 |     "collapsed": false
407 |    },
408 |    "outputs": [
409 |     {
410 |      "data": {
411 |       "text/plain": [
412 |        "(3+4j)"
413 |       ]
414 |      },
415 |      "execution_count": 56,
416 |      "metadata": {},
417 |      "output_type": "execute_result"
418 |     }
419 |    ],
420 |    "source": [
421 |     "b0.x"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 57,
427 |    "metadata": {
428 |     "collapsed": false
429 |    },
430 |    "outputs": [
431 |     {
432 |      "data": {
433 |       "text/plain": [
434 |        "A(3.14)"
435 |       ]
436 |      },
437 |      "execution_count": 57,
438 |      "metadata": {},
439 |      "output_type": "execute_result"
440 |     }
441 |    ],
442 |    "source": [
443 |     "b1.x"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "markdown",
448 |    "metadata": {},
449 |    "source": [
450 |     "#### Nested attribute access"
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "code",
455 |    "execution_count": 58,
456 |    "metadata": {
457 |     "collapsed": false
458 |    },
459 |    "outputs": [
460 |     {
461 |      "data": {
462 |       "text/plain": [
463 |        "\"My value is of type <class 'float'>\""
464 |       ]
465 |      },
466 |      "execution_count": 58,
467 |      "metadata": {},
468 |      "output_type": "execute_result"
469 |     }
470 |    ],
471 |    "source": [
472 |     "b1.x.report()"
473 |    ]
474 |   }
475 |  ],
476 |  "metadata": {
477 |   "kernelspec": {
478 |    "display_name": "Python 3",
479 |    "language": "python",
480 |    "name": "python3"
481 |   },
482 |   "language_info": {
483 |    "codemirror_mode": {
484 |     "name": "ipython",
485 |     "version": 3
486 |    },
487 |    "file_extension": ".py",
488 |    "mimetype": "text/x-python",
489 |    "name": "python",
490 |    "nbconvert_exporter": "python",
491 |    "pygments_lexer": "ipython3",
492 |    "version": "3.5.1"
493 |   }
494 |  },
495 |  "nbformat": 4,
496 |  "nbformat_minor": 0
497 | }
498 | 


--------------------------------------------------------------------------------
/lectures/19A_Parallel_Programming.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Parallel Programming\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "The goal is to design parallel programs that are flexible, efficient and simple.\n",
 11 |     "\n",
 12 |     "**Step 0**: Start by profiling a serial program to identify bottlenecks\n",
 13 |     "\n",
 14 |     "**Step 1**: Are there for opportunities for parallelism?\n",
 15 |     "\n",
 16 |     "- Can tasks be performed in parallel?\n",
 17 |     "    - Function calls\n",
 18 |     "    - Loops\n",
 19 |     "- Can data be split and operated on in parallel?\n",
 20 |     "    - Decomposition of arrays along rows, columns, blocks\n",
 21 |     "    - Decomposition of trees into sub-trees\n",
 22 |     "- Is there a pipeline with a sequence of stages?\n",
 23 |     "    - Data preprocessing and analysis\n",
 24 |     "    - Graphics rendering\n",
 25 |     "\n",
 26 |     "**Step 2**: What is the nature of the parallelism?\n",
 27 |     "\n",
 28 |     "- Linear\n",
 29 |     "    - Embarrassingly parallel programs\n",
 30 |     "- Recursive\n",
 31 |     "    - Adaptive partitioning methods\n",
 32 |     "\n",
 33 |     "**Step 3**: What is the granularity?\n",
 34 |     "\n",
 35 |     "- 10s of jobs\n",
 36 |     "- 1000s of jobs\n",
 37 |     "\n",
 38 |     "**Step 4**: Choose an algorithm\n",
 39 |     "\n",
 40 |     "- Organize by tasks\n",
 41 |     "    - Task parallelism\n",
 42 |     "    - Divide and conquer\n",
 43 |     "\n",
 44 |     "- Organize by data\n",
 45 |     "    - Geometric decomposition\n",
 46 |     "    - Recursive decomposition\n",
 47 |     "\n",
 48 |     "- Organize by flow\n",
 49 |     "    - Pipeline\n",
 50 |     "    - Event-based processing\n",
 51 |     "\n",
 52 |     "**Step 5**: Map to program and data structures\n",
 53 |     "\n",
 54 |     "- Program structures\n",
 55 |     "    - Single program multiple data (SPMD)\n",
 56 |     "    - Master/worker\n",
 57 |     "    - Loop parallelism\n",
 58 |     "    - Fork/join\n",
 59 |     "- Data structures \n",
 60 |     "    - Shared data\n",
 61 |     "    - Shared queue\n",
 62 |     "    - Distributed array\n",
 63 |     "\n",
 64 |     "**Step 6**: Map to parallel environment\n",
 65 |     "\n",
 66 |     "- Multi-core shared memory\n",
 67 |     "    - Cython with OpenMP\n",
 68 |     "    - multiprocessing\n",
 69 |     "    - IPython.cluster\n",
 70 |     "- Multi-computer\n",
 71 |     "    - IPython.cluster\n",
 72 |     "    - MPI\n",
 73 |     "    - Hadoop / Spark\n",
 74 |     "- GPU\n",
 75 |     "    - CUDA\n",
 76 |     "    - OpenCL\n",
 77 |     "\n",
 78 |     "**Step 7**: Execute, debug, tune in parallel environment"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "Embarrassingly parallel programs\n",
 86 |     "----\n",
 87 |     "\n",
 88 |     "Many statistical problems are embarrassingly parallel and can be easily decomposed into independent tasks or data sets. Here are several examples:\n",
 89 |     "\n",
 90 |     "- Monte Carlo integration\n",
 91 |     "- Multiple chains of MCMC\n",
 92 |     "- Bootstrap for confidence intervals\n",
 93 |     "- Power calculations by simulation\n",
 94 |     "- Permutation-resampling tests \n",
 95 |     "- Fitting same model on multiple data sets\n",
 96 |     "\n",
 97 |     "Other problems are serial at small scale, but can be parallelized at large scales. For example, EM and MCMC iterations are inherently serial since there is a dependence on the previous state, but within a single iteration, there can be many thousands of density calculations (one for each data point to calculate the likelihood), and this is an embarrassingly parallel problem within a single iteration. \n",
 98 |     "\n",
 99 |     "These \"low hanging fruits\" are great because they offer a path to easy parallelism with minimal complexity."
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "markdown",
104 |    "metadata": {},
105 |    "source": [
106 |     "Executing parallel code\n",
107 |     "----\n",
108 |     "\n",
109 |     "**The bigger the problem, the more scope there is for parallelism**\n",
110 |     "\n",
111 |     "**Amhdahls' law** says that the speedup from parallelization is bounded by the ratio of parallelizable to irreducibly serial code in the algorithm. However, for big data analysis, **Gustafson's Law** is more relevant. This says that we are nearly always interested in increasing the size of the parallelizable bits, and the ratio of parallelizable to irreducibly serial code is not a static quantity but depends on data size. For example, Gibbs sampling has an irreducibly serial nature, but for large samples, each iteration may be able perform PDF evaluations in parallel for zillions of data points."
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "Coming highlights\n",
119 |     "-----\n",
120 |     "\n",
121 |     "- Parallelism in pre-built packages \n",
122 |     "    -`sklearn`\n",
123 |     "    - `pymc3`\n",
124 |     "    - `pystan`\n",
125 |     "- Parallelism when compiling to native code\n",
126 |     "    - Using  `target=paraallel` in `numba.vectorize` and `numb.guvectorize`\n",
127 |     "    - Using `openmp` with `cython.parallel`, `cython.prange` and `cython.nogil`\n",
128 |     "- Parallelism for multi-core computers\n",
129 |     "    - Using `concurrent.futures`\n",
130 |     "    - Using `multiprocessing`\n",
131 |     "    - Using `ipyparallel` within Jupyter\n",
132 |     "- Data too big for memory but not for disk\n",
133 |     "    - `memmap`\n",
134 |     "    - `HDF5` and `h5py`\n",
135 |     "    - Using `dask`\n",
136 |     "    - Usign `blaze`\n",
137 |     "- Data too big for one computer\n",
138 |     "    - Distributed storage\n",
139 |     "    - Data sketches\n",
140 |     "    - Using `pyspark`"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {
147 |     "collapsed": true
148 |    },
149 |    "outputs": [],
150 |    "source": []
151 |   }
152 |  ],
153 |  "metadata": {
154 |   "kernelspec": {
155 |    "display_name": "Python 3",
156 |    "language": "python",
157 |    "name": "python3"
158 |   },
159 |   "language_info": {
160 |    "codemirror_mode": {
161 |     "name": "ipython",
162 |     "version": 3
163 |    },
164 |    "file_extension": ".py",
165 |    "mimetype": "text/x-python",
166 |    "name": "python",
167 |    "nbconvert_exporter": "python",
168 |    "pygments_lexer": "ipython3",
169 |    "version": "3.5.1"
170 |   }
171 |  },
172 |  "nbformat": 4,
173 |  "nbformat_minor": 0
174 | }
175 | 


--------------------------------------------------------------------------------
/lectures/21E_Spark_And_Sklearn.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": true
 8 |    },
 9 |    "outputs": [],
10 |    "source": []
11 |   }
12 |  ],
13 |  "metadata": {
14 |   "kernelspec": {
15 |    "display_name": "Python 3",
16 |    "language": "python",
17 |    "name": "python3"
18 |   },
19 |   "language_info": {
20 |    "codemirror_mode": {
21 |     "name": "ipython",
22 |     "version": 3
23 |    },
24 |    "file_extension": ".py",
25 |    "mimetype": "text/x-python",
26 |    "name": "python",
27 |    "nbconvert_exporter": "python",
28 |    "pygments_lexer": "ipython3",
29 |    "version": "3.5.1"
30 |   }
31 |  },
32 |  "nbformat": 4,
33 |  "nbformat_minor": 0
34 | }
35 | 


--------------------------------------------------------------------------------
/lectures/21F_Spark_GraphX.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": true
 8 |    },
 9 |    "outputs": [],
10 |    "source": []
11 |   }
12 |  ],
13 |  "metadata": {
14 |   "kernelspec": {
15 |    "display_name": "Python 3",
16 |    "language": "python",
17 |    "name": "python3"
18 |   },
19 |   "language_info": {
20 |    "codemirror_mode": {
21 |     "name": "ipython",
22 |     "version": 3
23 |    },
24 |    "file_extension": ".py",
25 |    "mimetype": "text/x-python",
26 |    "name": "python",
27 |    "nbconvert_exporter": "python",
28 |    "pygments_lexer": "ipython3",
29 |    "version": "3.5.1"
30 |   }
31 |  },
32 |  "nbformat": 4,
33 |  "nbformat_minor": 0
34 | }
35 | 


--------------------------------------------------------------------------------
/lectures/21G_Spark_Streaming,ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Spark Streaming\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "The Spark Streaming library takes a stream of data and breaks it up into micro-batches that are then processed, giving the illusion of a continually updated stream of results."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "Resources\n",
 18 |     "----\n",
 19 |     "\n",
 20 |     "[Spark Streaming Programming Guide](http://spark.apache.org/docs/latest/streaming-programming-guide.html)"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "Streaming using sockets\n",
 28 |     "----\n",
 29 |     "\n",
 30 |     "We will first illustrate the idea of streaming data over TCP/IP with the Python standard library `socket` module. The consumer and producer should be run in separate terminals\n",
 31 |     "\n",
 32 |     "Terminal 1\n",
 33 |     "```bash\n",
 34 |     "python consumer.py localhost 10000\n",
 35 |     "```\n",
 36 |     "\n",
 37 |     "Terminal 2\n",
 38 |     "```bash\n",
 39 |     "python producer.py localhost 10000\n",
 40 |     "```"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "### Consumer keeps a running word count"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 1,
 53 |    "metadata": {
 54 |     "collapsed": false
 55 |    },
 56 |    "outputs": [
 57 |     {
 58 |      "name": "stdout",
 59 |      "output_type": "stream",
 60 |      "text": [
 61 |       "Overwriting consumer.py\n"
 62 |      ]
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "%%file consumer.py\n",
 67 |     "import sys\n",
 68 |     "import socket\n",
 69 |     "from collections import Counter\n",
 70 |     "\n",
 71 |     "HOST = sys.argv[1]\n",
 72 |     "PORT = int(sys.argv[2])\n",
 73 |     "\n",
 74 |     "s = socket.socket()\n",
 75 |     "s.bind((HOST, PORT))\n",
 76 |     "s.listen(4)\n",
 77 |     "connection, address = s.accept()\n",
 78 |     "\n",
 79 |     "c = Counter()\n",
 80 |     "\n",
 81 |     "while True:\n",
 82 |     "    line = connection.recv(64)\n",
 83 |     "    words = line.split()\n",
 84 |     "    if words:\n",
 85 |     "        c.update(words)\n",
 86 |     "        print(c.most_common(5))"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "### Producer sends data to server for processing"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 2,
 99 |    "metadata": {
100 |     "collapsed": false
101 |    },
102 |    "outputs": [
103 |     {
104 |      "name": "stdout",
105 |      "output_type": "stream",
106 |      "text": [
107 |       "Overwriting client.py\n"
108 |      ]
109 |     }
110 |    ],
111 |    "source": [
112 |     "%%file client.py\n",
113 |     "import socket\n",
114 |     "import time\n",
115 |     "import sys\n",
116 |     "\n",
117 |     "HOST = sys.argv[1]\n",
118 |     "PORT = int(sys.argv[2])\n",
119 |     "s = socket.socket()\n",
120 |     "s.connect((HOST, PORT))\n",
121 |     "while True:\n",
122 |     "    for line in open('data/Ulysses.txt'):\n",
123 |     "        s.sendall(str.encode(line))\n",
124 |     "        time.sleep(1)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "Using Spark Streaming\n",
132 |     "----\n",
133 |     "\n",
134 |     "Now we'll replace the consumer with a Spark application. This will work with micro-batches lasting 2 seconds."
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 1,
140 |    "metadata": {
141 |     "collapsed": false
142 |    },
143 |    "outputs": [],
144 |    "source": [
145 |     "from pyspark import SparkContext\n",
146 |     "\n",
147 |     "sc = SparkContext('local[*]')"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 5,
153 |    "metadata": {
154 |     "collapsed": false
155 |    },
156 |    "outputs": [
157 |     {
158 |      "data": {
159 |       "text/plain": [
160 |        "[('the', 13600), ('of', 8127), ('and', 6542), ('a', 5842), ('to', 4787)]"
161 |       ]
162 |      },
163 |      "execution_count": 5,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "lines = sc.textFile('data/Ulysses.txt')\n",
170 |     "\n",
171 |     "counts = (lines.flatMap(lambda line: line.split())\n",
172 |     "          .map(lambda word: (word, 1))\n",
173 |     "          .reduceByKey(lambda x,y: x+ y))\n",
174 |     "\n",
175 |     "counts.takeOrdered(5, key=lambda x: -x[1])"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "### Monitor a directory for new or renamed files"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 3,
188 |    "metadata": {
189 |     "collapsed": false
190 |    },
191 |    "outputs": [
192 |     {
193 |      "name": "stdout",
194 |      "output_type": "stream",
195 |      "text": [
196 |       "Writing file_consumer.py\n"
197 |      ]
198 |     }
199 |    ],
200 |    "source": [
201 |     "%%file file_consumer.py\n",
202 |     "\n",
203 |     "import sys\n",
204 |     "from pyspark import SparkContext\n",
205 |     "from pyspark.streaming import StreamingContext\n",
206 |     "\n",
207 |     "sc = SparkContext('local[*]')\n",
208 |     "sc.setLogLevel(\"WARN\")\n",
209 |     "\n",
210 |     "ssc = StreamingContext(sc, 2)\n",
211 |     "lines = ssc.textFileStream(sys.argv[1])\n",
212 |     "\n",
213 |     "counts = (lines.flatMap(lambda line: line.split())\n",
214 |     "          .map(lambda word: (word, 1))\n",
215 |     "          .reduceByKey(lambda x,y: x+ y))\n",
216 |     "\n",
217 |     "counts.pprint()\n",
218 |     "\n",
219 |     "ssc.start()\n",
220 |     "ssc.awaitTermination()"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "### Usage\n",
228 |     "\n",
229 |     "Run in terminal\n",
230 |     "```bash\n",
231 |     "~/anaconda3/share/spark-1.6.0/bin/spark-submit file_consumer.py <folder>\n",
232 |     "```\n",
233 |     "\n",
234 |     "When you copy, move or save a file to `<folder>`, the word counts for that file will be updated .. "
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "markdown",
239 |    "metadata": {},
240 |    "source": [
241 |     "### Monitor a TCP/IP socket"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 4,
247 |    "metadata": {
248 |     "collapsed": false
249 |    },
250 |    "outputs": [
251 |     {
252 |      "name": "stdout",
253 |      "output_type": "stream",
254 |      "text": [
255 |       "Overwriting socket_consumer.py\n"
256 |      ]
257 |     }
258 |    ],
259 |    "source": [
260 |     "%%file socket_consumer.py\n",
261 |     "\n",
262 |     "import sys\n",
263 |     "from pyspark import SparkContext\n",
264 |     "from pyspark.streaming import StreamingContext\n",
265 |     "\n",
266 |     "sc = SparkContext('local[*]')\n",
267 |     "sc.setLogLevel(\"WARN\")\n",
268 |     "\n",
269 |     "ssc = StreamingContext(sc, 2)\n",
270 |     "lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))\n",
271 |     "\n",
272 |     "counts = (lines.flatMap(lambda line: line.split())\n",
273 |     "          .map(lambda word: (word, 1))\n",
274 |     "          .reduceByKey(lambda x,y: x+ y))\n",
275 |     "\n",
276 |     "counts.pprint()\n",
277 |     "\n",
278 |     "ssc.start()\n",
279 |     "ssc.awaitTermination()"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "### Usage\n",
287 |     "\n",
288 |     "Run in terminal\n",
289 |     "```bash\n",
290 |     "~/anaconda3/share/spark-1.6.0/bin/spark-submit socket_consumer.py localhost 10000\n",
291 |     "```\n",
292 |     "\n",
293 |     "In a different terminal\n",
294 |     "```\n",
295 |     "nc -lk 10000\n",
296 |     "```\n",
297 |     "\n",
298 |     "Any text pasted in the `nc` terminal will have its words counted."
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "markdown",
303 |    "metadata": {
304 |     "collapsed": true
305 |    },
306 |    "source": [
307 |     "### Keeping state"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": 6,
313 |    "metadata": {
314 |     "collapsed": false
315 |    },
316 |    "outputs": [
317 |     {
318 |      "name": "stdout",
319 |      "output_type": "stream",
320 |      "text": [
321 |       "Overwriting stateful_socket_consumer.py\n"
322 |      ]
323 |     }
324 |    ],
325 |    "source": [
326 |     "%%file stateful_socket_consumer.py\n",
327 |     "\n",
328 |     "import sys\n",
329 |     "from pyspark import SparkContext\n",
330 |     "from pyspark.streaming import StreamingContext\n",
331 |     "\n",
332 |     "def updateFunc(new, last):\n",
333 |     "    if last is None:\n",
334 |     "        last = 0\n",
335 |     "    return sum(new) + last\n",
336 |     "\n",
337 |     "sc = SparkContext('local[*]')\n",
338 |     "sc.setLogLevel(\"WARN\")\n",
339 |     "\n",
340 |     "ssc = StreamingContext(sc, 2)\n",
341 |     "ssc.checkpoint(\"checkpoint\")\n",
342 |     "\n",
343 |     "lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))\n",
344 |     "\n",
345 |     "counts = (lines.flatMap(lambda line: line.split())\n",
346 |     "          .map(lambda word: (word, 1))\n",
347 |     "          .updateStateByKey(updateFunc)\n",
348 |     "          .transform(lambda x: x.sortByKey()))\n",
349 |     "\n",
350 |     "counts.pprint()\n",
351 |     "\n",
352 |     "ssc.start()\n",
353 |     "ssc.awaitTermination()"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "markdown",
358 |    "metadata": {
359 |     "collapsed": true
360 |    },
361 |    "source": [
362 |     "### Usage\n",
363 |     "\n",
364 |     "Same as above, but the Spark program will now maintain an updated running count."
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": null,
370 |    "metadata": {
371 |     "collapsed": true
372 |    },
373 |    "outputs": [],
374 |    "source": []
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": null,
379 |    "metadata": {
380 |     "collapsed": true
381 |    },
382 |    "outputs": [],
383 |    "source": []
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": null,
388 |    "metadata": {
389 |     "collapsed": true
390 |    },
391 |    "outputs": [],
392 |    "source": []
393 |   },
394 |   {
395 |    "cell_type": "code",
396 |    "execution_count": null,
397 |    "metadata": {
398 |     "collapsed": true
399 |    },
400 |    "outputs": [],
401 |    "source": []
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": null,
406 |    "metadata": {
407 |     "collapsed": true
408 |    },
409 |    "outputs": [],
410 |    "source": []
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": null,
415 |    "metadata": {
416 |     "collapsed": true
417 |    },
418 |    "outputs": [],
419 |    "source": []
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": null,
424 |    "metadata": {
425 |     "collapsed": true
426 |    },
427 |    "outputs": [],
428 |    "source": []
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": null,
433 |    "metadata": {
434 |     "collapsed": true
435 |    },
436 |    "outputs": [],
437 |    "source": []
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": null,
442 |    "metadata": {
443 |     "collapsed": true
444 |    },
445 |    "outputs": [],
446 |    "source": []
447 |   }
448 |  ],
449 |  "metadata": {
450 |   "kernelspec": {
451 |    "display_name": "Python 3",
452 |    "language": "python",
453 |    "name": "python3"
454 |   },
455 |   "language_info": {
456 |    "codemirror_mode": {
457 |     "name": "ipython",
458 |     "version": 3
459 |    },
460 |    "file_extension": ".py",
461 |    "mimetype": "text/x-python",
462 |    "name": "python",
463 |    "nbconvert_exporter": "python",
464 |    "pygments_lexer": "ipython3",
465 |    "version": "3.5.1"
466 |   }
467 |  },
468 |  "nbformat": 4,
469 |  "nbformat_minor": 0
470 | }
471 | 


--------------------------------------------------------------------------------
/lectures/21H_Spark_Cloud.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Spark on Cloud\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "How to set up and run Spark on Azure or AWS EC2 clusters."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "Azure\n",
 18 |     "----\n",
 19 |     "\n",
 20 |     "Follow [instructions provided by Microsoft](https://azure.microsoft.com/en-us/documentation/articles/hdinsight-apache-spark-jupyter-spark-sql/).\n",
 21 |     "\n",
 22 |     "To terminate the cluster, you have to **delete** it."
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "metadata": {},
 28 |    "source": [
 29 |     "AWS\n",
 30 |     "----\n",
 31 |     "\n",
 32 |     "AWS setup is more involved. We will show how to access `pyspark` via ssh to an `EMR` cluster, as well as how to set up the `Zeppelin` browser-based notebook (similar to Jupyter).\n",
 33 |     "\n",
 34 |     "**References**\n",
 35 |     "\n",
 36 |     "- [EMR Spark](http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-spark.html)\n",
 37 |     "- [AWS tutorial](http://docs.aws.amazon.com/ElasticMapReduce/latest/ManagementGuide/emr-gs.html)"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "Know your AWS public and private [access keys](http://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSGettingStartedGuide/AWSCredentials.html)\n",
 45 |     "----\n",
 46 |     "\n",
 47 |     "These will look something like\n",
 48 |     "\n",
 49 |     "- public: `AKIAIOSFODNN7EXAMPLE`\n",
 50 |     "- private: `wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY`"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "Know your AWS EC2 [key-pair](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html)\n",
 58 |     "----\n",
 59 |     "\n",
 60 |     "This is a name that you give - mine is `cliburn-2016` and an associated PEM file - I keep mine at ~/AWS/cliburn-2016.pem.\n",
 61 |     "\n",
 62 |     "Set the correct permissions on the PEM file.\n",
 63 |     "```\n",
 64 |     "chmod 400 xxx.pem\n",
 65 |     "```"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {
 71 |     "collapsed": true
 72 |    },
 73 |    "source": [
 74 |     "Install AWS command line client\n",
 75 |     "----\n",
 76 |     "\n",
 77 |     "```\n",
 78 |     "pip install awscli\n",
 79 |     "```\n",
 80 |     "\n",
 81 |     "If you run into problems, see [docs](http://docs.aws.amazon.com/cli/latest/userguide/installing.html)"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "Configure the AWS command line client\n",
 89 |     "----\n",
 90 |     "\n",
 91 |     "```\n",
 92 |     "aws configure\n",
 93 |     "```\n",
 94 |     "\n",
 95 |     "```\n",
 96 |     "AWS Access Key ID: <<Your public access key>>\n",
 97 |     "AWS Secret Access Key: <<Your private access key>>\n",
 98 |     "Default region name: us-east-1\n",
 99 |     "Default output format: json\n",
100 |     "```"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "metadata": {},
106 |    "source": [
107 |     "Create a cluster\n",
108 |     "----\n",
109 |     "\n",
110 |     "**Warning**: You will be charged for this.\n",
111 |     "\n",
112 |     "```\n",
113 |     "aws emr create-cluster --name \"<<NAME-FOR-CLUSTER>>\" --release-label  emr-4.5.0 --applications Name=Spark Name=Zeppelin-Sandbox  --ec2-attributes KeyName=<<Your key-pair>>> --instance-type m3.xlarge --instance-count 3 --use-default-roles\n",
114 |     "```\n",
115 |     "\n",
116 |     "For example, I start mine with\n",
117 |     "```\n",
118 |     "aws emr create-cluster --name \"spak-2016-d\" --release-label    emr-4.5.0 --applications Name=Spark Name=Zeppelin-Sandbox  --ec2-attributes KeyName=\"cliburn-2016\"  --instance-type m3.xlarge --instance-count 3 --use-default-role\n",
119 |     "```\n",
120 |     "\n",
121 |     "A cluster-id should be returned\n",
122 |     "```\n",
123 |     "{\n",
124 |     "    \"ClusterId\": \"j-XXXXXXXXXXXXXXX\"\n",
125 |     "}\n",
126 |     "```"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "Get information about the cluster\n",
134 |     "-----\n",
135 |     "\n",
136 |     "```\n",
137 |     "aws emr describe-cluster --cluster-id -XXXXXXXXXXXXXXX\n",
138 |     "```\n",
139 |     "\n",
140 |     "or just inspect the state\n",
141 |     "```\n",
142 |     "aws emr describe-cluster --cluster-id -XXXXXXXXXXXXXXX | grep \\\"State\\\"\n",
143 |     "```"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "Connect to the cluster via `ssh`\n",
151 |     "----\n",
152 |     "\n",
153 |     "```\n",
154 |     "aws emr ssh --cluster-id -XXXXXXXXXXXXXXX --key-pair-file cliburn-2016.pem \n",
155 |     "```"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {},
161 |    "source": [
162 |     "Note the IP address that is returned\n",
163 |     "----\n",
164 |     "\n",
165 |     "It will be something like `ec2-XX-X-XX-XXX.compute-1.amazonaws.com`"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "markdown",
170 |    "metadata": {},
171 |    "source": [
172 |     "Run `pyspark`\n",
173 |     "----\n",
174 |     "\n",
175 |     "Run\n",
176 |     "```\n",
177 |     "pyspark\n",
178 |     "```\n",
179 |     "\n",
180 |     "And you will be in a `pyspark` console where you can issue Spark commands.\n",
181 |     "\n",
182 |     "When you've had enough fun playing in `pyspark` for a while, end the session with `Ctrl-D` and exit to leave the `ssh` session."
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "Run the `Zepellin` [notebook](https://zeppelin.incubator.apache.org)\n",
190 |     "----\n",
191 |     "\n",
192 |     "Create an SSH tunnel to port 8890\n",
193 |     "\n",
194 |     "```\n",
195 |     "ssh -i xxx.pem -L 8192:ec2-xx-xx-xx.compute-1.amazonaws.com:8192 hadoop@ec2-xx-xx-xx-xx.compute-1.amazonaws.com -N -v\n",
196 |     "```\n",
197 |     "\n",
198 |     "Fill in the `xxx` with the locatin of your PEM file, and the appropriate IP address."
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "Connect to `Zeppelin` notebook\n",
206 |     "----\n",
207 |     "\n",
208 |     "Open a browser to http://localhost:8890/ - if it worked you should see this\n",
209 |     "\n",
210 |     "![Zeppelin screenshot](http://cloudacademy.com/blog/wp-content/uploads/2016/01/Zeppelin-Notebook-1.png)"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "Create notebook and run Spark within it\n",
218 |     "----\n",
219 |     "\n",
220 |     "The default cell uses `scala`. For `pyspark` just start a cell with `%pyspark`."
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "Terminate the cluster\n",
228 |     "----\n",
229 |     "\n",
230 |     "When you are done, remember to terminate the cluster!\n",
231 |     "\n",
232 |     "```\n",
233 |     "aws emr terminate-clusters --cluster-id j-XXXXXXXXXXXXXXX\n",
234 |     "```\n",
235 |     "\n",
236 |     "and confirm that it is terminating\n",
237 |     "\n",
238 |     "```\n",
239 |     "aws emr describe-cluster --cluster-id j-XXXXXXXXXXXXXXX | grep \\\"State\\\"\n",
240 |     "```\n",
241 |     "\n",
242 |     "You should see\n",
243 |     "\n",
244 |     "```\n",
245 |     "                    \"State\": \"TERMINATING\"\n",
246 |     "                    \"State\": \"TERMINATING\"\n",
247 |     "            \"State\": \"TERMINATING\"\n",
248 |     "```\n",
249 |     "\n",
250 |     "If you are paranoid, log into the [AWS Management Console ](https://aws.amazon.com/console/) and click on `Services | EMR` and check the status of your cluster."
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": null,
256 |    "metadata": {
257 |     "collapsed": true
258 |    },
259 |    "outputs": [],
260 |    "source": []
261 |   }
262 |  ],
263 |  "metadata": {
264 |   "kernelspec": {
265 |    "display_name": "Python 3",
266 |    "language": "python",
267 |    "name": "python3"
268 |   },
269 |   "language_info": {
270 |    "codemirror_mode": {
271 |     "name": "ipython",
272 |     "version": 3
273 |    },
274 |    "file_extension": ".py",
275 |    "mimetype": "text/x-python",
276 |    "name": "python",
277 |    "nbconvert_exporter": "python",
278 |    "pygments_lexer": "ipython3",
279 |    "version": "3.5.1"
280 |   }
281 |  },
282 |  "nbformat": 4,
283 |  "nbformat_minor": 0
284 | }
285 | 


--------------------------------------------------------------------------------
/lectures/Customizing_Jupyter.ipynb:
--------------------------------------------------------------------------------
1 | ../misc/Customizing_Jupyter.ipynb


--------------------------------------------------------------------------------
/lectures/Extra_Packages.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "Extra Packages\n",
 8 |     "====\n",
 9 |     "\n",
10 |     "These are packages used in the notebooks that are not part of the standard Anaconda distribution. If you get a package not found error, execute the appropriate installation code cell and restart the kernel in the notebook with the error."
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "markdown",
15 |    "metadata": {
16 |     "collapsed": true
17 |    },
18 |    "source": [
19 |     "```bash\n",
20 |     "\n",
21 |     "! conda update conda\n",
22 |     "! conda update -y matplotlib pandas scikit-learn seaborn\n",
23 |     "\n",
24 |     "! pip instll version_information\n",
25 |     "! pip install rpy2\n",
26 |     "! pip install ggplot\n",
27 |     "! pip install qgrid\n",
28 |     "! pip install ipython-sql\n",
29 |     "! pip install pandasql\n",
30 |     "! pip install lshash\n",
31 |     "! pip install hat-trie\n",
32 |     "! pip install hyperloglog\n",
33 |     "! pip install git+https://github.com/jaybaird/python-bloomfilter.git\n",
34 |     "```"
35 |    ]
36 |   },
37 |   {
38 |    "cell_type": "code",
39 |    "execution_count": null,
40 |    "metadata": {
41 |     "collapsed": true
42 |    },
43 |    "outputs": [],
44 |    "source": []
45 |   }
46 |  ],
47 |  "metadata": {
48 |   "kernelspec": {
49 |    "display_name": "Python 3",
50 |    "language": "python",
51 |    "name": "python3"
52 |   },
53 |   "language_info": {
54 |    "codemirror_mode": {
55 |     "name": "ipython",
56 |     "version": 3
57 |    },
58 |    "file_extension": ".py",
59 |    "mimetype": "text/x-python",
60 |    "name": "python",
61 |    "nbconvert_exporter": "python",
62 |    "pygments_lexer": "ipython3",
63 |    "version": "3.5.1"
64 |   }
65 |  },
66 |  "nbformat": 4,
67 |  "nbformat_minor": 0
68 | }
69 | 


--------------------------------------------------------------------------------
/lectures/HtWt.csv:
--------------------------------------------------------------------------------
 1 | male,height,weight
 2 | 0,63.2,168.7
 3 | 0,68.7,169.8
 4 | 0,64.8,176.6
 5 | 0,67.9,246.8
 6 | 1,68.9,151.6
 7 | 1,67.8,158.0
 8 | 1,68.2,168.6
 9 | 0,64.8,137.2
10 | 1,64.3,177.0
11 | 0,64.7,128.0
12 | 1,66.9,168.4
13 | 1,66.9,136.2
14 | 1,67.1,160.3
15 | 1,70.2,233.9
16 | 1,67.4,171.7
17 | 1,71.1,185.5
18 | 0,63.4,177.6
19 | 1,66.9,132.9
20 | 0,71.0,140.1
21 | 1,70.4,151.9
22 | 0,59.5,147.2
23 | 1,70.4,159.0
24 | 0,61.5,113.0
25 | 1,74.5,194.5
26 | 0,65.3,145.1
27 | 1,68.8,196.5
28 | 0,67.2,148.9
29 | 1,68.7,132.9
30 | 0,60.0,168.4
31 | 0,62.5,146.2
32 | 1,72.0,236.4
33 | 1,67.9,140.0
34 | 1,65.1,156.2
35 | 1,63.5,178.7
36 | 1,68.2,147.5
37 | 0,64.6,97.7
38 | 1,68.1,189.6
39 | 0,66.2,221.9
40 | 0,62.8,168.1
41 | 0,65.3,143.1
42 | 0,65.8,217.7
43 | 0,68.7,133.2
44 | 0,63.8,96.5
45 | 1,70.6,270.6
46 | 0,61.5,137.2
47 | 0,61.9,124.2
48 | 0,65.1,128.3
49 | 1,68.7,203.6
50 | 0,57.6,132.4
51 | 1,66.3,189.4
52 | 1,69.0,174.0
53 | 0,63.4,163.3
54 | 1,69.5,183.5
55 | 1,67.8,193.8
56 | 0,61.6,119.7
57 | 1,71.2,157.4
58 | 1,67.4,146.1
59 | 0,66.1,128.3
60 | 1,70.7,179.1
61 | 0,67.0,140.0
62 | 1,66.8,202.2
63 | 1,69.9,169.4
64 | 0,57.7,122.8
65 | 0,62.5,248.5
66 | 1,66.6,154.4
67 | 0,60.6,140.2
68 | 1,70.4,141.6
69 | 0,66.4,144.4
70 | 0,62.3,116.2
71 | 1,73.3,175.0
72 | 


--------------------------------------------------------------------------------
/lectures/Lagrange_multiplier.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/Lagrange_multiplier.png


--------------------------------------------------------------------------------
/lectures/Local_Installation.ipynb:
--------------------------------------------------------------------------------
1 | ../misc/Local_Installation.ipynb


--------------------------------------------------------------------------------
/lectures/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help
 23 | help:
 24 | 	@echo "Please use \`make <target>' where <target> is one of"
 25 | 	@echo "  html       to make standalone HTML files"
 26 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 27 | 	@echo "  singlehtml to make a single large HTML file"
 28 | 	@echo "  pickle     to make pickle files"
 29 | 	@echo "  json       to make JSON files"
 30 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 31 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 32 | 	@echo "  applehelp  to make an Apple Help Book"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
 49 | 
 50 | .PHONY: clean
 51 | clean:
 52 | 	rm -rf $(BUILDDIR)/*
 53 | 
 54 | .PHONY: html
 55 | html:
 56 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 57 | 	@echo
 58 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 59 | 
 60 | .PHONY: dirhtml
 61 | dirhtml:
 62 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 63 | 	@echo
 64 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 65 | 
 66 | .PHONY: singlehtml
 67 | singlehtml:
 68 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 69 | 	@echo
 70 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 71 | 
 72 | .PHONY: pickle
 73 | pickle:
 74 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 75 | 	@echo
 76 | 	@echo "Build finished; now you can process the pickle files."
 77 | 
 78 | .PHONY: json
 79 | json:
 80 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 81 | 	@echo
 82 | 	@echo "Build finished; now you can process the JSON files."
 83 | 
 84 | .PHONY: htmlhelp
 85 | htmlhelp:
 86 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 87 | 	@echo
 88 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 89 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 90 | 
 91 | .PHONY: qthelp
 92 | qthelp:
 93 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 94 | 	@echo
 95 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 96 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 97 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/ComputationalStatisticsinPython.qhcp"
 98 | 	@echo "To view the help file:"
 99 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/ComputationalStatisticsinPython.qhc"
100 | 
101 | .PHONY: applehelp
102 | applehelp:
103 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
104 | 	@echo
105 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
106 | 	@echo "N.B. You won't be able to view it unless you put it in" \
107 | 	      "~/Library/Documentation/Help or install it in your application" \
108 | 	      "bundle."
109 | 
110 | .PHONY: devhelp
111 | devhelp:
112 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
113 | 	@echo
114 | 	@echo "Build finished."
115 | 	@echo "To view the help file:"
116 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/ComputationalStatisticsinPython"
117 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/ComputationalStatisticsinPython"
118 | 	@echo "# devhelp"
119 | 
120 | .PHONY: epub
121 | epub:
122 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
123 | 	@echo
124 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
125 | 
126 | .PHONY: latex
127 | latex:
128 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
129 | 	@echo
130 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
131 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
132 | 	      "(use \`make latexpdf' here to do that automatically)."
133 | 
134 | .PHONY: latexpdf
135 | latexpdf:
136 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137 | 	@echo "Running LaTeX files through pdflatex..."
138 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
139 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
140 | 
141 | .PHONY: latexpdfja
142 | latexpdfja:
143 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
144 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
145 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
146 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
147 | 
148 | .PHONY: text
149 | text:
150 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
151 | 	@echo
152 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
153 | 
154 | .PHONY: man
155 | man:
156 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
157 | 	@echo
158 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
159 | 
160 | .PHONY: texinfo
161 | texinfo:
162 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
163 | 	@echo
164 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
165 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
166 | 	      "(use \`make info' here to do that automatically)."
167 | 
168 | .PHONY: info
169 | info:
170 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
171 | 	@echo "Running Texinfo files through makeinfo..."
172 | 	make -C $(BUILDDIR)/texinfo info
173 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
174 | 
175 | .PHONY: gettext
176 | gettext:
177 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
178 | 	@echo
179 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
180 | 
181 | .PHONY: changes
182 | changes:
183 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
184 | 	@echo
185 | 	@echo "The overview file is in $(BUILDDIR)/changes."
186 | 
187 | .PHONY: linkcheck
188 | linkcheck:
189 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
190 | 	@echo
191 | 	@echo "Link check complete; look for any errors in the above output " \
192 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
193 | 
194 | .PHONY: doctest
195 | doctest:
196 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
197 | 	@echo "Testing of doctests in the sources finished, look at the " \
198 | 	      "results in $(BUILDDIR)/doctest/output.txt."
199 | 
200 | .PHONY: coverage
201 | coverage:
202 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
203 | 	@echo "Testing of coverage in the sources finished, look at the " \
204 | 	      "results in $(BUILDDIR)/coverage/python.txt."
205 | 
206 | .PHONY: xml
207 | xml:
208 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
209 | 	@echo
210 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
211 | 
212 | .PHONY: pseudoxml
213 | pseudoxml:
214 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
215 | 	@echo
216 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
217 | 


--------------------------------------------------------------------------------
/lectures/Spark03.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "Spark Libraries\n",
 8 |     "====\n",
 9 |     "\n",
10 |     "A tour of the Spark SQL, Streaming, GraphX libraries and MLLib libraries."
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": 1,
16 |    "metadata": {
17 |     "collapsed": true
18 |    },
19 |    "outputs": [],
20 |    "source": [
21 |     "from pyspark import SparkContext\n",
22 |     "sc = SparkContext('local[*]')"
23 |    ]
24 |   },
25 |   {
26 |    "cell_type": "code",
27 |    "execution_count": null,
28 |    "metadata": {
29 |     "collapsed": true
30 |    },
31 |    "outputs": [],
32 |    "source": []
33 |   }
34 |  ],
35 |  "metadata": {
36 |   "kernelspec": {
37 |    "display_name": "Python 3",
38 |    "language": "python",
39 |    "name": "python3"
40 |   },
41 |   "language_info": {
42 |    "codemirror_mode": {
43 |     "name": "ipython",
44 |     "version": 3
45 |    },
46 |    "file_extension": ".py",
47 |    "mimetype": "text/x-python",
48 |    "name": "python",
49 |    "nbconvert_exporter": "python",
50 |    "pygments_lexer": "ipython3",
51 |    "version": "3.5.1"
52 |   }
53 |  },
54 |  "nbformat": 4,
55 |  "nbformat_minor": 0
56 | }
57 | 


--------------------------------------------------------------------------------
/lectures/Spark04.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "Spark on a Cluster\n",
 8 |     "====\n",
 9 |     "\n",
10 |     "How to set up and run Spark on an AWS EC2 cluster."
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": null,
16 |    "metadata": {
17 |     "collapsed": true
18 |    },
19 |    "outputs": [],
20 |    "source": []
21 |   }
22 |  ],
23 |  "metadata": {
24 |   "kernelspec": {
25 |    "display_name": "Python 3",
26 |    "language": "python",
27 |    "name": "python3"
28 |   },
29 |   "language_info": {
30 |    "codemirror_mode": {
31 |     "name": "ipython",
32 |     "version": 3
33 |    },
34 |    "file_extension": ".py",
35 |    "mimetype": "text/x-python",
36 |    "name": "python",
37 |    "nbconvert_exporter": "python",
38 |    "pygments_lexer": "ipython3",
39 |    "version": "3.5.1"
40 |   }
41 |  },
42 |  "nbformat": 4,
43 |  "nbformat_minor": 0
44 | }
45 | 


--------------------------------------------------------------------------------
/lectures/Template01.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Jupyter notebook\n",
  8 |     "----\n",
  9 |     "```\n",
 10 |     "Quick tour\n",
 11 |     "Keyboard shortcuts\n",
 12 |     "Markdown cells\n",
 13 |     "Code cells\n",
 14 |     "Switching kernels\n",
 15 |     "```"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {
 22 |     "collapsed": true
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "\n"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "Unix shell\n",
 37 |     "-----\n",
 38 |     "```\n",
 39 |     "Getting information: man, info, Google\n",
 40 |     "File and directory naviagtion - pwd, ls, cd, mkdir, rm, cp, mv\n",
 41 |     "Pipes and I/O: |, >, <\n",
 42 |     "Finding stuff: find, grep, locate\n",
 43 |     "```"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {
 50 |     "collapsed": true
 51 |    },
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "\n"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "Interactive Python\n",
 65 |     "----\n",
 66 |     "```\n",
 67 |     "help, ?, ??\n",
 68 |     "magic functions\n",
 69 |     "Calling R from Jupyter\n",
 70 |     "Simple calculations\n",
 71 |     "Everything is an object\n",
 72 |     "```"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {
 79 |     "collapsed": true
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "\n"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "Types and Collections\n",
 94 |     "----\n",
 95 |     "```\n",
 96 |     "bool, int, float, complex\n",
 97 |     "string\n",
 98 |     "None\n",
 99 |     "tuple and named_tuple, tuple unpacking\n",
100 |     "list \n",
101 |     "set\n",
102 |     "dictionarie, ordered_dictionarie, defaultdict\n",
103 |     "numpy.array\n",
104 |     "```"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {
111 |     "collapsed": true
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "\n",
116 |     "\n",
117 |     "\n"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "Operators\n",
125 |     "----\n",
126 |     "```\n",
127 |     "+, -, *, /, //, **, %\n",
128 |     "==, != , <, >=\n",
129 |     "and, or, not, ~, |, &\n",
130 |     "<<, >>\n",
131 |     "+=, *= etc\n",
132 |     "in\n",
133 |     "Operator overloading - list, set, dict operators\n",
134 |     "```"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {
141 |     "collapsed": true
142 |    },
143 |    "outputs": [],
144 |    "source": [
145 |     "\n",
146 |     "\n",
147 |     "\n",
148 |     "\n"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "Names, assignment and identity\n",
156 |     "----\n",
157 |     "```\n",
158 |     "=, ==, is, id\n",
159 |     "```"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {
166 |     "collapsed": true
167 |    },
168 |    "outputs": [],
169 |    "source": [
170 |     "\n",
171 |     "\n",
172 |     "\n"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {},
178 |    "source": [
179 |     "Exercise: Word counter\n",
180 |     "----\n",
181 |     "\n",
182 |     "Coutn the number of times each word occurs in the poem Jabberwocky. Ignore case and remove all puncutation."
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {
189 |     "collapsed": true
190 |    },
191 |    "outputs": [],
192 |    "source": [
193 |     "jabberwocky = '''\n",
194 |     "’Twas brillig, and the slithy toves\n",
195 |     "      Did gyre and gimble in the wabe:\n",
196 |     "All mimsy were the borogoves,\n",
197 |     "      And the mome raths outgrabe.\n",
198 |     "\n",
199 |     "“Beware the Jabberwock, my son!\n",
200 |     "      The jaws that bite, the claws that catch!\n",
201 |     "Beware the Jubjub bird, and shun\n",
202 |     "      The frumious Bandersnatch!”\n",
203 |     "\n",
204 |     "He took his vorpal sword in hand;\n",
205 |     "      Long time the manxome foe he sought—\n",
206 |     "So rested he by the Tumtum tree\n",
207 |     "      And stood awhile in thought.\n",
208 |     "\n",
209 |     "And, as in uffish thought he stood,\n",
210 |     "      The Jabberwock, with eyes of flame,\n",
211 |     "Came whiffling through the tulgey wood,\n",
212 |     "      And burbled as it came!\n",
213 |     "\n",
214 |     "One, two! One, two! And through and through\n",
215 |     "      The vorpal blade went snicker-snack!\n",
216 |     "He left it dead, and with its head\n",
217 |     "      He went galumphing back.\n",
218 |     "\n",
219 |     "“And hast thou slain the Jabberwock?\n",
220 |     "      Come to my arms, my beamish boy!\n",
221 |     "O frabjous day! Callooh! Callay!”\n",
222 |     "      He chortled in his joy.\n",
223 |     "\n",
224 |     "’Twas brillig, and the slithy toves\n",
225 |     "      Did gyre and gimble in the wabe:\n",
226 |     "All mimsy were the borogoves,\n",
227 |     "      And the mome raths outgrabe.\n",
228 |     "'''"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {
235 |     "collapsed": true
236 |    },
237 |    "outputs": [],
238 |    "source": [
239 |     "\n",
240 |     "\n",
241 |     "\n",
242 |     "\n"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "markdown",
247 |    "metadata": {},
248 |    "source": [
249 |     "Control Flow\n",
250 |     "----\n",
251 |     "```\n",
252 |     "if, elif, else\n",
253 |     "for\n",
254 |     "while\n",
255 |     "continue\n",
256 |     "break\n",
257 |     "ternary operator\n",
258 |     "```"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": null,
264 |    "metadata": {
265 |     "collapsed": true
266 |    },
267 |    "outputs": [],
268 |    "source": [
269 |     "\n",
270 |     "\n",
271 |     "\n",
272 |     "\n"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "markdown",
277 |    "metadata": {},
278 |    "source": [
279 |     "Exercise: FizzBuzz\n",
280 |     "----\n",
281 |     "```\n",
282 |     "The \"Fizz-Buzz test\" is an interview question designed to help filter out the 99.5% of programming job candidates who can't seem to program their way out of a wet paper bag. The text of the programming assignment is as follows:\n",
283 |     "\"Write a program that prints the numbers from 1 to 100. But for multiples of three print “Fizz” instead of the number and for the multiples of five print “Buzz”. For numbers which are multiples of both three and five print “FizzBuzz”.\"\n",
284 |     "```\n",
285 |     "Source: [Using FizzBuzz to Find Developers who Grok Coding](http://tickletux.wordpress.com/2007/01/24/using-fizzbuzz-to-find-developers-who-grok-coding/)"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {
292 |     "collapsed": true
293 |    },
294 |    "outputs": [],
295 |    "source": [
296 |     "\n",
297 |     "\n",
298 |     "\n",
299 |     "\n"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "markdown",
304 |    "metadata": {},
305 |    "source": [
306 |     "Built-in functions\n",
307 |     "----"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": 3,
313 |    "metadata": {
314 |     "collapsed": false
315 |    },
316 |    "outputs": [
317 |     {
318 |      "data": {
319 |       "text/plain": [
320 |        "['abs',\n",
321 |        " 'all',\n",
322 |        " 'any',\n",
323 |        " 'ascii',\n",
324 |        " 'bin',\n",
325 |        " 'bool',\n",
326 |        " 'bytearray',\n",
327 |        " 'bytes',\n",
328 |        " 'callable',\n",
329 |        " 'chr',\n",
330 |        " 'classmethod',\n",
331 |        " 'compile',\n",
332 |        " 'complex',\n",
333 |        " 'copyright',\n",
334 |        " 'credits',\n",
335 |        " 'delattr',\n",
336 |        " 'dict',\n",
337 |        " 'dir',\n",
338 |        " 'divmod',\n",
339 |        " 'dreload',\n",
340 |        " 'enumerate',\n",
341 |        " 'eval',\n",
342 |        " 'exec',\n",
343 |        " 'filter',\n",
344 |        " 'float',\n",
345 |        " 'format',\n",
346 |        " 'frozenset',\n",
347 |        " 'get_ipython',\n",
348 |        " 'getattr',\n",
349 |        " 'globals',\n",
350 |        " 'hasattr',\n",
351 |        " 'hash',\n",
352 |        " 'help',\n",
353 |        " 'hex',\n",
354 |        " 'id',\n",
355 |        " 'input',\n",
356 |        " 'int',\n",
357 |        " 'isinstance',\n",
358 |        " 'issubclass',\n",
359 |        " 'iter',\n",
360 |        " 'len',\n",
361 |        " 'license',\n",
362 |        " 'list',\n",
363 |        " 'locals',\n",
364 |        " 'map',\n",
365 |        " 'max',\n",
366 |        " 'memoryview',\n",
367 |        " 'min',\n",
368 |        " 'next',\n",
369 |        " 'object',\n",
370 |        " 'oct',\n",
371 |        " 'open',\n",
372 |        " 'ord',\n",
373 |        " 'pow',\n",
374 |        " 'print',\n",
375 |        " 'property',\n",
376 |        " 'range',\n",
377 |        " 'repr',\n",
378 |        " 'reversed',\n",
379 |        " 'round',\n",
380 |        " 'set',\n",
381 |        " 'setattr',\n",
382 |        " 'slice',\n",
383 |        " 'sorted',\n",
384 |        " 'staticmethod',\n",
385 |        " 'str',\n",
386 |        " 'sum',\n",
387 |        " 'super',\n",
388 |        " 'tuple',\n",
389 |        " 'type',\n",
390 |        " 'vars',\n",
391 |        " 'zip']"
392 |       ]
393 |      },
394 |      "execution_count": 3,
395 |      "metadata": {},
396 |      "output_type": "execute_result"
397 |     }
398 |    ],
399 |    "source": [
400 |     "([x for x in dir(__builtin__)  if x.islower() and not x.startswith('__')])"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": null,
406 |    "metadata": {
407 |     "collapsed": true
408 |    },
409 |    "outputs": [],
410 |    "source": [
411 |     "\n",
412 |     "\n",
413 |     "\n",
414 |     "\n"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "markdown",
419 |    "metadata": {},
420 |    "source": [
421 |     "User-defined functions\n",
422 |     "----\n",
423 |     "```\n",
424 |     "def\n",
425 |     "lambda\n",
426 |     "higher order functions\n",
427 |     "```"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": null,
433 |    "metadata": {
434 |     "collapsed": true
435 |    },
436 |    "outputs": [],
437 |    "source": [
438 |     "\n",
439 |     "\n",
440 |     "\n",
441 |     "\n"
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "markdown",
446 |    "metadata": {},
447 |    "source": [
448 |     "Functional style\n",
449 |     "----\n",
450 |     "```\n",
451 |     "map, reduce, filter\n",
452 |     "comprehensions - list, set, dictionary\n",
453 |     "generator expressions\n",
454 |     "```"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": null,
460 |    "metadata": {
461 |     "collapsed": true
462 |    },
463 |    "outputs": [],
464 |    "source": [
465 |     "\n",
466 |     "\n",
467 |     "\n",
468 |     "\n"
469 |    ]
470 |   },
471 |   {
472 |    "cell_type": "markdown",
473 |    "metadata": {},
474 |    "source": [
475 |     "Exericse\n",
476 |     "----\n",
477 |     "\n",
478 |     "Write a program to flatten a list of lists into a flat list. For example,\n",
479 |     "```\n",
480 |     "flatten([[1,2,3],[4,5],[6,7,8]]) should return [1,2,3,4,5,6,7,8].\n",
481 |     "```\n",
482 |     "\n",
483 |     "Do this using\n",
484 |     "- a for loop \n",
485 |     "- a list comprehension"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": null,
491 |    "metadata": {
492 |     "collapsed": true
493 |    },
494 |    "outputs": [],
495 |    "source": [
496 |     "\n",
497 |     "\n",
498 |     "\n",
499 |     "\n"
500 |    ]
501 |   },
502 |   {
503 |    "cell_type": "markdown",
504 |    "metadata": {},
505 |    "source": [
506 |     "Modules\n",
507 |     "----\n",
508 |     "```\n",
509 |     "Installing new modules\n",
510 |     "Writing your own module\n",
511 |     "Importing from a modue\n",
512 |     "```"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": null,
518 |    "metadata": {
519 |     "collapsed": true
520 |    },
521 |    "outputs": [],
522 |    "source": [
523 |     "\n",
524 |     "\n",
525 |     "\n"
526 |    ]
527 |   }
528 |  ],
529 |  "metadata": {
530 |   "kernelspec": {
531 |    "display_name": "Python 3",
532 |    "language": "python",
533 |    "name": "python3"
534 |   },
535 |   "language_info": {
536 |    "codemirror_mode": {
537 |     "name": "ipython",
538 |     "version": 3
539 |    },
540 |    "file_extension": ".py",
541 |    "mimetype": "text/x-python",
542 |    "name": "python",
543 |    "nbconvert_exporter": "python",
544 |    "pygments_lexer": "ipython3",
545 |    "version": "3.5.1"
546 |   }
547 |  },
548 |  "nbformat": 4,
549 |  "nbformat_minor": 0
550 | }
551 | 


--------------------------------------------------------------------------------
/lectures/commutative.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/commutative.png


--------------------------------------------------------------------------------
/lectures/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Computational Statistics in Python documentation build configuration file, created by
  5 | # sphinx-quickstart on Thu Jan 14 10:45:35 2016.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | import sys
 17 | import os
 18 | import shlex
 19 | import cloud_sptheme as csp
 20 | 
 21 | # If extensions (or modules to document with autodoc) are in another directory,
 22 | # add these directories to sys.path here. If the directory is relative to the
 23 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 24 | #sys.path.insert(0, os.path.abspath('.'))
 25 | 
 26 | # -- General configuration ------------------------------------------------
 27 | 
 28 | # If your documentation needs a minimal Sphinx version, state it here.
 29 | #needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 33 | # ones.
 34 | extensions = [
 35 |     'nbsphinx',
 36 |     'sphinx.ext.mathjax',
 37 |     'IPython.sphinxext.ipython_console_highlighting',
 38 | ]
 39 | 
 40 | # Add any paths that contain templates here, relative to this directory.
 41 | templates_path = ['_templates']
 42 | 
 43 | # The suffix(es) of source filenames.
 44 | # You can specify multiple suffix as a list of string:
 45 | # source_suffix = ['.rst', '.md']
 46 | source_suffix = '.rst'
 47 | 
 48 | # The encoding of source files.
 49 | #source_encoding = 'utf-8-sig'
 50 | 
 51 | # The master toctree document.
 52 | master_doc = 'index'
 53 | 
 54 | # General information about the project.
 55 | project = 'Computational Statistics in Python'
 56 | copyright = '2016, Cliburn Chan, Janice McCarthy'
 57 | author = 'Cliburn Chan, Janice McCarthy'
 58 | 
 59 | # The version info for the project you're documenting, acts as replacement for
 60 | # |version| and |release|, also used in various other places throughout the
 61 | # built documents.
 62 | #
 63 | # The short X.Y version.
 64 | version = '0.1'
 65 | # The full version, including alpha/beta/rc tags.
 66 | release = '0.1'
 67 | 
 68 | # The language for content autogenerated by Sphinx. Refer to documentation
 69 | # for a list of supported languages.
 70 | #
 71 | # This is also used if you do content translation via gettext catalogs.
 72 | # Usually you set "language" from the command line for these cases.
 73 | language = None
 74 | 
 75 | # There are two options for replacing |today|: either, you set today to some
 76 | # non-false value, then it is used:
 77 | #today = ''
 78 | # Else, today_fmt is used as the format for a strftime call.
 79 | #today_fmt = '%B %d, %Y'
 80 | 
 81 | # List of patterns, relative to source directory, that match files and
 82 | # directories to ignore when looking for source files.
 83 | exclude_patterns = ['_build',  '**.ipynb_checkpoints']
 84 | 
 85 | # The reST default role (used for this markup: `text`) to use for all
 86 | # documents.
 87 | #default_role = None
 88 | 
 89 | # If true, '()' will be appended to :func: etc. cross-reference text.
 90 | #add_function_parentheses = True
 91 | 
 92 | # If true, the current module name will be prepended to all description
 93 | # unit titles (such as .. function::).
 94 | #add_module_names = True
 95 | 
 96 | # If true, sectionauthor and moduleauthor directives will be shown in the
 97 | # output. They are ignored by default.
 98 | #show_authors = False
 99 | 
100 | # The name of the Pygments (syntax highlighting) style to use.
101 | pygments_style = 'sphinx'
102 | 
103 | # A list of ignored prefixes for module index sorting.
104 | #modindex_common_prefix = []
105 | 
106 | # If true, keep warnings as "system message" paragraphs in the built documents.
107 | #keep_warnings = False
108 | 
109 | # If true, `todo` and `todoList` produce output, else they produce nothing.
110 | todo_include_todos = False
111 | 
112 | 
113 | # -- Options for HTML output ----------------------------------------------
114 | 
115 | # The theme to use for HTML and HTML Help pages.  See the documentation for
116 | # a list of builtin themes.
117 | html_theme = 'cloud'
118 | 
119 | # Theme options are theme-specific and customize the look and feel of a theme
120 | # further.  For a list of options available for each theme, see the
121 | # documentation.
122 | #html_theme_options = {}
123 | 
124 | # Add any paths that contain custom themes here, relative to this directory.
125 | #html_theme_path = []
126 | 
127 | # The name for this set of Sphinx documents.  If None, it defaults to
128 | # "<project> v<release> documentation".
129 | html_title = "Computational Statistics in Python"
130 | 
131 | # A shorter title for the navigation bar.  Default is the same as html_title.
132 | #html_short_title = None
133 | 
134 | # The name of an image file (relative to this directory) to place at the top
135 | # of the sidebar.
136 | #html_logo = None
137 | 
138 | # The name of an image file (within the static path) to use as favicon of the
139 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
140 | # pixels large.
141 | #html_favicon = None
142 | 
143 | # Add any paths that contain custom static files (such as style sheets) here,
144 | # relative to this directory. They are copied after the builtin static files,
145 | # so a file named "default.css" will overwrite the builtin "default.css".
146 | html_static_path = ['_static']
147 | 
148 | # Add any extra paths that contain custom files (such as robots.txt or
149 | # .htaccess) here, relative to this directory. These files are copied
150 | # directly to the root of the documentation.
151 | #html_extra_path = []
152 | 
153 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
154 | # using the given strftime format.
155 | #html_last_updated_fmt = '%b %d, %Y'
156 | 
157 | # If true, SmartyPants will be used to convert quotes and dashes to
158 | # typographically correct entities.
159 | #html_use_smartypants = True
160 | 
161 | # Custom sidebar templates, maps document names to template names.
162 | #html_sidebars = {}
163 | 
164 | # Additional templates that should be rendered to pages, maps page names to
165 | # template names.
166 | #html_additional_pages = {}
167 | 
168 | # If false, no module index is generated.
169 | #html_domain_indices = True
170 | 
171 | # If false, no index is generated.
172 | #html_use_index = True
173 | 
174 | # If true, the index is split into individual pages for each letter.
175 | #html_split_index = False
176 | 
177 | # If true, links to the reST sources are added to the pages.
178 | #html_show_sourcelink = True
179 | 
180 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
181 | #html_show_sphinx = True
182 | 
183 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
184 | #html_show_copyright = True
185 | 
186 | # If true, an OpenSearch description file will be output, and all pages will
187 | # contain a <link> tag referring to it.  The value of this option must be the
188 | # base URL from which the finished HTML is served.
189 | #html_use_opensearch = ''
190 | 
191 | # This is the file name suffix for HTML files (e.g. ".xhtml").
192 | #html_file_suffix = None
193 | 
194 | # Language to be used for generating the HTML full-text search index.
195 | # Sphinx supports the following languages:
196 | #   'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
197 | #   'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr'
198 | #html_search_language = 'en'
199 | 
200 | # A dictionary with options for the search language support, empty by default.
201 | # Now only 'ja' uses this config value
202 | #html_search_options = {'type': 'default'}
203 | 
204 | # The name of a javascript file (relative to the configuration directory) that
205 | # implements a search results scorer. If empty, the default will be used.
206 | #html_search_scorer = 'scorer.js'
207 | 
208 | # Output file base name for HTML help builder.
209 | htmlhelp_basename = 'ComputationalStatisticsinPythondoc'
210 | 
211 | # -- Options for LaTeX output ---------------------------------------------
212 | 
213 | latex_elements = {
214 | # The paper size ('letterpaper' or 'a4paper').
215 | #'papersize': 'letterpaper',
216 | 
217 | # The font size ('10pt', '11pt' or '12pt').
218 | #'pointsize': '10pt',
219 | 
220 | # Additional stuff for the LaTeX preamble.
221 | #'preamble': '',
222 | 
223 | # Latex figure (float) alignment
224 | #'figure_align': 'htbp',
225 | }
226 | 
227 | # Grouping the document tree into LaTeX files. List of tuples
228 | # (source start file, target name, title,
229 | #  author, documentclass [howto, manual, or own class]).
230 | latex_documents = [
231 |   (master_doc, 'ComputationalStatisticsinPython.tex', 'Computational Statistics in Python',
232 |    'Cliburn Chan, Janice McCarthy', 'manual'),
233 | ]
234 | 
235 | # The name of an image file (relative to this directory) to place at the top of
236 | # the title page.
237 | #latex_logo = None
238 | 
239 | # For "manual" documents, if this is true, then toplevel headings are parts,
240 | # not chapters.
241 | #latex_use_parts = False
242 | 
243 | # If true, show page references after internal links.
244 | #latex_show_pagerefs = False
245 | 
246 | # If true, show URL addresses after external links.
247 | #latex_show_urls = False
248 | 
249 | # Documents to append as an appendix to all manuals.
250 | #latex_appendices = []
251 | 
252 | # If false, no module index is generated.
253 | #latex_domain_indices = True
254 | 
255 | 
256 | # -- Options for manual page output ---------------------------------------
257 | 
258 | # One entry per manual page. List of tuples
259 | # (source start file, name, description, authors, manual section).
260 | man_pages = [
261 |     (master_doc, 'computationalstatisticsinpython', 'Computational Statistics in Python',
262 |      [author], 1)
263 | ]
264 | 
265 | # If true, show URL addresses after external links.
266 | #man_show_urls = False
267 | 
268 | 
269 | # -- Options for Texinfo output -------------------------------------------
270 | 
271 | # Grouping the document tree into Texinfo files. List of tuples
272 | # (source start file, target name, title, author,
273 | #  dir menu entry, description, category)
274 | texinfo_documents = [
275 |   (master_doc, 'ComputationalStatisticsinPython', 'Computational Statistics in Python',
276 |    author, 'ComputationalStatisticsinPython', 'One line description of project.',
277 |    'Miscellaneous'),
278 | ]
279 | 
280 | # Documents to append as an appendix to all manuals.
281 | #texinfo_appendices = []
282 | 
283 | # If false, no module index is generated.
284 | #texinfo_domain_indices = True
285 | 
286 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
287 | #texinfo_show_urls = 'footnote'
288 | 
289 | # If true, do not generate a @detailmenu in the "Top" node's menu.
290 | #texinfo_no_detailmenu = False
291 | 


--------------------------------------------------------------------------------
/lectures/data/adult.names.txt:
--------------------------------------------------------------------------------
  1 | | This data was extracted from the census bureau database found at
  2 | | http://www.census.gov/ftp/pub/DES/www/welcome.html
  3 | | Donor: Ronny Kohavi and Barry Becker,
  4 | |        Data Mining and Visualization
  5 | |        Silicon Graphics.
  6 | |        e-mail: ronnyk@sgi.com for questions.
  7 | | Split into train-test using MLC++ GenCVFiles (2/3, 1/3 random).
  8 | | 48842 instances, mix of continuous and discrete    (train=32561, test=16281)
  9 | | 45222 if instances with unknown values are removed (train=30162, test=15060)
 10 | | Duplicate or conflicting instances : 6
 11 | | Class probabilities for adult.all file
 12 | | Probability for the label '>50K'  : 23.93% / 24.78% (without unknowns)
 13 | | Probability for the label '<=50K' : 76.07% / 75.22% (without unknowns)
 14 | |
 15 | | Extraction was done by Barry Becker from the 1994 Census database.  A set of
 16 | |   reasonably clean records was extracted using the following conditions:
 17 | |   ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0))
 18 | |
 19 | | Prediction task is to determine whether a person makes over 50K
 20 | | a year.
 21 | |
 22 | | First cited in:
 23 | | @inproceedings{kohavi-nbtree,
 24 | |    author={Ron Kohavi},
 25 | |    title={Scaling Up the Accuracy of Naive-Bayes Classifiers: a
 26 | |           Decision-Tree Hybrid},
 27 | |    booktitle={Proceedings of the Second International Conference on
 28 | |               Knowledge Discovery and Data Mining},
 29 | |    year = 1996,
 30 | |    pages={to appear}}
 31 | |
 32 | | Error Accuracy reported as follows, after removal of unknowns from
 33 | |    train/test sets):
 34 | |    C4.5       : 84.46+-0.30
 35 | |    Naive-Bayes: 83.88+-0.30
 36 | |    NBTree     : 85.90+-0.28
 37 | |
 38 | |
 39 | | Following algorithms were later run with the following error rates,
 40 | |    all after removal of unknowns and using the original train/test split.
 41 | |    All these numbers are straight runs using MLC++ with default values.
 42 | |
 43 | |    Algorithm               Error
 44 | | -- ----------------        -----
 45 | | 1  C4.5                    15.54
 46 | | 2  C4.5-auto               14.46
 47 | | 3  C4.5 rules              14.94
 48 | | 4  Voted ID3 (0.6)         15.64
 49 | | 5  Voted ID3 (0.8)         16.47
 50 | | 6  T2                      16.84
 51 | | 7  1R                      19.54
 52 | | 8  NBTree                  14.10
 53 | | 9  CN2                     16.00
 54 | | 10 HOODG                   14.82
 55 | | 11 FSS Naive Bayes         14.05
 56 | | 12 IDTM (Decision table)   14.46
 57 | | 13 Naive-Bayes             16.12
 58 | | 14 Nearest-neighbor (1)    21.42
 59 | | 15 Nearest-neighbor (3)    20.35
 60 | | 16 OC1                     15.04
 61 | | 17 Pebls                   Crashed.  Unknown why (bounds WERE increased)
 62 | |
 63 | | Conversion of original data as follows:
 64 | | 1. Discretized agrossincome into two ranges with threshold 50,000.
 65 | | 2. Convert U.S. to US to avoid periods.
 66 | | 3. Convert Unknown to "?"
 67 | | 4. Run MLC++ GenCVFiles to generate data,test.
 68 | |
 69 | | Description of fnlwgt (final weight)
 70 | |
 71 | | The weights on the CPS files are controlled to independent estimates of the
 72 | | civilian noninstitutional population of the US.  These are prepared monthly
 73 | | for us by Population Division here at the Census Bureau.  We use 3 sets of
 74 | | controls.
 75 | |  These are:
 76 | |          1.  A single cell estimate of the population 16+ for each state.
 77 | |          2.  Controls for Hispanic Origin by age and sex.
 78 | |          3.  Controls by Race, age and sex.
 79 | |
 80 | | We use all three sets of controls in our weighting program and "rake" through
 81 | | them 6 times so that by the end we come back to all the controls we used.
 82 | |
 83 | | The term estimate refers to population totals derived from CPS by creating
 84 | | "weighted tallies" of any specified socio-economic characteristics of the
 85 | | population.
 86 | |
 87 | | People with similar demographic characteristics should have
 88 | | similar weights.  There is one important caveat to remember
 89 | | about this statement.  That is that since the CPS sample is
 90 | | actually a collection of 51 state samples, each with its own
 91 | | probability of selection, the statement only applies within
 92 | | state.
 93 | 
 94 | 
 95 | >50K, <=50K.
 96 | 
 97 | age: continuous.
 98 | workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
 99 | fnlwgt: continuous.
100 | education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
101 | education-num: continuous.
102 | marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
103 | occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
104 | relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
105 | race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
106 | sex: Female, Male.
107 | capital-gain: continuous.
108 | capital-loss: continuous.
109 | hours-per-week: continuous.
110 | native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
111 | 


--------------------------------------------------------------------------------
/lectures/data/sonar.names.txt:
--------------------------------------------------------------------------------
  1 | NAME: Sonar, Mines vs. Rocks
  2 | 
  3 | SUMMARY: This is the data set used by Gorman and Sejnowski in their study
  4 | of the classification of sonar signals using a neural network [1].  The
  5 | task is to train a network to discriminate between sonar signals bounced
  6 | off a metal cylinder and those bounced off a roughly cylindrical rock.
  7 | 
  8 | SOURCE: The data set was contributed to the benchmark collection by Terry
  9 | Sejnowski, now at the Salk Institute and the University of California at
 10 | San Deigo.  The data set was developed in collaboration with R. Paul
 11 | Gorman of Allied-Signal Aerospace Technology Center.
 12 | 
 13 | MAINTAINER: Scott E. Fahlman
 14 | 
 15 | PROBLEM DESCRIPTION:
 16 | 
 17 | The file "sonar.mines" contains 111 patterns obtained by bouncing sonar
 18 | signals off a metal cylinder at various angles and under various
 19 | conditions.  The file "sonar.rocks" contains 97 patterns obtained from
 20 | rocks under similar conditions.  The transmitted sonar signal is a
 21 | frequency-modulated chirp, rising in frequency.  The data set contains
 22 | signals obtained from a variety of different aspect angles, spanning 90
 23 | degrees for the cylinder and 180 degrees for the rock.
 24 | 
 25 | Each pattern is a set of 60 numbers in the range 0.0 to 1.0.  Each number
 26 | represents the energy within a particular frequency band, integrated over
 27 | a certain period of time.  The integration aperture for higher frequencies
 28 | occur later in time, since these frequencies are transmitted later during
 29 | the chirp.
 30 | 
 31 | The label associated with each record contains the letter "R" if the object
 32 | is a rock and "M" if it is a mine (metal cylinder).  The numbers in the
 33 | labels are in increasing order of aspect angle, but they do not encode the
 34 | angle directly.
 35 | 
 36 | METHODOLOGY: 
 37 | 
 38 | This data set can be used in a number of different ways to test learning
 39 | speed, quality of ultimate learning, ability to generalize, or combinations
 40 | of these factors.
 41 | 
 42 | In [1], Gorman and Sejnowski report two series of experiments: an
 43 | "aspect-angle independent" series, in which the whole data set is used
 44 | without controlling for aspect angle, and an "aspect-angle dependent"
 45 | series in which the training and testing sets were carefully controlled to
 46 | ensure that each set contained cases from each aspect angle in
 47 | appropriate proportions.
 48 | 
 49 | For the aspect-angle independent experiments the combined set of 208 cases
 50 | is divided randomly into 13 disjoint sets with 16 cases in each.  For each
 51 | experiment, 12 of these sets are used as training data, while the 13th is
 52 | reserved for testing.  The experiment is repeated 13 times so that every
 53 | case appears once as part of a test set.  The reported performance is an
 54 | average over the entire set of 13 different test sets, each run 10 times.
 55 | 
 56 | It was observed that this random division of the sample set led to rather
 57 | uneven performance.  A few of the splits gave poor results, presumably
 58 | because the test set contains some samples from aspect angles that are
 59 | under-represented in the corresponding training set.  This motivated Gorman
 60 | and Sejnowski to devise a different set of experiments in which an attempt
 61 | was made to balance the training and test sets so that each would have a
 62 | representative number of samples from all aspect angles.  Since detailed
 63 | aspect angle information was not present in the data base of samples, the
 64 | 208 samples were first divided into clusters, using a 60-dimensional
 65 | Euclidian metric; each of these clusters was then divided between the
 66 | 104-member training set and the 104-member test set.  
 67 | 
 68 | The actual training and testing samples used for the "aspect angle
 69 | dependent" experiments are marked in the data files.  The reported
 70 | performance is an average over 10 runs with this single division of the
 71 | data set.
 72 | 
 73 | A standard back-propagation network was used for all experiments.  The
 74 | network had 60 inputs and 2 output units, one indicating a cylinder and the
 75 | other a rock.  Experiments were run with no hidden units (direct
 76 | connections from each input to each output) and with a single hidden layer
 77 | with 2, 3, 6, 12, or 24 units.  Each network was trained by 300 epochs over
 78 | the entire training set.
 79 | 
 80 | The weight-update formulas used in this study were slightly different from
 81 | the standard form.  A learning rate of 2.0 and momentum of 0.0 was used.
 82 | Errors less than 0.2 were treated as zero.  Initial weights were uniform
 83 | random values in the range -0.3 to +0.3.
 84 | 
 85 | RESULTS: 
 86 | 
 87 | For the angle independent experiments, Gorman and Sejnowski report the
 88 | following results for networks with different numbers of hidden units:
 89 | 
 90 | Hidden	% Right on	Std.	% Right on	Std.
 91 | Units	Training set	Dev.	Test Set	Dev.
 92 | ------	------------	----	----------	----
 93 | 0	89.4		2.1	77.1		8.3
 94 | 2	96.5		0.7	81.9		6.2
 95 | 3	98.8		0.4	82.0		7.3
 96 | 6	99.7		0.2	83.5		5.6
 97 | 12	99.8		0.1	84.7		5.7
 98 | 24	99.8		0.1	84.5		5.7
 99 | 
100 | For the angle-dependent experiments Gorman and Sejnowski report the
101 | following results:
102 | 
103 | Hidden	% Right on	Std.	% Right on	Std.
104 | Units	Training set	Dev.	Test Set	Dev.
105 | ------	------------	----	----------	----
106 | 0	79.3		3.4	73.1		4.8
107 | 2	96.2		2.2	85.7		6.3
108 | 3	98.1		1.5	87.6		3.0
109 | 6	99.4		0.9	89.3		2.4
110 | 12	99.8		0.6	90.4		1.8
111 | 24     100.0		0.0	89.2		1.4
112 | 
113 | Not surprisingly, the network's performance on the test set was somewhat
114 | better when the aspect angles in the training and test sets were balanced.
115 | 
116 | Gorman and Sejnowski further report that a nearest neighbor classifier on
117 | the same data gave an 82.7% probability of correct classification.
118 | 
119 | Three trained human subjects were each tested on 100 signals, chosen at
120 | random from the set of 208 returns used to create this data set.  Their
121 | responses ranged between 88% and 97% correct.  However, they may have been
122 | using information from the raw sonar signal that is not preserved in the
123 | processed data sets presented here.
124 | 
125 | REFERENCES: 
126 | 
127 | 1. Gorman, R. P., and Sejnowski, T. J. (1988).  "Analysis of Hidden Units
128 | in a Layered Network Trained to Classify Sonar Targets" in Neural Networks,
129 | Vol. 1, pp. 75-89.
130 | 


--------------------------------------------------------------------------------
/lectures/em.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/em.png


--------------------------------------------------------------------------------
/lectures/index.rst:
--------------------------------------------------------------------------------
 1 | .. Computational Statistics in Python documentation master file, created by
 2 |    sphinx-quickstart on Thu Jan 14 10:45:35 2016.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Computational Statistics in Python
 7 | ==============================================================
 8 | 
 9 | `Notebooks and problem sets from GitHub repository <https://github.com/cliburn/sta-663-2016.git>`_
10 | 
11 | Topics:
12 | --------
13 | 
14 | .. toctree::
15 |    :maxdepth: 2
16 | 
17 |    01_Introduction_To_Python
18 |    02A_Functions
19 |    02B_Strings
20 |    02C_IO
21 |    02D_Classes
22 |    03A_Numbers
23 |    03B_Graphics
24 |    04A_Data
25 |    04B_SQL
26 |    05_Machine_Learning
27 |    06_LinearAlgebra1
28 |    07_LinearAlgebra2
29 |    08_LinearAlgebraExamples
30 |    09_PCA
31 |    10_SymbolicAlgebra
32 |    11_OptimizationOneDimension
33 |    12_MultivariateOptimizationAlgorithms
34 |    13_Optimization
35 |    14_ExpectationMaximization
36 |    15A_RandomNumbers
37 |    15B_ResamplingAndSimulation
38 |    15C_MonteCarloIntegration
39 |    16A_MCMC
40 |    16B_AuxiliaryVariableMCMC
41 |    16C_PyMC3
42 |    16D_PyStan
43 |    17A_C_Crash_Course
44 |    17B_C_InOneLecture
45 |    17C_C++_Primer_Solutions
46 |    17D_Review_C_C++
47 |    18A_CodeOptimization
48 |    18B_Foreing_Language_Interface
49 |    18C_Numba
50 |    18D_Cython
51 |    18E_Benchmarks
52 |    18F_Optimization_Bakeoff
53 |    19A_Parallel_Programming
54 |    19B_Threads_Processses_Concurrency
55 |    19C_IPyParallel
56 |    20A_Intermediate_Sized_Data
57 |    20B_Big_Data_Structures
58 |    21A_Introduction_To_Spark
59 |    21B_Efficiency_In_Spark
60 |    21C_Spark_SQL
61 |    21D_Spark_MLib
62 | 
63 | Setup
64 | -----------
65 | 
66 | - :doc:`Local_Installation`
67 | - :doc:`Customizing_Jupyter`
68 | 
69 | Homework
70 | ----------
71 | 
72 | - :doc:`homework/Homework01`
73 | - :doc:`homework/Homework01_Solutions`
74 | - :doc:`homework/Homework02`
75 | - :doc:`homework/Homework02_Solutions`
76 | - :doc:`homework/Homework03`
77 | - :doc:`homework/Homework03_Solutions`
78 | - :doc:`homework/Homework04`
79 | - :doc:`homework/Homework04_Solutions`
80 | - :doc:`homework/Homework05`
81 | - :doc:`homework/Homework05_Solutions`
82 | - :doc:`homework/Homework06`
83 | - :doc:`homework/Homework06_Solutions`
84 | - :doc:`homework/Homework07`
85 | - :doc:`homework/Homework07_Solutions`
86 | - :doc:`homework/Homework08`
87 | - :doc:`homework/Homework08_Solutions`
88 | 
89 | 
90 | Indices and tables
91 | ==================
92 | 
93 | * :ref:`genindex`
94 | * :ref:`modindex`
95 | * :ref:`search`
96 | 


--------------------------------------------------------------------------------
/lectures/jensen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/jensen.png


--------------------------------------------------------------------------------
/lectures/julia_benchmarks.pic:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/julia_benchmarks.pic


--------------------------------------------------------------------------------
/lectures/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  xml        to make Docutils-native XML files
 37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
 38 | 	echo.  linkcheck  to check all external links for integrity
 39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 40 | 	echo.  coverage   to run coverage check of the documentation if enabled
 41 | 	goto end
 42 | )
 43 | 
 44 | if "%1" == "clean" (
 45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 46 | 	del /q /s %BUILDDIR%\*
 47 | 	goto end
 48 | )
 49 | 
 50 | 
 51 | REM Check if sphinx-build is available and fallback to Python version if any
 52 | %SPHINXBUILD% 1>NUL 2>NUL
 53 | if errorlevel 9009 goto sphinx_python
 54 | goto sphinx_ok
 55 | 
 56 | :sphinx_python
 57 | 
 58 | set SPHINXBUILD=python -m sphinx.__init__
 59 | %SPHINXBUILD% 2> nul
 60 | if errorlevel 9009 (
 61 | 	echo.
 62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
 64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 65 | 	echo.may add the Sphinx directory to PATH.
 66 | 	echo.
 67 | 	echo.If you don't have Sphinx installed, grab it from
 68 | 	echo.http://sphinx-doc.org/
 69 | 	exit /b 1
 70 | )
 71 | 
 72 | :sphinx_ok
 73 | 
 74 | 
 75 | if "%1" == "html" (
 76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 77 | 	if errorlevel 1 exit /b 1
 78 | 	echo.
 79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 80 | 	goto end
 81 | )
 82 | 
 83 | if "%1" == "dirhtml" (
 84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 85 | 	if errorlevel 1 exit /b 1
 86 | 	echo.
 87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 88 | 	goto end
 89 | )
 90 | 
 91 | if "%1" == "singlehtml" (
 92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 93 | 	if errorlevel 1 exit /b 1
 94 | 	echo.
 95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 96 | 	goto end
 97 | )
 98 | 
 99 | if "%1" == "pickle" (
100 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | 	if errorlevel 1 exit /b 1
102 | 	echo.
103 | 	echo.Build finished; now you can process the pickle files.
104 | 	goto end
105 | )
106 | 
107 | if "%1" == "json" (
108 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | 	if errorlevel 1 exit /b 1
110 | 	echo.
111 | 	echo.Build finished; now you can process the JSON files.
112 | 	goto end
113 | )
114 | 
115 | if "%1" == "htmlhelp" (
116 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | 	if errorlevel 1 exit /b 1
118 | 	echo.
119 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "qthelp" (
125 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
129 | .qhcp project file in %BUILDDIR%/qthelp, like this:
130 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\ComputationalStatisticsinPython.qhcp
131 | 	echo.To view the help file:
132 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\ComputationalStatisticsinPython.ghc
133 | 	goto end
134 | )
135 | 
136 | if "%1" == "devhelp" (
137 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | 	if errorlevel 1 exit /b 1
139 | 	echo.
140 | 	echo.Build finished.
141 | 	goto end
142 | )
143 | 
144 | if "%1" == "epub" (
145 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | 	if errorlevel 1 exit /b 1
147 | 	echo.
148 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | 	goto end
150 | )
151 | 
152 | if "%1" == "latex" (
153 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | 	if errorlevel 1 exit /b 1
155 | 	echo.
156 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | 	goto end
158 | )
159 | 
160 | if "%1" == "latexpdf" (
161 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
162 | 	cd %BUILDDIR%/latex
163 | 	make all-pdf
164 | 	cd %~dp0
165 | 	echo.
166 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
167 | 	goto end
168 | )
169 | 
170 | if "%1" == "latexpdfja" (
171 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | 	cd %BUILDDIR%/latex
173 | 	make all-pdf-ja
174 | 	cd %~dp0
175 | 	echo.
176 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | 	goto end
178 | )
179 | 
180 | if "%1" == "text" (
181 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | 	if errorlevel 1 exit /b 1
183 | 	echo.
184 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
185 | 	goto end
186 | )
187 | 
188 | if "%1" == "man" (
189 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | 	if errorlevel 1 exit /b 1
191 | 	echo.
192 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | 	goto end
194 | )
195 | 
196 | if "%1" == "texinfo" (
197 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | 	if errorlevel 1 exit /b 1
199 | 	echo.
200 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | 	goto end
202 | )
203 | 
204 | if "%1" == "gettext" (
205 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | 	if errorlevel 1 exit /b 1
207 | 	echo.
208 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | 	goto end
210 | )
211 | 
212 | if "%1" == "changes" (
213 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | 	if errorlevel 1 exit /b 1
215 | 	echo.
216 | 	echo.The overview file is in %BUILDDIR%/changes.
217 | 	goto end
218 | )
219 | 
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 | 
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | 	goto end
236 | )
237 | 
238 | if "%1" == "coverage" (
239 | 	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | 	if errorlevel 1 exit /b 1
241 | 	echo.
242 | 	echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | 	goto end
245 | )
246 | 
247 | if "%1" == "xml" (
248 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | 	if errorlevel 1 exit /b 1
250 | 	echo.
251 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | 	goto end
253 | )
254 | 
255 | if "%1" == "pseudoxml" (
256 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | 	if errorlevel 1 exit /b 1
258 | 	echo.
259 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | 	goto end
261 | )
262 | 
263 | :end
264 | 


--------------------------------------------------------------------------------
/lectures/mcmc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/mcmc.png


--------------------------------------------------------------------------------
/lectures/my_module.py:
--------------------------------------------------------------------------------
1 | 
2 | PI = 3.14
3 | 
4 | def my_f(x):
5 |     return PI*x


--------------------------------------------------------------------------------
/lectures/spectral.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/lectures/spectral.png


--------------------------------------------------------------------------------
/lectures/sphinx-readme:
--------------------------------------------------------------------------------
 1 | To build sphinx docs:
 2 | ```
 3 | pip install nbsphinx
 4 | pip install cloud_sptheme
 5 | cd lectures
 6 | make html
 7 | make latexpdf
 8 | ```
 9 | 
10 | For HTML, open _build/html/index.html
11 | For PDF, open _build/latex/ComputationalStatisticsinPython.pdf
12 | 


--------------------------------------------------------------------------------
/misc/Customizing_Jupyter.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Customizing the Jupyter notebook\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "These are strictly optional."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "1. Creating a startup script\n",
 18 |     "----"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "It is convenient to have a bunch of default imports and set up inline plotting automatically. Here's how to do it."
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "### Create an ipython profile"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {
 39 |     "collapsed": true
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "! ipython profile create"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "### Edit the next cell to set your defaults, then execute it."
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "metadata": {
 57 |     "collapsed": true
 58 |    },
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "%%file ~/.ipython/profile_default/startup/start.ipy\n",
 62 |     "\n",
 63 |     "import os\n",
 64 |     "import sys\n",
 65 |     "import glob\n",
 66 |     "import operator as op\n",
 67 |     "import itertools as it\n",
 68 |     "from functools import reduce, partial\n",
 69 |     "import numpy as np\n",
 70 |     "import pandas as pd\n",
 71 |     "from pandas import DataFrame, Series\n",
 72 |     "import matplotlib.pyplot as plt\n",
 73 |     "import seaborn as sns\n",
 74 |     "sns.set_context(\"notebook\", font_scale=1.5)\n",
 75 |     "%matplotlib inline"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "metadata": {},
 81 |    "source": [
 82 |     "### Stop and restart your Jupyter kernel"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "2. Change keybindings to emacs \n",
 90 |     "----"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {
 97 |     "collapsed": true
 98 |    },
 99 |    "outputs": [],
100 |    "source": [
101 |     "! pip install jupyter-emacskeys"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "3. Install slide mode (RISE)\n",
109 |     "----"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {
116 |     "collapsed": true
117 |    },
118 |    "outputs": [],
119 |    "source": [
120 |     "%%bash\n",
121 |     "\n",
122 |     "git clone https://github.com/damianavila/RISE.git\n",
123 |     "cd RISE\n",
124 |     "python setup.py install"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "4. Install Calico extensions (see videos for what they do)\n",
132 |     "----\n",
133 |     "\n",
134 |     "This will not work in the Docker container as you do not have the appropriate permissions. However, you can do this for your local installation if you wish."
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {
141 |     "collapsed": true
142 |    },
143 |    "outputs": [],
144 |    "source": [
145 |     "import IPython"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "### Spell-check"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "metadata": {
159 |     "collapsed": true
160 |    },
161 |    "outputs": [],
162 |    "source": [
163 |     "IPython.display.YouTubeVideo(\"Km3AtRynWFQ\")"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "### Document tools"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "metadata": {
177 |     "collapsed": true
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "IPython.display.YouTubeVideo(\"YbM8rrj-Bms\")"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "### Cell tools"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "metadata": {
195 |     "collapsed": true
196 |    },
197 |    "outputs": [],
198 |    "source": [
199 |     "IPython.display.YouTubeVideo(\"WwoTzvOkEJQ\")"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "### Execute the next 2 cells to install spell-check, document and cell tools"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {
213 |     "collapsed": true
214 |    },
215 |    "outputs": [],
216 |    "source": [
217 |     "%%bash\n",
218 |     "\n",
219 |     "ipython install-nbextension https://bitbucket.org/ipre/calico/downloads/calico-spell-check-1.0.zip\n",
220 |     "ipython install-nbextension https://bitbucket.org/ipre/calico/downloads/calico-document-tools-1.0.zip\n",
221 |     "ipython install-nbextension https://bitbucket.org/ipre/calico/downloads/calico-cell-tools-1.0.zip"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {
228 |     "collapsed": true
229 |    },
230 |    "outputs": [],
231 |    "source": [
232 |     "%%file ~/.jupyter/custom/custom.js\n",
233 |     "\n",
234 |     "require(['base/js/utils'],\n",
235 |     "        function(utils) {\n",
236 |     "            utils.load_extensions('calico-spell-check',\n",
237 |     "                                  'calico-document-tools',\n",
238 |     "                                  'calico-cell-tools');\n",
239 |     "        });"
240 |    ]
241 |   }
242 |  ],
243 |  "metadata": {
244 |   "kernelspec": {
245 |    "display_name": "Python 3",
246 |    "language": "python",
247 |    "name": "python3"
248 |   },
249 |   "language_info": {
250 |    "codemirror_mode": {
251 |     "name": "ipython",
252 |     "version": 3
253 |    },
254 |    "file_extension": ".py",
255 |    "mimetype": "text/x-python",
256 |    "name": "python",
257 |    "nbconvert_exporter": "python",
258 |    "pygments_lexer": "ipython3",
259 |    "version": "3.5.1"
260 |   }
261 |  },
262 |  "nbformat": 4,
263 |  "nbformat_minor": 0
264 | }
265 | 


--------------------------------------------------------------------------------
/misc/Local_Installation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Setting up a local install of Jupyter with multiple kernels (Python 3.5, Python 2.7, R, Juila)\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "The only installation you are recommended to do is to install Anaconda 3.5, so that you have a backup when the OIT version is flaky. The other kernels and the Docker version are **not required** and you should only do so if you are comforatble with command line installs. Even the Anaconda 3.5 installation is optional if the OIT version works well for you.\n",
 11 |     "\n",
 12 |     "Note: I have only done this on OSX 10.11.2 (El Capitan) with XCode and command line compilers installed.\n",
 13 |     "\n",
 14 |     "To install Anaconda for Python 3.5\n",
 15 |     "---- \n",
 16 |     "\n",
 17 |     "Download and install Anaconda Python 3.5 from https://www.continuum.io/downloads\n",
 18 |     "\n",
 19 |     "Open a terminal\n",
 20 |     "```bash\n",
 21 |     "conda update conda\n",
 22 |     "conda update anaconda\n",
 23 |     "```\n",
 24 |     "\n",
 25 |     "(OPTIONAL) To install Python 2.7 as well\n",
 26 |     "----\n",
 27 |     "\n",
 28 |     "Open a terminal\n",
 29 |     "```bash\n",
 30 |     "conda create -n py27 python=2.7 anaconda\n",
 31 |     "source activate py27\n",
 32 |     "ipython kernel install\n",
 33 |     "source deactivate\n",
 34 |     "```\n",
 35 |     "\n",
 36 |     "(OPTIONAL) To install R\n",
 37 |     "----\n",
 38 |     "\n",
 39 |     "- If you want `conda` to manage your R packages\n",
 40 |     "\n",
 41 |     "```bash\n",
 42 |     "conda install -y -c r r-irkernel r-recommended r-essentials\n",
 43 |     "```\n",
 44 |     "\n",
 45 |     "> Note: The bug that required this appears to have been fixed\n",
 46 |     "```\n",
 47 |     "wget https://anaconda.org/r/ncurses/5.9/download/osx-64/ncurses-5.9-1.tar.bz2 \\\n",
 48 |     "        https://anaconda.org/r/nlopt/2.4.2/download/osx-64/nlopt-2.4.2-1.tar.bz2 \\\n",
 49 |     "    && conda install --yes ncurses-5.9-1.tar.bz2 nlopt-2.4.2-1.tar.bz2\n",
 50 |     "```\n",
 51 |     "\n",
 52 |     "- If you have an existing R installation that you want to use\n",
 53 |     "\n",
 54 |     "Start R\n",
 55 |     "```R\n",
 56 |     "install.packages(c('rzmq','repr','IRkernel','IRdisplay'),\n",
 57 |     "                 repos = c('http://irkernel.github.io/', getOption('repos')))\n",
 58 |     "IRkernel::installspec()\n",
 59 |     "```\n",
 60 |     "\n",
 61 |     "(OPTIONAL) To install Julia\n",
 62 |     "----\n",
 63 |     "\n",
 64 |     "Download and install Julia from http://julialang.org/downloads/\n",
 65 |     "\n",
 66 |     "Start Julia\n",
 67 |     "```julia\n",
 68 |     "Pkg.add(\"IJulia\")\n",
 69 |     "Pkg.build(\"IJulia\")\n",
 70 |     "```\n",
 71 |     "\n",
 72 |     "(OPTIONAL) To install `pyspark`\n",
 73 |     "----\n",
 74 |     "\n",
 75 |     "Open a terminal\n",
 76 |     "```bash\n",
 77 |     "conda install -y -c anaconda-cluster spark\n",
 78 |     "```\n",
 79 |     "\n",
 80 |     "Check\n",
 81 |     "----\n",
 82 |     "Open terminal\n",
 83 |     "```\n",
 84 |     "jupyter notebook\n",
 85 |     "```\n",
 86 |     "\n",
 87 |     "See if the installed kernels are found in the drop-down menu."
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "(OPTIONAL) Installing Spark via Docker\n",
 95 |     "----\n",
 96 |     "\n",
 97 |     "- Install Docker (https://docs.docker.com/engine/installation/)\n",
 98 |     "- Launch the Docker Quickstart Terminal\n",
 99 |     "\n",
100 |     "Be patient - this can take a while the first time you do it\n",
101 |     "\n",
102 |     "When done, it shouuld show something like this\n",
103 |     "```\n",
104 |     "                        ##         .\n",
105 |     "                  ## ## ##        ==\n",
106 |     "               ## ## ## ## ##    ===\n",
107 |     "           /\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\\___/ ===\n",
108 |     "      ~~~ {~~ ~~~~ ~~~ ~~~~ ~~~ ~ /  ===- ~~~\n",
109 |     "           \\______ o           __/\n",
110 |     "             \\    \\         __/\n",
111 |     "              \\____\\_______/\n",
112 |     "\n",
113 |     "\n",
114 |     "docker is configured to use the default machine with IP 192.168.99.100\n",
115 |     "For help getting started, check out the docs at https://docs.docker.com\n",
116 |     "```\n",
117 |     "\n",
118 |     "**Note the IP address given - you will need this to access the notebook.**\n",
119 |     "\n",
120 |     "In the Docker terminal\n",
121 |     "```\n",
122 |     "docker run -d -p 8888:8888 jupyter/all-spark-notebook\n",
123 |     "```\n",
124 |     "\n",
125 |     "Check by typing in the Docker terminal\n",
126 |     "```\n",
127 |     "docker ps\n",
128 |     "```\n",
129 |     "\n",
130 |     "Be patient - this can take a while the first time you do it.\n",
131 |     "\n",
132 |     "It shoudl show something like\n",
133 |     "```bash\n",
134 |     "CONTAINER ID        IMAGE                        COMMAND                  CREATED             STATUS              PORTS                    NAMES\n",
135 |     "965a6a80bf44        jupyter/all-spark-notebook   \"tini -- start-notebo\"   4 minutes ago       Up 4 minutes        0.0.0.0:8888->8888/tcp   big_kilby\n",
136 |     "```\n",
137 |     "\n",
138 |     "**Note the machine name (mine is big_kilby, yours will likely be different).**\n",
139 |     "\n",
140 |     "Open your browser at the following URL http://192.168.99.100:8888 (Use the IP given above)\n",
141 |     "\n",
142 |     "This should bring you to a Jupyter notebook. Open a Python3 notebook from the drop dwon menu and test:\n",
143 |     "\n",
144 |     "```python\n",
145 |     "import pyspark\n",
146 |     "sc = pyspark.SparkContext('local[*]')\n",
147 |     "\n",
148 |     "# do something to prove it works\n",
149 |     "rdd = sc.parallelize(range(1000))\n",
150 |     "rdd.takeSample(False, 5)\n",
151 |     "```\n",
152 |     "\n",
153 |     "If successful, you should get a list of 5 integers after a short delay.\n",
154 |     "\n",
155 |     "Save and exit the notebook.\n",
156 |     "\n",
157 |     "Cleap up in the Docker terminal\n",
158 |     "```\n",
159 |     "docker stop big_kilby\n",
160 |     "exit\n",
161 |     "```\n",
162 |     "\n",
163 |     "Use the machine name foudnd with `docker ps` in place of `big_kilby`.\n",
164 |     "\n"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {
171 |     "collapsed": true
172 |    },
173 |    "outputs": [],
174 |    "source": []
175 |   }
176 |  ],
177 |  "metadata": {
178 |   "kernelspec": {
179 |    "display_name": "Python 3",
180 |    "language": "python",
181 |    "name": "python3"
182 |   },
183 |   "language_info": {
184 |    "codemirror_mode": {
185 |     "name": "ipython",
186 |     "version": 3
187 |    },
188 |    "file_extension": ".py",
189 |    "mimetype": "text/x-python",
190 |    "name": "python",
191 |    "nbconvert_exporter": "python",
192 |    "pygments_lexer": "ipython3",
193 |    "version": "3.5.1"
194 |   }
195 |  },
196 |  "nbformat": 4,
197 |  "nbformat_minor": 0
198 | }
199 | 


--------------------------------------------------------------------------------
/misc/Recommended_Books.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Further Reading\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "Much of the material for learning Python for statistics and data science can be found online, but some of us still enjoy reading books ... For those of us diehard bibliophiles, these are books that I referred to for the course and enjoyed reading. They are lister in roughly the same order as the course lectures.\n",
 11 |     "\n",
 12 |     "- Python in a Nutshell by Steve Holden, Anna Ravenscroft and Alex Martelli (3rd edition)\n",
 13 |     "\n",
 14 |     "> A really nice reference for Python 3. \n",
 15 |     "\n",
 16 |     "- Python Cookbook:  Recipes for Mastering Python 3 by David Beazley, Brian K. Jones (3rd Edition)\n",
 17 |     "\n",
 18 |     "> When you want are stuck on a specific task and Stack Overflow is not working for you.\n",
 19 |     "\n",
 20 |     "- Learning IPython for Interactive Computing and Data Visualization by Cyrille Rossant  (2nd edition)\n",
 21 |     "\n",
 22 |     "> If you want to master Jupyter.\n",
 23 |     "\n",
 24 |     "- Fluent Python by Luciano Ramalho\n",
 25 |     "\n",
 26 |     "> Awesome resource for learning how to code in idiomatic Python like a Pythonista.\n",
 27 |     "\n",
 28 |     "- Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython by Wes McKinney\n",
 29 |     "\n",
 30 |     "> Pandas by the developer. A little dated but you can supplement with the online material.\n",
 31 |     "\n",
 32 |     "- Python Data Science Handbook by Jake VanderPlas\n",
 33 |     "\n",
 34 |     "> Still a work in progress, but it looks like the single best book for this course.\n",
 35 |     "\n",
 36 |     "- Bayesian Methods for Hackers: Probabilistic Programming and Bayesian Inference by Cameron Davidson-Pilon\n",
 37 |     "\n",
 38 |     "> Examples of how to use PyMC3.\n",
 39 |     "\n",
 40 |     "- High Performance Python by Micha Gorelick and Ian Ozsvald\n",
 41 |     "\n",
 42 |     "> Make your Python code faster.\n",
 43 |     "\n",
 44 |     "- Cython: A Guide for Python Programmers by Kurt W Smith\n",
 45 |     "\n",
 46 |     "> If you want to master Cython, this book is your guide.\n",
 47 |     "\n",
 48 |     "- 21st Century C: C Tips from the New School by Ben Klemens (2nd edition)\n",
 49 |     "\n",
 50 |     "> Modern C for statisticians.\n",
 51 |     "\n",
 52 |     "- Discovering Modern C++: An Intensive Course for Scientists, Engineers, and Programmers by Peter Gottschling\n",
 53 |     "\n",
 54 |     "> Awesome introduction to modern C++ (C++11 and C++14) for numerical work. Possibly too dense if you don't already have some familiarity with C/C++. \n",
 55 |     "\n",
 56 |     "- Learning Spark: Lightning-Fast Big Data Analysis by Holden Karau, Andy Konwinski, Patrick Wendell, Matei Zaharia (unfortunately Spark books tend to be outdated the moment they are printed - this edition covers Spark 1.3 and we are already at Spark 1.6)\n",
 57 |     "\n",
 58 |     "> Introduction to Spark with Java, Scala and Python examples.\n",
 59 |     "\n",
 60 |     "- Data Algorithms: Recipes for Scaling Up with Hadoop and Spark by Mahmoud Parsian\n",
 61 |     "\n",
 62 |     "> Sort of a cookbook with examples in Hadoop and Spark. Emphasis on biomedical applications. \n",
 63 |     "\n",
 64 |     "- Data Visualization with Python and JavaScript: Scrape, Clean, Explore & Transform Your Data by Kyran Dale\n",
 65 |     "\n",
 66 |     "> Still in early stages, but looks very promising. If I ever include lectures on data visualization, I suspect this book will be my reference."
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {
 73 |     "collapsed": true
 74 |    },
 75 |    "outputs": [],
 76 |    "source": []
 77 |   }
 78 |  ],
 79 |  "metadata": {
 80 |   "kernelspec": {
 81 |    "display_name": "Python 3",
 82 |    "language": "python",
 83 |    "name": "python3"
 84 |   },
 85 |   "language_info": {
 86 |    "codemirror_mode": {
 87 |     "name": "ipython",
 88 |     "version": 3
 89 |    },
 90 |    "file_extension": ".py",
 91 |    "mimetype": "text/x-python",
 92 |    "name": "python",
 93 |    "nbconvert_exporter": "python",
 94 |    "pygments_lexer": "ipython3",
 95 |    "version": "3.5.1"
 96 |   }
 97 |  },
 98 |  "nbformat": 4,
 99 |  "nbformat_minor": 0
100 | }
101 | 


--------------------------------------------------------------------------------
/misc/Spark_Test_Drive.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pyspark\n",
 12 |     "sc = pyspark.SparkContext('local[*]')"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 3,
 18 |    "metadata": {
 19 |     "collapsed": false
 20 |    },
 21 |    "outputs": [
 22 |     {
 23 |      "data": {
 24 |       "text/plain": [
 25 |        "4"
 26 |       ]
 27 |      },
 28 |      "execution_count": 3,
 29 |      "metadata": {},
 30 |      "output_type": "execute_result"
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "sc.defaultParallelism"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 2,
 40 |    "metadata": {
 41 |     "collapsed": false
 42 |    },
 43 |    "outputs": [
 44 |     {
 45 |      "data": {
 46 |       "text/plain": [
 47 |        "[452, 10, 725, 642, 670]"
 48 |       ]
 49 |      },
 50 |      "execution_count": 2,
 51 |      "metadata": {},
 52 |      "output_type": "execute_result"
 53 |     }
 54 |    ],
 55 |    "source": [
 56 |     "rdd = sc.parallelize(range(1000))\n",
 57 |     "rdd.takeSample(False, 5)"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 6,
 63 |    "metadata": {
 64 |     "collapsed": true
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "squares = rdd.map(lambda x: x**2)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 7,
 74 |    "metadata": {
 75 |     "collapsed": false
 76 |    },
 77 |    "outputs": [
 78 |     {
 79 |      "data": {
 80 |       "text/plain": [
 81 |        "332833500"
 82 |       ]
 83 |      },
 84 |      "execution_count": 7,
 85 |      "metadata": {},
 86 |      "output_type": "execute_result"
 87 |     }
 88 |    ],
 89 |    "source": [
 90 |     "squares.reduce(lambda x, y: x + y)"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 11,
 96 |    "metadata": {
 97 |     "collapsed": false
 98 |    },
 99 |    "outputs": [
100 |     {
101 |      "data": {
102 |       "text/plain": [
103 |        "332833500"
104 |       ]
105 |      },
106 |      "execution_count": 11,
107 |      "metadata": {},
108 |      "output_type": "execute_result"
109 |     }
110 |    ],
111 |    "source": [
112 |     "# check\n",
113 |     "sum(n**2 for n in range(1000))"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {
120 |     "collapsed": true
121 |    },
122 |    "outputs": [],
123 |    "source": []
124 |   }
125 |  ],
126 |  "metadata": {
127 |   "kernelspec": {
128 |    "display_name": "Python 3",
129 |    "language": "python",
130 |    "name": "python3"
131 |   },
132 |   "language_info": {
133 |    "codemirror_mode": {
134 |     "name": "ipython",
135 |     "version": 3
136 |    },
137 |    "file_extension": ".py",
138 |    "mimetype": "text/x-python",
139 |    "name": "python",
140 |    "nbconvert_exporter": "python",
141 |    "pygments_lexer": "ipython3",
142 |    "version": "3.5.1"
143 |   }
144 |  },
145 |  "nbformat": 4,
146 |  "nbformat_minor": 0
147 | }
148 | 


--------------------------------------------------------------------------------
/misc/TopicCoverageForMidterm.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Basic Python\n",
  8 |     "----\n",
  9 |     "\n",
 10 |     "- Appropriate use of data structures\n",
 11 |     "- Write a function\n",
 12 |     "- Higher order functions\n",
 13 |     "- Equivalents of for-loops\n",
 14 |     "- Basic string processing\n",
 15 |     "- Importing packages\n",
 16 |     "- Read and write to file\n",
 17 |     "\n",
 18 |     "Scientific computation\n",
 19 |     "----\n",
 20 |     "\n",
 21 |     "- Symbolic algebra with `sympy` e.g. perform integration\n",
 22 |     "- Working with `numpy` vectors and arrays\n",
 23 |     "- Using `scipy.linalg` routines\n",
 24 |     "- Graphing in `matplotlib` and `seaborn`\n",
 25 |     "- Split-apply-combine with `pandas` Series and DataFrames\n",
 26 |     "- Finding roots and minima with `scipy.optimize`\n",
 27 |     "- Pre-processing, unsupervised and supervised learning with `sklearn`\n",
 28 |     "\n",
 29 |     "Linear Algebra\n",
 30 |     "----\n",
 31 |     "\n",
 32 |     "- Linear combinations and independence\n",
 33 |     "- Change of basis and similar matrices\n",
 34 |     "- $A = LU$ (Gaussian elimination)\n",
 35 |     "- $A = LL^T$ (Symmetry)\n",
 36 |     "- $A = QR$ (Orthogonality)\n",
 37 |     "- $A = S\\Lambda S^{-1}$ (Eigenvalues and eigenvectors)\n",
 38 |     "- $A = Q\\Lambda Q^T$ (Diagonalization of symmetric matrix)\n",
 39 |     "- $A = U\\Sigma V^T$ (Singular values)\n",
 40 |     "- Positive definite matrices\n",
 41 |     "- $A^TA\\hat{x} = A^Tb$ (Projection and least squares)\n",
 42 |     "\n",
 43 |     "Calculus\n",
 44 |     "----\n",
 45 |     "\n",
 46 |     "- Evaluating critical points\n",
 47 |     "- Taylor series\n",
 48 |     "- Gradient, Jacobian and Hessian\n",
 49 |     "- Calculating conjugate gradients\n",
 50 |     "- Newton method (univariate and multivariate)\n",
 51 |     "- Gradient descent (batch and stochastic)\n",
 52 |     "- Lagrange multipliers for constrained problems\n",
 53 |     "\n",
 54 |     "EM\n",
 55 |     "----\n",
 56 |     "\n",
 57 |     "- Likelihood and log-likelihood\n",
 58 |     "- Jensen's inequality\n",
 59 |     "- Basic concept of the EM algorithm\n",
 60 |     "- K-means algorithm\n",
 61 |     "\n",
 62 |     "Random numbers\n",
 63 |     "----\n",
 64 |     "\n",
 65 |     "- Inverse transform method\n",
 66 |     "- Familiar with  probability distributions from `numpy.random` and `scipy.stats`\n",
 67 |     "- How to calculate quantiles, PDF and CDF and other statistics of a random variable\n",
 68 |     "\n",
 69 |     "Resampling and Monte Carlo simulations\n",
 70 |     "----\n",
 71 |     "\n",
 72 |     "- Bootstrap\n",
 73 |     "- Leave-one out calculations (including LOOCV)\n",
 74 |     "- Empirical CDF and kernel density estimation\n",
 75 |     "\n",
 76 |     "Numerical integration\n",
 77 |     "----\n",
 78 |     "\n",
 79 |     "- Quadrature with `scipy.integrate` methods\n",
 80 |     "- Why Monte Carlo integration works\n",
 81 |     "- Change of variables\n",
 82 |     "- Importance sampler\n",
 83 |     "- Other Monte Carlo swindles\n",
 84 |     "\n",
 85 |     "Markov Chain Monte Carlo\n",
 86 |     "----\n",
 87 |     "\n",
 88 |     "- Working with Markov matrices (linear algebra revisited)\n",
 89 |     "- Conditions for convergence to equilibrium distribution\n",
 90 |     "- Detailed balance\n",
 91 |     "- How they work\n",
 92 |     "  - Random walk sampler\n",
 93 |     "  - Gibbs sampler\n",
 94 |     "  - Slice sampler\n",
 95 |     "  - HMC sampler\n",
 96 |     "- Use of PyMC3 and PyStan to fit simple hierarchical models\n"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {
103 |     "collapsed": true
104 |    },
105 |    "outputs": [],
106 |    "source": []
107 |   }
108 |  ],
109 |  "metadata": {
110 |   "kernelspec": {
111 |    "display_name": "Python 3",
112 |    "language": "python",
113 |    "name": "python3"
114 |   },
115 |   "language_info": {
116 |    "codemirror_mode": {
117 |     "name": "ipython",
118 |     "version": 3
119 |    },
120 |    "file_extension": ".py",
121 |    "mimetype": "text/x-python",
122 |    "name": "python",
123 |    "nbconvert_exporter": "python",
124 |    "pygments_lexer": "ipython3",
125 |    "version": "3.5.1"
126 |   }
127 |  },
128 |  "nbformat": 4,
129 |  "nbformat_minor": 0
130 | }
131 | 


--------------------------------------------------------------------------------
/misc/old-exams/Midterm-Sample-Revised.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Instructions\n",
  8 |     "----\n",
  9 |     "\n",
 10 |     "This is a \"closed book\" examination - in particular, you are not to use any resources outside of this notebook (except possibly pen and paper). You may consult help from within the notebook using ? but not any online references. You should turn wireless off or set your laptop in \"Airplane\" mode prior to taking the exam. \n",
 11 |     "\n",
 12 |     "You have 1 hour and 45 minutes to complete the exam."
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 1,
 18 |    "metadata": {
 19 |     "collapsed": false
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import os\n",
 24 |     "import sys\n",
 25 |     "import glob\n",
 26 |     "import matplotlib.pyplot as plt\n",
 27 |     "import numpy as np\n",
 28 |     "import pandas as pd\n",
 29 |     "%matplotlib inline\n",
 30 |     "plt.style.use('ggplot')"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 2,
 36 |    "metadata": {
 37 |     "collapsed": false
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "np.set_printoptions(formatter={'float': '{: 0.3f}'.format})"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 3,
 47 |    "metadata": {
 48 |     "collapsed": false
 49 |    },
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "%load_ext rpy2.ipython"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "**Question 1 (10 points)**. \n",
 60 |     "\n",
 61 |     "Euclid's algorithm for finding the greatest common divisor of two numbers is\n",
 62 |     "\n",
 63 |     "```python\n",
 64 |     "gcd(a, 0) = a\n",
 65 |     "gcd(a, b) = gcd(b, a modulo b)\n",
 66 |     "```\n",
 67 |     "\n",
 68 |     "1. Write a function to find the greatest common divisor in Python (4 poinst)\n",
 69 |     "2. What is the greatest common divisor of 17384 and 1928? (1 point)\n",
 70 |     "3. Write a function to calculate the least common multiple (4 points)\n",
 71 |     "4. What is the least common multiple of 17384 and 1928? (1 point)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {
 78 |     "collapsed": false
 79 |    },
 80 |    "outputs": [],
 81 |    "source": []
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {
 87 |     "collapsed": false
 88 |    },
 89 |    "outputs": [],
 90 |    "source": []
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {
 96 |     "collapsed": false
 97 |    },
 98 |    "outputs": [],
 99 |    "source": []
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {
105 |     "collapsed": false
106 |    },
107 |    "outputs": [],
108 |    "source": []
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {
114 |     "collapsed": false
115 |    },
116 |    "outputs": [],
117 |    "source": []
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "**Question 2 (10 points)**. \n",
124 |     "\n",
125 |     "Consider the following matrix $A$ with dimensions (4,6), to be interpreted as 4 rows of the measurements of 6 features.\n",
126 |     "```python\n",
127 |     "np.array([[5, 5, 2, 6, 2, 0],\n",
128 |     "          [8, 6, 7, 8, 9, 7],\n",
129 |     "          [9, 5, 0, 4, 6, 8],\n",
130 |     "          [8, 7, 9, 3, 6, 1]])\n",
131 |     "```\n",
132 |     "\n",
133 |     "1. Add 1 to the first row, 2 to the second row, 3 to the third row and 4 to the fourth row using a vector `v = np.array([1,2,3,4])` and broadcasting. (2 points)\n",
134 |     "2. Normalize A so that its row means are all 0 and call it A1. (2 points)\n",
135 |     "3. What are the singular values of A1? (2 points)\n",
136 |     "4. What are the eigenvalues of the covariance matrix of A1? (2 points)\n",
137 |     "5. Find the least squares solution vector $x$ if $Ax = y$ where `y = np.array([1,2,3,4]).T` (2 points)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {
144 |     "collapsed": false
145 |    },
146 |    "outputs": [],
147 |    "source": []
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "metadata": {
153 |     "collapsed": false
154 |    },
155 |    "outputs": [],
156 |    "source": []
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {
162 |     "collapsed": false
163 |    },
164 |    "outputs": [],
165 |    "source": []
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {
171 |     "collapsed": false
172 |    },
173 |    "outputs": [],
174 |    "source": []
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {
180 |     "collapsed": false
181 |    },
182 |    "outputs": [],
183 |    "source": []
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "**Question 3 (10 points)**.\n",
190 |     "\n",
191 |     "1. Prove that $e^{x^2 + y^2}$ is a convex function. (5 points)\n",
192 |     "2. Using `scipy.optimize`, find the values of $x$ and $y$ that minimize $e^{x^2 + y^2}$ in the unconstrained case and in the presence of the constraint that $x + y = 3$. Use (1,1) as a starting guess (5 points)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {
199 |     "collapsed": false
200 |    },
201 |    "outputs": [],
202 |    "source": []
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {
208 |     "collapsed": false
209 |    },
210 |    "outputs": [],
211 |    "source": []
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "**Question 4 (10 points)**.\n",
218 |     "\n",
219 |     "A milkmaid is at point A and needs to get to point B. However, she also needs to fill a pail of water from the river en route from A to B. The equation of the river's path is shown in the figure below. What is the minimum distance she has to travel to do this?\n",
220 |     "\n",
221 |     "1. Solve using `scipy.optimize` and constrained minimization.\n",
222 |     "2. Solve without using `scipy.optimize`. Hint: Use Lagrange \n",
223 |     "\n",
224 |     "![Milkmaid problem](milkmaid.png)"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {
231 |     "collapsed": false
232 |    },
233 |    "outputs": [],
234 |    "source": []
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {
240 |     "collapsed": false
241 |    },
242 |    "outputs": [],
243 |    "source": []
244 |   },
245 |   {
246 |    "cell_type": "markdown",
247 |    "metadata": {},
248 |    "source": [
249 |     "**Question 5 (10 points)**. \n",
250 |     "\n",
251 |     "Find the minimum of the following quadratic function on $\\mathbb{R}^4$ \n",
252 |     "\n",
253 |     "$$f(x) = x^TAx +b^Tx +c$$\n",
254 |     "where\n",
255 |     "$$A = \\left(\\begin{matrix}13&5&0&0\\\\5&7&0&0\\\\0&0&20&-7\\\\0&0&-7&12\\end{matrix}\\right), b = \\left(\\begin{matrix}1\\\\1\\\\1\\\\1\\end{matrix}\\right) \\textrm {and } c = 2$$\n",
256 |     "\n",
257 |     "and $x$ is a column vector.\n",
258 |     "\n",
259 |     "a. Using scipy.optimize (4 points)\n",
260 |     "\n",
261 |     "b. Using a matrix decomposition method (library functions - no need to code your own).  Note: for full credit you should exploit matrix structure. (4 points)\n",
262 |     "\n",
263 |     "c. Find the minimum under the constraint $||x||^2 = 1$ (i.e. on the unit sphere in $\\mathbb{R}^4$). (2 points)\n",
264 |     "\n",
265 |     "**Note: Do not be overly concerned if your values for $x$ at the minimum do not match exactly **\n"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": null,
271 |    "metadata": {
272 |     "collapsed": false
273 |    },
274 |    "outputs": [],
275 |    "source": []
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {
281 |     "collapsed": false
282 |    },
283 |    "outputs": [],
284 |    "source": []
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {
290 |     "collapsed": false
291 |    },
292 |    "outputs": [],
293 |    "source": []
294 |   },
295 |   {
296 |    "cell_type": "markdown",
297 |    "metadata": {},
298 |    "source": [
299 |     "**Question 6 (10 points)**.\n",
300 |     "\n",
301 |     "Given the set of vectors\n",
302 |     "\n",
303 |     "```\n",
304 |     "v1 = np.array([1,2,3])\n",
305 |     "v2 = np.array([2,4,7])\n",
306 |     "v3 = np.array([1,0,1])\n",
307 |     "```\n",
308 |     "\n",
309 |     "1. Calculate the pairwise Euclidean distance matrix using nested for loops. (2 points)\n",
310 |     "2. Calculate the pairwise Euclidean distance matrix using numpy broadcasting. (3 points)\n",
311 |     "3. Find an orthogonal basis for the space spanned by these vectors without using any functions from `numpy.linag` or `scipy.linalg` (5 points)"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {
318 |     "collapsed": false
319 |    },
320 |    "outputs": [],
321 |    "source": []
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": null,
326 |    "metadata": {
327 |     "collapsed": false
328 |    },
329 |    "outputs": [],
330 |    "source": []
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": null,
335 |    "metadata": {
336 |     "collapsed": false
337 |    },
338 |    "outputs": [],
339 |    "source": []
340 |   }
341 |  ],
342 |  "metadata": {
343 |   "kernelspec": {
344 |    "display_name": "Python 2",
345 |    "language": "python",
346 |    "name": "python2"
347 |   },
348 |   "language_info": {
349 |    "codemirror_mode": {
350 |     "name": "ipython",
351 |     "version": 2
352 |    },
353 |    "file_extension": ".py",
354 |    "mimetype": "text/x-python",
355 |    "name": "python",
356 |    "nbconvert_exporter": "python",
357 |    "pygments_lexer": "ipython2",
358 |    "version": "2.7.11"
359 |   }
360 |  },
361 |  "nbformat": 4,
362 |  "nbformat_minor": 0
363 | }
364 | 


--------------------------------------------------------------------------------
/misc/old-exams/milkmaid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cliburn/sta-663-2016/59f8683af6c04fd3f13ad907ab963c4fbcb8daea/misc/old-exams/milkmaid.png


--------------------------------------------------------------------------------
/projects/FinalProject.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Instructions for Final Project (25%)\n",
  8 |     "----\n",
  9 |     "\n",
 10 |     "**Note**: As uusal, this work should be your own original effort. Please do not just copy and paste someone else's code from the web - that would be a serious violation of academic integrity. However, if there is an existing implemetation, it would make sense to do a comparative analysis for accuracy, features and execution speed.\n",
 11 |     "\n",
 12 |     "**Important**: Papers can be found in Sakai/Resources/FinalProjectPapers. If you intend to use a ppaer not in the folder, please remember that it must be aproved by Cliburn/Janice first!\n",
 13 |     "\n",
 14 |     "- Deadline for submission of progress report 1 6 Arpil 2016 (10 points)\n",
 15 |     "    - Choice of paper\n",
 16 |     "    - Set up Github repository for final project\n",
 17 |     "    - Preliminary section and subsection headings\n",
 18 |     "    - Abstract of project (150-250 words)\n",
 19 |     "- Deadline for submission of progress report 2 22 April 2016 (10 points)\n",
 20 |     "    - Code written and working on simulated data set\n",
 21 |     "    - Background written\n",
 22 |     "    - Flesh out sections and subsections as appropriate\n",
 23 |     "- Deadline for submission of final report 30 April 2016 (80 points)\n",
 24 |     "    - See rubric at bottom of notebook\n",
 25 |     "\n",
 26 |     "For the final project, you will need to implement a \"new\" statistical algorithm in Python from the research literature and write a report on it (in an Jupyter notebook, of course!). As a guide, the report should include most or all of the following sections, although it is up to you which aspects you choose to focus on (e.g. if the algorithm is highly complex, just developing and testing the Python code might be the main emphasis; alternatively, if the algorithm is quite straightforward, the emphasis can be on making a Spark version and benchmarking it):\n",
 27 |     "\n",
 28 |     "Note that for this project, you will be working with IPython notebooks, python scripts (e.g. test code), possibly code in other languages (e.g. Cython, C), data files and a MakeFile.\n",
 29 |     "\n",
 30 |     "### Background\n",
 31 |     "\n",
 32 |     "State the research paper you are using. Describe the concept of the algorithm and why it is interesting and/or useful. If appropriate, describe the mathematical basis of the algorithm. Some potential topics for the background include:\n",
 33 |     "\n",
 34 |     "- What problem does it address? \n",
 35 |     "- What are known and possible applications of the algorithm? \n",
 36 |     "- What are its advantages and disadvantages relative to other algorithms?\n",
 37 |     "- How will you use it in your research?\n",
 38 |     "\n",
 39 |     "### Implementation\n",
 40 |     "\n",
 41 |     "Implement the algorithm as a Python function or family of functions as clearly as possible. Can your code be understood by someone with a similar quantitative background as yours? Does it follow good coding conventions?\n",
 42 |     "\n",
 43 |     "### Testing\n",
 44 |     "\n",
 45 |     "Write unit tests for your functions and make sure they pass. These should include both common and edge cases (e.g. test at boundaries - what happens if the input is an empty vector etc.).\n",
 46 |     "\n",
 47 |     "### Optimization\n",
 48 |     "\n",
 49 |     "Profile the performance of the algorithm and identify bottlenecks. Eliminate these bottlenecks where possible using vectorization, Cython and/or just-in-time compiling. Document what you did and the resulting performance improvement.\n",
 50 |     "\n",
 51 |     "### High performance computing\n",
 52 |     "\n",
 53 |     "If you can identify tasks that can be performed in parallel, parallelize the algorithm. Depending on the algorithm, this may involve the use of parallel or distributed computing (or some combination of them). If the algorithm cannot be parallelized, more weight will be given to how effectively you perform optimization using Cython or just-in-time compilation. Document what you did and the resulting performance improvement.\n",
 54 |     "\n",
 55 |     "### Application and comparison\n",
 56 |     "\n",
 57 |     "Apply the algorithm to a \"real\" problem (this can be simulated data) to show how it works and to compare it with at least one other algorithm that addresses the same problem from some available Python or R package. \n",
 58 |     "\n",
 59 |     "### Reproducible analysis\n",
 60 |     "\n",
 61 |     "You are expected to perform the entire development process using a new GitHub repository with regular commits. The GitHub repository should contain a Makefile that can run all tests, as well as perform profiling and benchmarking.\n",
 62 |     "\n",
 63 |     "Rough Rubric \n",
 64 |     "---\n",
 65 |     "\n",
 66 |     "Interim submissions will count towards 20% of the grade and the final submission will make up the remaining 80%. The rubric is only meant as a guide - for example, if the algorihtm is highly complex, we will take that into account and give extra credit for a correct algorithm, even if it is not fully optimized.\n",
 67 |     "\n",
 68 |     "- Review of algorithm and the problems it solves (10 points)\n",
 69 |     "    - Literature review\n",
 70 |     "    - Clear explanation of the main ideas behind the algorithm\n",
 71 |     "    - Describe why the algorithm is useful\n",
 72 |     "- Good programming practices (10 points)\n",
 73 |     "    - Use of Github\n",
 74 |     "    - Use of literate programming in Jupyter\n",
 75 |     "    - Generate final report as PDF via script (e.g. Makefile)\n",
 76 |     "    - Commenting and use of dcostrings\n",
 77 |     "- Implementation of a working program (30 points)\n",
 78 |     "    - Does it run?\n",
 79 |     "    - Is it correct? How do you know? Use of tests\n",
 80 |     "    - Is it written cleanly and efficiently?\n",
 81 |     "- Optimization efforts (15 points)\n",
 82 |     "    - Opportunities for vectorization\n",
 83 |     "    - Opportunities to use JIT or Cython\n",
 84 |     "    - Opportunities to use multi-core machines\n",
 85 |     "    - Opportunities to use Spark for distributed programming\n",
 86 |     "- Applications (15 points)\n",
 87 |     "    - Application to simulated or toy datasets\n",
 88 |     "    - Application to one real data set\n",
 89 |     "    - Discussion of results"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 0,
 95 |    "metadata": {
 96 |     "collapsed": false
 97 |    },
 98 |    "outputs": [],
 99 |    "source": []
100 |   }
101 |  ],
102 |  "metadata": {
103 |   "kernelspec": {
104 |    "display_name": "Python 2",
105 |    "language": "python",
106 |    "name": "python2"
107 |   },
108 |   "language_info": {
109 |    "codemirror_mode": {
110 |     "name": "ipython",
111 |     "version": 2
112 |    },
113 |    "file_extension": ".py",
114 |    "mimetype": "text/x-python",
115 |    "name": "python",
116 |    "nbconvert_exporter": "python",
117 |    "pygments_lexer": "ipython2",
118 |    "version": "2.7.11"
119 |   }
120 |  },
121 |  "nbformat": 4,
122 |  "nbformat_minor": 0
123 | }
124 | 


--------------------------------------------------------------------------------
/projects/FinalProjectGuide.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Final Project Workflow\n",
  8 |     "====\n",
  9 |     "\n",
 10 |     "There is a lot to do, and these are simply suggestions - nobody is expected to do *all* the suggested steps, and there may be alternative strategies you choose to implement. I suggest that you do most of your development in a Jupyter notebook, supplemented by a code editor if necessary (especially if you are writing C/C++), making use of Markdown cells to document what you are doing. That way, you only have to clean up and refine the notebook and you have a final project ready for submission.  \n",
 11 |     "\n",
 12 |     "### Week 0\n",
 13 |     "\n",
 14 |     "* Choose paper\n",
 15 |     "* Identify algorithm to implement\n",
 16 |     "* Write abstract and outline of approach\n",
 17 |     "\n",
 18 |     "### Week 1\n",
 19 |     "\n",
 20 |     "* Code algorithm in Python\n",
 21 |     "    * Write modular code\n",
 22 |     "    * Functional core - use pure functions where possible\n",
 23 |     "    * Imperative shell - minimize stateful code to interactions and I/O\n",
 24 |     "* Write tests to check correctness\n",
 25 |     "    * Check boundary conditions\n",
 26 |     "    * Are there known analytic/asymptotic solutions to compare against?\n",
 27 |     "    * Are there other packages implementing the algorithm to compare against?\n",
 28 |     "    * Are there alternative algorithms that should give the same answer?\n",
 29 |     "\n",
 30 |     "**Deadline for 1st progress report: 6th April 2016**\n",
 31 |     "\n",
 32 |     "### Week 2\n",
 33 |     "\n",
 34 |     "* Profile for speed\n",
 35 |     "    * Use cProfile and the `prun` magic\n",
 36 |     "    * Identify performance bottlenecks\n",
 37 |     "* Optimize slow functions \n",
 38 |     "    * Consider using `line_profiler` if necessary\n",
 39 |     "    * Consider the following strategies:\n",
 40 |     "        * More idiomatic Python\n",
 41 |     "        * Cache results (e.g. `lru_cache` decorator)?\n",
 42 |     "        * Better data structure?\n",
 43 |     "        * Better algorithm?\n",
 44 |     "        * Vectorize with `numpy` or `pandas`\n",
 45 |     "        * Use a JIT compiler (e.g. `numba`)\n",
 46 |     "        * USe `Cython` to recode function\n",
 47 |     "        * Write C/C++ function and wrap for use in Python\n",
 48 |     "\n",
 49 |     "### Week 3\n",
 50 |     "\n",
 51 |     "* Write parallel code\n",
 52 |     "    * Using Cython `prange` and `openmp`\n",
 53 |     "    * Using threads\n",
 54 |     "    * Using processes\n",
 55 |     "* Scaling for massive data sets\n",
 56 |     "    * Using appropriate data storage (e.g. HDF5, databases)\n",
 57 |     "    * Using `pyspark` for distributed computing\n",
 58 |     "* Re-run tests after optimization to check that output has not changed \n",
 59 |     "* Comparative analysis for each new version with `time` and `timeit` magic \n",
 60 |     "* Applications \n",
 61 |     "    * Apply to simulated data sets\n",
 62 |     "    * Apply to real data sets \n",
 63 |     "\n",
 64 |     "**Deadline for 2nd progress report: 22nd April 2016**\n",
 65 |     "\n",
 66 |     "### Week 4\n",
 67 |     "\n",
 68 |     "* Packaging\n",
 69 |     "    * Bundle code into a package for distribution\n",
 70 |     "    * Provide instructions for installation on GitHub\n",
 71 |     "    * Upload to Python Package Index if appropriate\n",
 72 |     "* Clean up work and documentation\n",
 73 |     "\n",
 74 |     "### Submission\n",
 75 |     "\n",
 76 |     "* Submit final project \n",
 77 |     "    * As a `Jupyter` notebook or series of notebooks using literate programming\n",
 78 |     "        * Generate PDF if possible\n",
 79 |     "        * Use `nbsphinx` to convert to HTML if appropriate\n",
 80 |     "    * As a LaTeX file, using `make` to automate document generation\n",
 81 |     "\n",
 82 |     "**Deadline for final report: 22nd April 2016**"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {
 89 |     "collapsed": true
 90 |    },
 91 |    "outputs": [],
 92 |    "source": []
 93 |   }
 94 |  ],
 95 |  "metadata": {
 96 |   "kernelspec": {
 97 |    "display_name": "Python 3",
 98 |    "language": "python",
 99 |    "name": "python3"
100 |   },
101 |   "language_info": {
102 |    "codemirror_mode": {
103 |     "name": "ipython",
104 |     "version": 3
105 |    },
106 |    "file_extension": ".py",
107 |    "mimetype": "text/x-python",
108 |    "name": "python",
109 |    "nbconvert_exporter": "python",
110 |    "pygments_lexer": "ipython3",
111 |    "version": "3.5.1"
112 |   }
113 |  },
114 |  "nbformat": 4,
115 |  "nbformat_minor": 0
116 | }
117 | 


--------------------------------------------------------------------------------