├── .gitignore ├── LICENSE ├── advmcmc ├── advmcmc.Rmd └── advmcmc_HW10.Rmd ├── constrained └── constrained.Rmd ├── em ├── em.Rmd └── em_HW6.Rmd ├── large ├── datatable.Rmd ├── hdf5.Rmd ├── large_HW4.Rmd ├── sparse.Rmd └── sqlite.Rmd ├── mcmc ├── alzheimers.dat ├── envelope.png ├── mcmc.Rmd ├── mcmc_HW8.Rmd ├── mcmc_HW9.Rmd └── mixture.dat ├── ml ├── essentials.Rmd ├── essentials_HW11.Rmd ├── nn.Rmd ├── rf.Rmd ├── svm.Rmd └── svm_HW12.Rmd ├── numint ├── alzheimers.dat ├── chap6figrules.jpg ├── numint.Rmd └── numint_HW7.Rmd ├── optim ├── chap2figgoodnewt.jpg ├── chap2figgoodnewt.png ├── optim.Rmd └── optim_HW5.Rmd └── rpkg ├── build.Rmd ├── debug.Rmd ├── document.Rmd ├── efficient.Rmd ├── profvis.png ├── rcpp.Rmd ├── rpkg_HW1.Rmd ├── rpkg_HW2.Rmd ├── rpkg_HW3.Rmd ├── s_diagram.png └── test.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | *.html 6 | rpkg/foo* 7 | large/* 8 | ml/* 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 biodatascience 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /advmcmc/advmcmc_HW10.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HW 10 - advMCMC" 3 | author: "Naim Rashid" 4 | date: "2/20/2019" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # MCMC extension of HW 6 13 | 14 | We would like to simulate from the posterior distribution of parameter $\boldsymbol{\theta} = (\pi,\lambda)$ pertaining to the fishing dataset and zero-inflated poisson model described in HW 6, assuming $\pi$ has a Unif(0,1) prior, and $\lambda$ has a Gamma(2,2) prior (shape and scale = 2). The joint posterior can be written as $f(\pi,\lambda | \boldsymbol{y}) \propto f(\boldsymbol{y} | \pi, \lambda)f(\pi,\lambda) = f(\boldsymbol{y} | \pi, \lambda)f(\pi)f(\lambda)$, where $f(\boldsymbol{y} | \pi,\lambda)$ is the likelihood give in HW 6 except with $\lambda$ unknown, $f(\pi)$ is the specified prior for $\pi$, and $f(\lambda)$ is the specified prior for $\lambda$. 
15 | 16 | Implement a MH random walk procedure to sample from the joint posterior of $\boldsymbol{\theta} = (\pi,\lambda)$. You do not necessarily need to do a change of variable for $\pi$ or $\lambda$, however for proposals that exceed the boundaries of the parameter space of either parameter, the posterior for the propsal should be set = 0 (MH ratio = 0). You may want to consider a narrower random walk variance in such as setting as well. 17 | 18 | You may use the following code below to get started, using $M = 20000$, random seed (1), starting values ($\pi^{(0)} = 0.3$, $\lambda = 3$), and burn-in period (2000 iterations) for all implementations of the algorithm below. Report the posterior means for $\pi$ and $\lambda$, as well as diagnostics such as trace plots and autocorrelation plots. 19 | 20 | ```{r} 21 | ### HELPER FUNCTIONS 22 | 23 | # log prior for lambda, fill in 24 | lplambda = function(lambda){ 25 | 26 | ## start solution 27 | 28 | ## end solution 29 | 30 | } 31 | 32 | # log prior for pi, fill in 33 | lppi = function(pi){ 34 | 35 | ## start solution 36 | 37 | ## end solution 38 | 39 | } 40 | 41 | # bivariate RW proposal function 42 | # hint: bivariate proposal same as generating two indep proposals here 43 | h.sim = function(){ 44 | 45 | ## start solution 46 | 47 | ## end solution 48 | 49 | } 50 | 51 | # returns ll, or log f(y|lambda, pi) 52 | # compute given y and ny from table 53 | ll = function(y, ny, x){ 54 | pi = x[1] 55 | lambda = x[2] 56 | 57 | ## start solution 58 | 59 | ## end solution 60 | } 61 | 62 | # MH ratio 63 | # Hint; since h symmetric, proposal density cancels out of ratio 64 | R = function(y, y_weight, x, xt){ 65 | # x is the proposal, xt is current state 66 | # x[1],xt[1] pertain to pi, x[2],xt[2] pertain to lambda 67 | ## start solution 68 | 69 | ## end solution 70 | } 71 | 72 | ``` 73 | 74 | Now start the main code for the sampler 75 | 76 | ```{r} 77 | # set the seed 78 | set.seed(1) 79 | 80 | # data fro HW 6 81 | y = 0:6 82 | ny = c(3062, 587, 284, 103, 33, 4, 2) 83 | 84 | # Set chain length 85 | M = 20000 86 | 87 | # initialize the chain vector (alpha, lambda) 88 | x.rw.chain = matrix(0, M, 2) 89 | colnames(x.rw.chain) = c("pi","lambda") 90 | 91 | # Initialize chain with specified initial values 92 | # alpha, lambda 93 | x.rw.chain[1,] = c(0.3, 3) 94 | 95 | # now start chain 96 | for(i in 1:(M-1)){ 97 | 98 | # set the value at current iteration of the chain to variable xt 99 | xt = 100 | 101 | # draw a proposal from the proposal density 102 | x = 103 | 104 | # calculate MH ratio 105 | r = 106 | 107 | # Generate draw from bernoulli(p). 108 | keep = rbinom(1, 1, r) 109 | 110 | # if keep = 1, then set next iteration equal to then proposal 111 | if(keep == 1){ 112 | x.rw.chain[i+1,] = x 113 | }else{ 114 | # otherwise, carry over value from the current iteration 115 | x.rw.chain[i+1,] = xt 116 | } 117 | } 118 | 119 | ## print posterior means and diagnostic plots. 
Comment on convergence 120 | 121 | 122 | ``` 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /constrained/constrained.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "constrained" 3 | author: "Naim Rashid" 4 | date: "12/5/2018" 5 | output: 6 | html_document: 7 | number_sections: true 8 | header_includes: 9 | - \usepackage{amsmath} 10 | - \usepackage{amssymb} 11 | - \usepackage{amsthm} 12 | include-before: 13 | - '\newcommand{\bfm}[1]{\ensuremath{\mathbf{#1}}}' 14 | - '\newcommand{\bftm}[1]{\ensuremath{\mathbf{#1$^{T}$}}}' 15 | - '\newcommand{\bdm}[1]{\ensuremath{\boldsymbol{#1}}}' 16 | - '$\def \A \bfm{A}$' 17 | - '$\def \b \bfm{b}$' 18 | - '$\def \tA \bftm{A}$' 19 | - '$\def \d \bfm{d}$' 20 | - '$\def \e \bfm{e}$' 21 | - '$\def \g \bfm{g}$' 22 | - '$\def \I \bfm{I}$' 23 | - '$\def \l \bfm{l}$' 24 | - '$\def \M \bfm{M}$' 25 | - '$\def \W \bfm{W}$' 26 | - '$\def \y \bfm{y}$' 27 | - '$\def \Y \bfm{Y}$' 28 | - '$\def \X \bfm{X}$' 29 | - '$\def \x \bfm{x}$' 30 | - '$\def \tx \bftm{x}$' 31 | - '$\def \z \bfm{z}$' 32 | - '$\def \betab \bdm{\beta}$' 33 | - '$\def \Omegab \bdm{\Omega}$' 34 | - '$\def \pib \bdm{\pi}$' 35 | - '$\def \thetab \bdm{\theta}$' 36 | - '$\def \epsilonb \bdm{\epsilon}$' 37 | --- 38 | 39 | ```{r setup, include=FALSE} 40 | knitr::opts_chunk$set(echo = TRUE) 41 | ``` 42 | 43 | # Introduction 44 | 45 | In the previous lectures we talked about approaches that seek to optimize particular functions with respect to a given set of parameters. Such functions could be simple with only a single parameter, or complex such as a likelihood function with multiple unknown parameters. 46 | 47 | In this lecture we will connect some of the optimization approaches discussed earlier with alternative approaches such as Linear Programming and Quadratic Programming. We will see that some of the problems we described earlier can be reformulated into and unconstrained Linear or Quadratic programming problem. The reason for making such a connection is that once formulated into these settings, we may apply standard and general off the shelf solvers to obtain parameter estimates. 48 | 49 | We will discuss both the unconstrained and constrained optimization setting, where in the latter one often places some sort of constraint on the parameters to be optimized over. There are various types of constaints that one may select, and again reformulating one's problem may allow for the application of avaliable solvers for constrained Linear/Quadratic Programming problems. 50 | 51 | From here, we will segway into penalized likelhood estimation and show that in certain cases we may use Linear/Quadratic programming to similarly solve such constrained maximization problems. In the regression setting, penalized likelihood estimation is often used for the purposes of variable selection in high dimensional settings. We will discuss several procedures in the literature for performing maximization in such settings, and talk about efficient implementations. We will also discuss the impact of the choice of penalty. 52 | 53 | As a side note, there is a rich literature on Linear and Quadratic Programming topics stemming back many decades, however we will only cover a small portion of this topic related to the upcoming topic of Support Vector Machines (SVMs) in Module 3 of this course. 
We will also cover topics that are relevant to previously discussed statistical estimation problems and penalized likelihood estimations. The discussion on variable selection via penalized likelihood will connect with the material presented in the chapter 4 of Bios 761. 54 | 55 | # Unconstrained Optimization 56 | 57 | The term "unconstrained" in Unconstrained Optimization relates to the parameters in the function that are being optimized over. That is, no bounds or limits are being placed on the parameters or functions of the parameters when trying to minimize the objective function of interest. 58 | 59 | To introduce the topic, let us start with a familiar problem. In the first lecture, we talked about maximum likelihood estimation in the context of linear regression. In this setting, let us assume that we have an $n\times 1$ vector of observed responses $\y$ and an $n \times p$ full rank matrix of predictors $\X$. We assume that $$\y = \X\betab + \epsilonb,$$ where $\betab$ is a $p \times 1$ vector of unknown parameters to be estimated, $\epsilonb$ is an $n \times 1$ vector of unobserved errors such that $\epsilon_i\sim N(0,\sigma^2)$ for $i = 1\ldots n$, and $\sigma^2$ is the variance of each of the unobserved errors (also unknown). In doing so, we assume that the relationship between $\y$ and $\X\beta$ is linear, with some error. 60 | 61 | In our first lecture, we discussed for the intercept-only model case how one may perform maximum likelihood estimation to obtain estimates for $\betab$. We can also show how this approach is equivalent to the problem of minimizing the regression sum of squares $\sum_{i = 1}^n (y_i - \X_i\betab)^2$, where $\X_i$ is the set of covariates pertaining to the $i'th$ subject. We also may write this as $(\y - \X\betab)^T(\y - \X\betab)$. There is a close form for the minimized of the RSS, and during the derivation of which we arrive at the normal equations $\tX\X\betab = \tX\y$. This implies that $\hat{\betab} = (\tX\X)^{-1}\tX\y$, our closed form solution for the minimizer of the RSS. 62 | 63 | Recall that we do not explicitly make any assumptions regarding $\epsilonb$ when decide to obtain $\hat{\betab}$ through simply minimizing the RSS. If we make the additional usual assumption that $\epsilon_i\sim N(0,\sigma^2)$ for $i = 1\ldots n$, then we know that this minimizer is also the UMVUE estimator for $\beta$. This assumption is implicit when we obtain $\hat{\beta}$ instead through maximum likelihood estimation. 64 | 65 | Alternatively, we may use unconstrained quadratic programming to obtain the minimizer of the RSS. 66 | 67 | Let us write the form of this Unconstrained Quadratic Programming problem as the following: 68 | 69 |
Minimize $||Ax - b||_2^2 = \tx\tA\A\x - 2\b^T\A\x + \b^T\b$ over $\x \in \mathbb{R}^p$
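Before mapping this notation back to the regression problem, here is a minimal sketch (on simulated data; not part of the original derivation) checking that the closed-form minimizer of the RSS obtained from the normal equations matches the coefficients returned by `lm()`:

```{r}
# minimal sketch with simulated data: the closed-form RSS minimizer
# solve(t(X) %*% X, t(X) %*% y) agrees with the coefficients from lm()
set.seed(1)
n <- 100
X <- cbind(1, rnorm(n), rnorm(n))          # design matrix with an intercept
y <- X %*% c(2, 1, -1) + rnorm(n)
beta_hat <- solve(t(X) %*% X, t(X) %*% y)  # solve the normal equations
cbind(closed_form = as.vector(beta_hat),
      lm_fit = coef(lm(y ~ X[, -1])))
```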
70 | 71 | 72 | For the current regression example, $\A = \X$, $\x = \betab$ and is assumed to be $p$-dimensional, and $\b = y$. Note that $||Ax - b||_2^2 = ||b - Ax||_2^2$ and that we do not put any explicit bounds on $\x$. With no surprise, we can derive the minimizer of this objective function in a manner very similar to our minimizer for the RSS. That is, we have an analytic solution to this problem $\hat{x} = (\tA\A)^{-1}\tA\b$. If we assume that $\A$ is full rank, then this implies that this solution (as in the regression case) is unique. 73 | 74 | Also, if $\A^T\A$ is positive definite, then this problem (as is usually the case when $\A$ is full rank), then the objective function is convex and is easily solvable by the methods discussed in the first lecture of this module. 75 | 76 | We may more commonly see a more general form of this problem, written as the following: 77 | 78 |
Minimize $\frac{1}{2}\tx\Qx - \B^T\x + c$ over $\x \in \mathbb{R}^p$
79 | 80 | With respect to the linear regression problem above, $\Q = \A^T\A$, $\B = \b^T\A$, and $\c = \b^T\b$. Oftentimes you may see $\c$ omitted as it is a constant unrelated to $\x$, and therefore has no role in terms of minimization with respect to $\x$. Here, it is assumed that $\Q$ is $p\times p$ and is symmetric. 81 | 82 | 83 | Generally speaking, when $\Q$ is convex (positive definite and the cholesky decomposition form is full rank) we can use the methods for lecture 1 to perform optimization in this setting. So why bother introduce this notation? We do this to set up the unconstrained setting in the next section. 84 | 85 | In general, we rarely see many unconstrained versions of Linear Programming problems as the objective function is linear, and therefore does not have a natural minimum without constraints as in the quadratic case. That is, optimization in the quadratic case when the objective function is convex may naturally have a unique solution in the absence of constraints, but with the linear objective function is is rarely the case without constraints for obvious reasons (no natural minimum). 86 | 87 | In general, when both the objective function and the constraints are both linear, we call this a Linear Programming problem. We will show examples of linear programming later in this lecture. When the objective function is quadratic, and the constaints are linear, we call this a Quadratic Programming problem. There are generalizations for quadratic constraints, but we will not cover them in this lecture. 88 | 89 | Making a note regarding convexity of the objective function in QP: is it required? 90 | 91 | # Constrained Quadratic Optimization (Quadratic Programming) 92 | 93 | Now lets move to the case where we may constraints on the quadratic optimization problem introduced earlier. We will start with the general for specifying the constrained quadratic optimization problem, and then we will given examples of such constraints and their connection to common problems in statistics. 94 | 95 | We can write the general form of a constrained Quadratic Programming problem as the following: 96 | 97 |
Minimize $\frac{1}{2}\tx\Qx - \B^T\x + c$ 98 | subject to $A_1\x = \d$ and 99 | $A_2\x \leq \e$ 100 |
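Once a problem is written in this form, we can hand it to an off-the-shelf solver. Below is a minimal sketch using the `quadprog` package (an assumption on our part; it is not otherwise used in these notes), with made-up numbers purely for illustration. Note that `solve.QP()` expects constraints written as $\geq$, so a constraint of the form $A_2\x \leq \e$ is passed in as $-A_2\x \geq -\e$.

```{r}
# sketch: minimize (1/2) x'Qx - B'x subject to x1 + x2 = 1 and x >= 0
# solve.QP() minimizes (1/2) b'Db - d'b subject to t(Amat) b >= bvec,
# with the first `meq` constraints treated as equalities
library(quadprog)
Q <- diag(2)                      # positive definite Q
B <- c(1, 2)
Amat <- cbind(c(1, 1),            # equality constraint: x1 + x2 = 1
              diag(2))            # inequality constraints: x1 >= 0, x2 >= 0
bvec <- c(1, 0, 0)
fit <- solve.QP(Dmat = Q, dvec = B, Amat = Amat, bvec = bvec, meq = 1)
fit$solution                      # constrained minimizer
```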
101 | 102 | Where Q, B, and c are defined similarly to the previous section, $A_1$ is an $l\times p$ matrix, $A_2$ is an $m\times p$ matrix, $\d$ is a $l\times 1$ vector, and $\e$ is a $m\times 1$ vector. The last two lines of the above represent the constraints on X. This is a general form, and therefore there are several special cases of this that may have simple ways of solving for the minimizer with respect to $\x$. 103 | 104 | ## Brief recap of langrange multipliers and the lagrangian 105 | 106 | In order to solve constrained quadratic programming problems, it is helpful to introduce the concept of langrange multipliers and the langrangian. In each of the special cases we will cover we will see how they provide one avenue to arriving at a solution, however multiple approaches for solving may exist. 107 | 108 | For a generic problem where we have the following problem 109 | 110 |
Maximize $f(x)$ 111 | subject to $g(x) = 0$ 112 |
113 | 114 | then we can introduce a new variable $\lambda$ and defined a function $$\mathcal{L}(x, \lambda) = f(x) - \lambda g(x)$$. Here, $\lambda$ is called a Lagrange Multiplier, and $\mathcal{L}(x, \lambda)$ is called the Lagrangian. Clearly, minimizing rather than maximizing is a trivial modification to this problem. Here we assume that $f(x)$ and $g(x)$ have similar partial derivatives. 115 | 116 | Clearly, we can see how this may relate to our Quadratic Programming problems, where we are attempting to minimize some function (say $f(x)$) subject to some constraint ($g(x) = 0$). This can be generalized to multiple constraints as well. 117 | 118 | So how does this approach help us solve the general QP problem? The Langrangian helps us convert our originally constrained optimization problem to an unconstrained one. 119 | 120 | In this current setup with a single constraint, we simply solve for $x$ and $\lambda$ from the following system of equations. 121 | 122 | $0 = \frac{d}{dx}\mathcal{L}(x, \lambda)$ and 123 | $0 = \frac{d}{d\lambda}\mathcal{L}(x, \lambda)$ 124 | 125 | Later we will see cases where there may not be able closed form solution for $\lambda$ and the other parameters, and that one may need to fix $\lambda$ into order to arrive at a unique solution for the rest (objective function is only convex with $\lambda$ fixed). This obviously presents a clear problem as to what is the best $\lambda$ to choose but for now we can assume that we can solve for $\lambda$ and $x$ directly. 126 | 127 | ### Example 1: Simple function 128 | 129 | ### Example 2: Likelihood function 130 | 131 | ## Application to solving a Quadratic Programming problem 132 | 133 | Now lets apply this to solving some common examples of Quadratic Programming problems 134 | 135 | ## Quadratic programming with equality constraint 136 | 137 | In this case we specify an equalty constraint, and we drop c in the objective function has it has no impact on the minimization. We can write the setup as the following: 138 |
Minimize $\tx\Qx - 2\B^T\x$ over $\x \in \mathbb{R}^p$ 139 | subject to $A_1\x = \d$ 140 |
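The derivation that follows shows that, with an equality constraint, the solution is characterized by a linear system in $\x$ and $\lambda$. As a preview, here is a minimal numerical sketch (all values made up for illustration) that solves that system directly:

```{r}
# sketch: minimize x'Qx - 2B'x subject to A1 x = d
# stationarity (2Qx - 2B - t(A1) lambda = 0) and feasibility (A1 x = d)
# form one linear system that solve() can handle directly
Q  <- matrix(c(2, 0, 0, 2), 2, 2)   # positive definite Q
B  <- c(1, 1)
A1 <- matrix(c(1, 1), nrow = 1)     # single equality constraint: x1 + x2 = d
d  <- 1
KKT <- rbind(cbind(2 * Q, -t(A1)),
             cbind(A1, 0))
sol <- solve(KKT, c(2 * B, d))
sol[1:2]                            # constrained minimizer
```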
141 | 142 | Using the results from the previous section, we can express the Lagrangian as the following: $$\mathcal{L}(\x, \lambda) = \tx\Qx - 2\B^T\x - \lambda^T(A_1\x-\d).$$ Notice that in the previous section, the Lagrangian was defined only for constraints in the form of $g(x) = 0$. We can put our current constraint in that form by defining $g(\x) = A_1\x-\d$, which equals zero exactly when the equality constraint is satisfied. 143 | 144 | 145 | 146 | Taking derivatives with respect to $\x$ and $\lambda$, we arrive at the following: 147 | 148 | $0 = \frac{d}{dx}\mathcal{L}(\x, \lambda) = 2\Qx - 2\B - A_1^T\lambda$ 149 | $0 = \frac{d}{d\lambda}\mathcal{L}(\x, \lambda) = -(A_1\x - \d)$ 150 | 151 | We can rewrite this as 152 | 153 | $\B = \Qx - \frac{1}{2}A_1^T\lambda$ 154 | $\d = A_1\x$ 155 | 156 | Under certain conditions (for example, $\Q$ positive definite and $A_1$ of full row rank), we will have a unique solution to this problem. 157 | 158 | ### Example 159 | 160 | ## Quadratic programming with inequality constraint 161 | 162 | We can write the setup as the following: 163 |
Minimize $\tx\Qx - 2\B^T\x$ over $\x \in \mathbb{R}^p$ 164 | subject to $A_1\x \leq \d$ 165 |
166 | 167 | Using the results from the previous section, we can express the Lagrangian as the following: $$\mathcal{L}(\x, \lambda) = \frac{1}{2}\tx\Qx - 2\B^T\x- \lambda(A_1\x-\d).$$ Notice that in the previous section, the langrangian was defined only for constraints in the form of $g(x) = 0$. 168 | 169 | # Primal and Dual 170 | 171 | We can also represent an alternative form of the original maximization problem in the section prior. We can denote the original optimization problem as the "primal" problem in terms of the langrangian defined. We can also define what is called the "dual" function which can be defined as $$inf_x \mathcal{L}(\x, \lambda)$$, where this value can be dermined from solving for $x$ in $0 = \frac{d}{dx}\mathcal{L}(\x, \lambda)$. After plugging this value back into the $ \mathcal{L}(\x, \lambda)$, the resulting fuction is now considered as the dual to the original problem. Maximizing this function with respect to lambda is equivalent to minimizing with respect to the original problem 172 | 173 | ## Examples 174 | 175 | # Application of Unconstrained Optimization: Ridge Regression regualrization 176 | In the linear model context, ridge regression has several applications. It can 1) allow to get a set of estimates from a linear regression model where $\X$ is less than full rank, and 2) shrink coefficients. In overdetermined models, this can be helpful in being able to obtain a set of coefficient estimates. 177 | 178 | We can write the primarl of the minimization problem as the following: 179 | 180 |
Minimize $||Ax - b||_2^2$ 181 | over $\x \in \mathbb{R}^p$ 182 | subject to $||x||_2^2 \leq t$ 183 |
184 | 185 | We can equivalently write this problem in its penalized (Lagrangian) form as the following: 186 | 187 |
Minimize $||Ax - b||_2^2 + \lambda||x||_2^2$ over $\x \in \mathbb{R}^p$
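A minimal sketch (simulated data, not from the notes) of computing this penalized solution for a fixed $\lambda$, using the closed form discussed below, and comparing it against the ordinary least squares fit:

```{r}
# sketch with simulated data: ridge solution for a fixed lambda via the
# closed form solve(t(A) %*% A + lambda * I, t(A) %*% b), versus ordinary LS
set.seed(1)
n <- 50; p <- 5
A <- matrix(rnorm(n * p), n, p)
b <- A %*% rep(1, p) + rnorm(n)
lambda <- 10
ridge <- solve(t(A) %*% A + lambda * diag(p), t(A) %*% b)
ols   <- solve(t(A) %*% A, t(A) %*% b)
cbind(ols = as.vector(ols), ridge = as.vector(ridge))  # ridge estimates are shrunk toward zero
```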
188 | 189 | Here \lambda, where $\lambda \geq 0$ s thought as the penalty or regularization parameter, where $\lambda = 0$ pertains to the regular least squares fit, and larger values of $\lambda$ results in more regularization. For a fixed value of $\lambda$, we can show that there is a unique minimizer with respect to $x$, here $\hat{x} = (A^TA + \lambda I)^{-1}A^Tb$, where I is a $p\times p$ identity matrix. We can easily see that if $A$ is less than full rank, then $A^TA$ is less than full rank and therefore we cannot compute the inverse $(A^TA)^{-1}$ to obtain the standard least squares estimate. In such cases where $A$ is LTFR, adding a diagonal matrix of fixed constants prior to taking the inverse will transform the matrix to be inverted into a full rank problem an therefore a unique solution exists. Therefore, we can show that there is a unique minimizer for a given value of $\lambda$, however it is unknown apriori how to select the optimal value of lambda, which essentially controls the balance between model fit and model complexity. This LTFR situation may occur in extreme cases of multicollinearity and also when $p > n$ such as in high dimensional regression. 190 | 191 | We can see that this an example of unconstrained regression, where conditional on $\lambda$ the objective function is still convex and therefore has a unique analytical solution. One common way to choose $\lambda$ is via cross validation, which will be covered in detail in module 3. In essence, we choose the $\lambda$ that has the smallest cross validation error. Other criteria includes computing traditional model selection criteria on the fitted model, and selecting the lambda with the best corresponding BIC. 192 | 193 | # Applications to variable selection: LASSO Regression 194 | One thing that is clear from ridge regression is that while it can shrink coefficients in the model, it cannot perform variable selection in the sense that it may remove variables from the model that are unhelpful (penalize coefficients to 0). LASSO regression was introduced as a means to achieve this, where we can write the contstrained objective function in the linear model case such that 195 | 196 |
Minimize $||Ax - b||_2^2$ over $\x \in \mathbb{R}^p$ 197 | subject to $||x||_1 \leq t$ 198 |
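In practice this problem is fit in its penalized (Lagrangian) form described below, over a grid of $\lambda$ values. A minimal sketch on simulated data using the `glmnet` package (the same package used in the sparse-matrix lecture notes), where `alpha = 1` corresponds to the L1 (lasso) penalty:

```{r}
# sketch with simulated data: lasso path over a grid of lambda values
library(glmnet)
set.seed(1)
n <- 100; p <- 20
A <- matrix(rnorm(n * p), n, p)
b <- A[, 1:3] %*% c(2, -2, 1) + rnorm(n)   # only the first 3 columns matter
fit <- glmnet(A, b, alpha = 1)             # lasso path
cvfit <- cv.glmnet(A, b, alpha = 1)        # lambda chosen by cross validation
coef(cvfit, s = "lambda.min")[1:6, ]       # first few coefficients at that lambda
```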
199 | 200 | When we attempt to solve this problem, we find that there is no closed form solution here. If we can rewrite the problem in a way that drops the absolute value in the constraint, then we reduce this problem to a standard QP problem. Taking this route, we can write the langrangian of the rewritten problem as $$\mathcal{L}(\x, \lambda) = ||Ax - b||_2^2 - \lambda \sum_{i=1}^{p}|x_i|$$ 201 | 202 | For a fixed $\lambda$, this problem is now a convex optimization problem. Again, since there is no closed form solution here, we evaluate a range of $\lambda$ values and evaluate some sort of model selection criterian on each fit, choosing $\lambda$ that minimizes this criterion (cross validation error is one such criterion). 203 | 204 | # Alternative Approaches to Variable selection via Penalized Likelihood 205 | We will see that the above approachs in application to model selection problems in regression have overlap with the vast literature on variable selection methods via penalized likelihood. The statistcal formulation of this problem was first introduced with the development of the LASSO by Hastie and Tibsirani, and some of the statistcal properties of general classes of penalities was introduced by Fan and Li in 2001. 206 | 207 | 208 | # Fitting Penalized Likelihood Problems 209 | 210 | ## Penalty Functions and impact of choice 211 | 212 | ### Properties and implications on penalty 213 | 214 | ## Taylor Series Expansion 215 | 216 | ## Coordinate Descent 217 | 218 | ### Linear Models 219 | 220 | ### Generalized Linear Models 221 | 222 | ## Speeding upfitting 223 | 224 | ### Initialization 225 | 226 | ### Warm Starts 227 | 228 | ### Active Set 229 | 230 | ### Prescreening (Sure Independence Screening for ultra high dimension) 231 | 232 | 233 | 234 | 235 | 236 | -------------------------------------------------------------------------------- /em/em_HW6.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 6 - EM" 3 | author: "Naim Rashid" 4 | date: "2/13/2019" 5 | output: html_document 6 | header_includes: 7 | - \usepackage{amsmath} 8 | - \usepackage{amssymb} 9 | - \usepackage{amsthm} 10 | include-before: 11 | - '\newcommand{\bfm}[1]{\ensuremath{\mathbf{#1}}}' 12 | - '\newcommand{\bdm}[1]{\ensuremath{\boldsymbol{#1}}}' 13 | - '$\def \d \bfm{d}$' 14 | - '$\def \e \bfm{e}$' 15 | - '$\def \g \bfm{g}$' 16 | - '$\def \I \bfm{I}$' 17 | - '$\def \l \bfm{l}$' 18 | - '$\def \M \bfm{M}$' 19 | - '$\def \W \bfm{W}$' 20 | - '$\def \y \bfm{y}$' 21 | - '$\def \Y \bfm{Y}$' 22 | - '$\def \x \bfm{x}$' 23 | - '$\def \X \bfm{X}$' 24 | - '$\def \z \bfm{z}$' 25 | - '$\def \thetab \boldsymbol{\theta}$' 26 | - '$\def \betab \boldsymbol{\beta}$' 27 | - '$\def \pib \boldsymbol{\pi}$' 28 | --- 29 | 30 | ```{r setup, include=FALSE} 31 | knitr::opts_chunk$set(echo = TRUE) 32 | ``` 33 | 34 | 35 | 36 | # Question 1: Not So Simple Univariate Optimization 37 | 38 | Let is revisit the problem from the last HW, now using BFGS to fit the model. Report the results of the various starting values as last time, and comment on the convergence for each of the starting values relative the last HW that uses NR. What properties about BFGS relative to NR could explain the different behavior in this setting? 
39 | 40 | $$f(x) = 1.95 - e^{-2/x} - 2e^{-x^4}.$$ 41 | 42 | ```{r} 43 | # f(x) 44 | f = function(x){ 45 | ## solution 46 | 47 | ## end solution 48 | } 49 | 50 | # first derivative 51 | f1 = function(x){ 52 | ## solution 53 | 54 | ## end solution 55 | } 56 | 57 | 58 | # to start the model, can use maxit/tolerance defaults from optimx 59 | x = 1.2 # also try 0.5 and 0.99 60 | 61 | 62 | 63 | ## solution 64 | 65 | 66 | ## end solution 67 | ``` 68 | 69 | 70 | ## EM: Zero-inflated Poisson 71 | 72 | Revisiting problem 3 from HW5, let us implement an EM-based maximization approach to estimate the model parameters. 73 | 74 | Please define the CDLL, E-step, and M-step below as we did in class. 75 | 76 | Then, fill in the relevant portions of the code below. 77 | 78 | Hint for writing the CDLL: Let $z_i = 1$ represent the true (known) membership to the non-fishing population, and $z_i = 0$ to represent membership to the fishing population. Start with defining the complete data likelihood based on the non-aggregated likelihood below, then take the log to get the final CDLL form. This will help derive the forms for the E and M-steps. For the actual fitting, we give some direction in the code below in terms of how to use the table aggregated data by a weighting approach. 79 | 80 | ### Expression for Log Likelihood: from the previous HW 81 | 82 | Lets rewrite the likelihood for the aggregated form of the data in terms of what it would look like when using the $n$ raw, non-aggregated responses: 83 | 84 | $$ 85 | L(\boldsymbol{\theta}) = \prod_{i=1}^n (\pi + (1-\pi)e^{-\lambda})^{I[y_i=0]}\left((1-\pi)\frac{e^{-\lambda}\lambda^{y_i}}{y_i!}\right)^{I[y_i>0]} 86 | $$ 87 | 88 | This is a simplified form of the PMF that was given at the beginning of the EM lecture. This corresponds to the following log-likelihood 89 | 90 | $$\mathcal{l}(\boldsymbol{\theta}) = \sum_{i=1}^n I[y_i=0]\log(\pi + (1-\pi)e^{-\lambda}) + I[y_i>0]\left(\log(1-\pi) -\lambda + {y_i}\log(\lambda) + \log{y_i!}\right)$$ 91 | 92 | Therefore, if $y > 0$, we know automatically that that individual is from the fishing population. 93 | 94 | 95 | ### Expression for Complete Data Log Likelihood: Solution 96 | 97 | Start with the CDL 98 | 99 | 100 | Now take the log 101 | 102 | 103 | ### Expression for E-step: Solution 104 | 105 | 106 | 107 | ### Expression for M-step: Solution 108 | 109 | 110 | 111 | ### Code implementation 112 | 113 | ```{r} 114 | 115 | # data 116 | y = 0:6 117 | ny = c(3062, 587, 284, 103, 33, 4, 2) 118 | 119 | ## HINT: to adjust using relative freq of counts in model/calculations when using aggregated data 120 | y_weight = ny/sum(ny) 121 | ## For example 122 | print(sum(ny*y)/sum(ny)) # mean of y based on aggregated data in table 123 | ## We get the same thing when fitting and intercept only poisson reg model, adjusting for relative freq of counts... 
124 | print(exp(glm(y ~ 1, weight = y_weight)$coef)) 125 | 126 | # to start the model 127 | tol = 10^-8 128 | maxit = 50 129 | iter = 0 130 | eps = Inf 131 | ll = -10000 132 | 133 | ## create posterior probability matrix 134 | pp = matrix(0,length(y), 2) 135 | colnames(pp) = c("non-fisher", "fisher") 136 | 137 | ## initialize partion, everything with count 0 is non-fisher, otherwise fisher 138 | pp[which(y ==0),1] = 1 139 | pp[,2] = 1 - pp[,1] 140 | 141 | ## now start the EM algorithm 142 | while(eps > tol & iter < maxit){ 143 | 144 | ## save old ll 145 | ll0 = ll 146 | 147 | ## start M-step 148 | # pi, 1 x 2 vector 149 | pi = 150 | 151 | # lambda, scalar 152 | lambda = 153 | 154 | ## start E-step 155 | # update pp 156 | 157 | ## calculate LL 158 | ll = 159 | 160 | ## calculate relative change in log likelihood 161 | eps = abs(ll-ll0)/abs(ll0) 162 | 163 | ## update iterator 164 | iter = iter + 1 165 | if(iter == maxit) warning("Iteration limit reached without convergence") 166 | 167 | ## print out info to keep track 168 | cat(sprintf("Iter: %d logL: %.2f pi1: %.3f eps:%f\n",iter, ll,pi[1],eps)) 169 | } 170 | 171 | ``` 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /large/datatable.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Working with data.table" 3 | author: "Michael Love" 4 | date: 11/5/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Introduction 12 | 13 | In this first lecture note on dealing with large datasets in R, we 14 | will introduce a specialized package for manipulating large tables of 15 | data, called *data.table*. For some background material, see the 16 | following links. The first link has useful speed comparisons with 17 | other packages as well as a table of basic operations, some of which 18 | we will cover in this lecture note, but others which go beyond what we 19 | cover here. 20 | 21 | * [data.table website](https://github.com/Rdatatable/data.table/wiki) 22 | * [data.table CRAN page](https://cran.r-project.org/web/packages/data.table/) 23 | 24 | To motivate why we start with *data.table*, I refer to the following 25 | sections of the book, 26 | [Efficient R Programming](https://csgillespie.github.io/efficientR), 27 | by Colin Gillespie and Robin Lovelace which was mentioned earlier in 28 | the course: 29 | 30 | * [Efficient input/output: plain text formats](https://csgillespie.github.io/efficientR/input-output.html#fread) 31 | * [Efficient data carpentry: Data processing with data.table](https://csgillespie.github.io/efficientR/data-carpentry.html#data-processing-with-data.table) 32 | 33 | We will also discuss data input and data processing below. 34 | 35 | # Reading large data from plain text 36 | 37 | One of the initial hurdles for working with large datasets is simply 38 | reading in the data. *data.table* has a fast implementation `fread` 39 | for *fast read*. Start by reading the help for `fread`: 40 | 41 | ```{r, eval=FALSE} 42 | ?fread 43 | ``` 44 | 45 | Note that it has a number of arguments, some of which are not the same 46 | as base R's `read.table`, e.g. `stringsAsFactors=FALSE` and 47 | `sep="auto"`. A natural comparison is `fread` vs the `read_csv` and 48 | `read_tsv` functions in the *readr* package. In the first 49 | *Efficient R Programming* link above, these 50 | are compared and they state that for large files, e.g. 
> 100Mb, the 51 | `fread` and `read_csv` functions are about equal, and 5x faster than 52 | base R's `read.csv`. 53 | 54 | Let's compare *fread* and *read.csv* on a large file we will work with 55 | throughout the course, the *College Scorecard* dataset. 56 | 57 | # Reading in College Scorecard dataset 58 | 59 | Briefly, the 60 | [College Scorecard](https://collegescorecard.ed.gov/data/) dataset is 61 | compiled by the Department of Education and has the following 62 | descriptive paragraph from their website: 63 | 64 | > The College Scorecard project is designed to increase transparency, putting the power in the hands of 65 | > students and families to compare how well individual postsecondary institutions are preparing their 66 | > students to be successful. This project provides data to help students and families compare college costs 67 | > and outcomes as they weigh the tradeoffs of different colleges, accounting for their own needs and 68 | > educational goals. 69 | > These data are provided through federal reporting from institutions, data on federal financial aid, and 70 | > tax information. These data provide insights into the performance of institutions that receive federal 71 | > financial aid dollars, and the outcomes of the students of those institutions 72 | 73 | We will discuss this dataset in more detail later, when we begin to 74 | model and find associations in the data, but for now just consider it 75 | as a large dataset (4.5 Gb uncompressed) in a series of 76 | comma-separated value (CSV) files. 77 | 78 | We have downloaded the dataset from the website, and combined a number 79 | of the CSV files into a single file `Scorecard_2009-2016.csv`. This 80 | file is constructed via the following shell commands: 81 | 82 | ``` 83 | for i in `ls MERGED201*`; do echo $i; tail -n +2 $i > ${i/.csv}_nohead.csv; done 84 | cp MERGED2009_10_PP.csv MERGED2009_head.csv 85 | cat MERGED*head.csv > Scorecard_2009-2016.csv 86 | ``` 87 | 88 | The merged file has 60,307 rows (including a column header) and 1899 89 | columns. To demonstrate the speed of `fread`, we will try just reading 90 | a subset of the full dataset. We can see that just on the first 91 | 10,000 rows, `fread` is more than 3x faster than *read.csv*: 92 | 93 | ```{r cache=TRUE} 94 | library(data.table) 95 | n <- 10000 96 | file <- "CollegeScorecard_Raw_Data/Scorecard_2009-2016.csv" 97 | system.time({ 98 | scores <- fread(file, nrows=n) 99 | }) 100 | system.time({ 101 | scores2 <- read.csv(file, nrows=n) 102 | }) 103 | ``` 104 | 105 | The output is a bit different as well: 106 | 107 | ```{r} 108 | class(scores) 109 | class(scores2) 110 | ``` 111 | 112 | # Data manipulation with data.table 113 | 114 | Some syntax is shared with *data.frame* but there are also additional 115 | operations that are specially designed for speed and for reduced 116 | keystrokes. Here, `NUMBRANCH` is a column of the *data.table* and so 117 | we can pull out certain rows by invoking the column name without 118 | having to write `scores$NUMBRANCH`. 
This gives the scores which have 119 | more than 25 branches: 120 | 121 | ```{r} 122 | z <- scores[NUMBRANCH > 25] 123 | nrow(z) 124 | ``` 125 | 126 | A preview to later, we could have also gotten this number with some 127 | special *data.table* code, where `.N` gives us the number of rows: 128 | 129 | ```{r} 130 | scores[NUMBRANCH > 25, .N] 131 | ``` 132 | 133 | We can also pull out rows by matching on a string: 134 | 135 | ```{r} 136 | scores[INSTNM == "University of North Carolina at Chapel Hill",1:20] 137 | ``` 138 | 139 | We can also specify a column to be a *key* for the *data.table*. 140 | Specifying a key allows very fast subsetting based on the column you 141 | specify. Here, because the key is an integer, we wrap up the key in 142 | `.()`, otherwise it would interpret our request as a row number: 143 | 144 | ```{r} 145 | setkey(scores, UNITID) 146 | scores[.(199120),1:20] 147 | ``` 148 | 149 | It happens that the *Id* column of this dataset is an integer, but we 150 | could also have made a string into the key: 151 | 152 | ```{r} 153 | setkey(scores, CITY) 154 | scores["Chapel Hill",1:20] 155 | ``` 156 | 157 | As you can see the key does not have to be unique (unlike row names in 158 | R which must be unique). Subsetting with a key column using 159 | *data.table* is much faster than subsetting via other methods. The 160 | [Data processing with data.table](https://csgillespie.github.io/efficientR/data-carpentry.html#data-processing-with-data.table) 161 | chapter of the *Efficient R Programming* book shows that subsetting a 162 | *data.table* by key is more than 60x faster than base R *data.frame* 163 | and more than 40x faster than using *dplyr*. 164 | The *data.table* 165 | [website](https://github.com/Rdatatable/data.table/wiki) 166 | also has updated speed comparisons of 167 | *data.table* to *pandas* and *dplyr*, and included *Spark* and 168 | *pydatatable*, so you can get a sense of how different operations may 169 | differ across these packages. But the main takeaway should be that 170 | *data.table* is fast and if you have large datasets, you shouldn't be 171 | using *data.frame* and base R functions for subsetting or grouping and 172 | summarization. 173 | 174 | # Functions inside the brackets 175 | 176 | We can put functions inside of the square brackets. 177 | 178 | We first convert `TUITFTE` to numeric, which gives a warning about NAs 179 | introduced in the coercion step: 180 | 181 | ```{r} 182 | scores$TUITFTE <- as.numeric(scores$TUITFTE) 183 | ``` 184 | 185 | Here a trivial example, to calculate the mean of the tuition per FTE 186 | (full-time equivalent) student. 187 | 188 | ```{r} 189 | scores[,mean(TUITFTE,na.rm=TRUE)] 190 | ``` 191 | 192 | To make this a little easier to read, let's define our own functions: 193 | 194 | ```{r} 195 | mean2 <- function(x) mean(x, na.rm=TRUE) 196 | q25 <- function(x) quantile(x, .25, na.rm=TRUE) 197 | q50 <- function(x) quantile(x, .50, na.rm=TRUE) 198 | q75 <- function(x) quantile(x, .75, na.rm=TRUE) 199 | ``` 200 | 201 | Now again. This example is trivial as we could just as well computed 202 | the function after having extracted the column: 203 | 204 | ```{r} 205 | scores[,mean2(TUITFTE)] 206 | mean2(scores$TUITFTE) 207 | ``` 208 | 209 | # Grouping operations 210 | 211 | The power of putting the function inside the square brackets is that 212 | it can be combined easily with subsetting and grouping operations. 
For 213 | example: 214 | 215 | ```{r} 216 | scores[CONTROL==1,mean2(TUITFTE)] 217 | ``` 218 | 219 | Or with a grouping operation: 220 | 221 | ```{r} 222 | scores[,mean2(TUITFTE),by=CONTROL] 223 | ``` 224 | 225 | We can also compute multiple functions of various columns, e.g. mean 226 | and standard deviation at the same time. We use the `.()` operator 227 | which is synonymous with `list()`. 228 | 229 | ```{r} 230 | scores[,.(median=q50(TUITFTE),q25=q25(TUITFTE),q75=q75(TUITFTE)),by=CONTROL] 231 | ``` 232 | 233 | ```{r} 234 | library(ggplot2) 235 | dat <- scores[,.(median=q50(TUITFTE),q25=q25(TUITFTE),q75=q75(TUITFTE)),by=CONTROL] 236 | ggplot(dat, aes(CONTROL, median, ymin=q25, ymax=q75)) + geom_pointrange() + 237 | xlab("category") + ylab("TUITION / FTE") 238 | ``` 239 | 240 | Again, there are other complex functionality that can be performed 241 | with *data.table*, which can be looked over at 242 | the 243 | [data.table website](https://github.com/Rdatatable/data.table/wiki), 244 | but the operations above cover some of the most common use cases for 245 | reading, subsetting and computing on a large tabular dataset. 246 | -------------------------------------------------------------------------------- /large/hdf5.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Working with rhdf5" 3 | author: "Michael Love and Naim Rashid" 4 | date: 11/7/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Introduction 12 | 13 | In previous lectures, we have discussed reading in large data tables, 14 | and working with large databases via *SQLite*. Here, we discuss a 15 | middle way, using the popular *HDF5* format. The motivation for using 16 | an *HDF5* data container is that, like *SQLite* we have a common 17 | format for representing a complex set of tables that can be shared 18 | simply be sharing a file, but unlike *SQLite* we are typically 19 | interested in reading in entire tables into memory, so that we can 20 | then analyze them. *HDF5* is typically smaller on disk, as well as 21 | faster for writing or reading to or from disk, compared to *SQLite*. 22 | 23 | First some information from the *HDF5* group, on 24 | [Why HDF5?](https://support.hdfgroup.org/HDF5/faq/whyhdf5.html) 25 | 26 | > An HDF5 data container is a standardized, highly-customizable data 27 | > receptacle designed for portability. Unless your definition of 28 | > 'container' is extremely broad, file systems are not commonly 29 | > considered containers. 30 | > 31 | > File systems aren't portable: For example, you might be able to 32 | > mount an NTFS file system on an AIX machine, but the integers or 33 | > floating point numbers written on an Intel processor will turn out 34 | > to be garbage when read on a IBM Power processor. 35 | > 36 | > HDF5 achieves portability by separating its "cargo" (data) from its 37 | > environment (file system, processor architecture, etc.) and by 38 | > encoding it in a self-describing file format. The HDF5 library 39 | > serves the dual purpose of being a parser/encoder of this format and 40 | > an API for user-level objects (datasets, groups, attributes, etc.). 41 | > 42 | > ... 43 | > 44 | > The data stored in HDF5 datasets is shaped and it is typed. Datasets 45 | > have (logically) the shape of multi-dimensional rectilinear 46 | > arrays. All elements in a given dataset are of the same type, and 47 | > HDF5 has one of the most extensive type systems and one that is 48 | > user-extendable. 
49 | 50 | # The rhdf5 package 51 | 52 | As we are focusing on how to interface with various large data formats 53 | in R, we now introduce the *rhdf5* package. Unlike some of the other 54 | packages we have shown, this package is maintained on the Bioconductor 55 | repository and so has a special installation. 56 | 57 | ```{r eval=FALSE} 58 | install.packages("BiocManager") # can be skipped after 1st time 59 | BiocManager::install("rhdf5") 60 | ``` 61 | 62 | Now we can load the package. Much of the following introduction to 63 | *rhdf5* is modified from the package vignette. 64 | 65 | ```{r} 66 | library(rhdf5) 67 | ``` 68 | 69 | Typically, we may already have an *HDF5* data container that we want 70 | to work with, but as in the *SQLite* lecture note, we will show how to 71 | create a new one first. 72 | 73 | ```{r} 74 | h5file <- "myDB.h5" 75 | h5createFile(h5file) 76 | ``` 77 | 78 | # Groups are like directories 79 | 80 | *HDF5* containers have a hierarchy built around *groups* which act and 81 | look a bit like directories: 82 | 83 | ```{r} 84 | h5createGroup(h5file, "A") 85 | h5createGroup(h5file, "B") 86 | h5createGroup(h5file, "A/C") 87 | ``` 88 | 89 | We can list the groups: 90 | 91 | ```{r} 92 | h5ls(h5file) 93 | ``` 94 | 95 | Finally, we show some examples of writing data to the *HDF5* 96 | container, with `h5write`. Row and column names of matrices or arrays 97 | in general will not be stored, however the column names of *compound* 98 | data types (such as *data.frame*) will be stored: 99 | 100 | ```{r} 101 | x <- matrix(rnorm(1e4),nrow=100) 102 | h5write(x, h5file, "A/x") 103 | y <- matrix(letters, nrow=13) 104 | h5write(y, h5file,"A/C/y") 105 | df <- data.frame(a=1L:5L, 106 | b=seq(0,1,length.out=5), 107 | c=letters[1:5], 108 | stringsAsFactors=FALSE) 109 | h5write(df, h5file, "B/df") 110 | h5ls(h5file) 111 | ``` 112 | 113 | # Reading objects 114 | 115 | We can read out these objects using `h5read`. Note that the column 116 | names of the *data.frame* have been preserved: 117 | 118 | ```{r} 119 | xx <- h5read(h5file, "A/x") 120 | xx[1:3,1:3] 121 | yy <- h5read(h5file, "A/C/y") 122 | head(yy) 123 | df2 <- h5read(h5file, "B/df") 124 | head(df2) 125 | ``` 126 | 127 | ```{r echo=FALSE} 128 | # this hidden chunk to make the example work from the top... 129 | system("rm myDB.h5") 130 | ``` 131 | 132 | # Integration with Rcpp 133 | 134 | During package development, you may find that it would be easier to directly read from or write to an HDF5 file directly from your C++ code. RcppArmadillo allows for this functionality, detailed in their [documentation](http://arma.sourceforge.net/docs.html). If you search for hdf5 at this link you will find a few options for loading and saving objects in this format. 135 | 136 | One caveat listed in their documentation is the following: 137 | 138 | > Caveat: for saving/loading HDF5 files, support for HDF5 must be enabled within Armadillo's configuration; the hdf5.h header file must be available on your system and you will need to link with the HDF5 library (eg. -lhdf5) 139 | 140 | This can be achieved by adding a Makevars or Makevars.win file to your package's src/ directory indicating this. General information on Makevars files can be found [here](https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Using-Makevars). A specific walkthough on how to do it in this specific instance using HDF5 is given [here](https://www.bioconductor.org/packages/devel/bioc/vignettes/Rhdf5lib/inst/doc/Rhdf5lib.html). 
An example of using the hdf5 library in practice can be found [here](https://github.com/plbaldoni/epigraHMM/blob/main/src/expStep.cpp). This example uses the "H5Cpp.h" header instead of the "hdf5.h", both of which are referenced in the Rhdf5lib link earlier. 141 | 142 | # DelayedArray 143 | 144 | The [DelayedArray Bioconductor package](https://bioconductor.org/packages/3.12/bioc/html/DelayedArray.html) offers a an R-friendly way to work with datasets too large to load into memory, and can also leverage some of the advantages of the HDF5 format via the HDF5Array package. Additional packages such as [DelayedMatrixStats](https://bioconductor.org/packages/3.12/bioc/html/DelayedMatrixStats.html) can be used to perform operations on DelayedMatrix objects from the DelayedArray package. 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /large/large_HW4.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 4 - Working with large datasets" 3 | author: "your name here" 4 | date: "`r format(Sys.time(), '%m/%d/%Y')`" 5 | output: html_document 6 | --- 7 | 8 | # Question 1 - benchmark data.table's grouping on real data 9 | 10 | Download the merged College Scorecard data from (2009-2016) from here: 11 | 12 | This file is the final merged form of the original data that was discussed in class, using the shell operation. Please use this file for the subsequent questions. Excluding the header, there should be 67,418 rows in this file. 13 | 14 | In class we performed some simple subsetting and grouping 15 | operations. The lecture notes use a previous version of the dataset, 16 | and since they were compiled, `CONTROL` is now integer valued, and 17 | the `TUITFTE` and `SAT_AVG` columns will need to be coerced to a 18 | numeric using `as.numeric`, before you can work with it. (This will 19 | give a warning about NAs introduced, which you should ignore.) 20 | 21 | Also you should convert `CONTROL` to a factor, and then change the 22 | levels 1,2,3 to instead `pub`,`pnp`,`pfp`. 23 | 24 | From the data dictionary, we have: 25 | 26 | | C | Value | 27 | |---|--------------------| 28 | | 1 | Public | 29 | | 2 | Private nonprofit | 30 | | 3 | Private for-profit | 31 | 32 | First, tabulate the number of schools you have in the table for each 33 | value of `CONTROL` (you can use data.table or base R for this). Also 34 | tabulate, the number of schools for each value of `CONTROL` that have 35 | non-NA values for both `TUITFTE` *and* `SAT_AVG`. 36 | 37 | Then, compute the mean and SD tuition per FTE and the mean and SD average 38 | SAT for each of the classes of ownership (pub, pnp, pfp), (1) using 39 | data.table, and (2) using `aggregate` with the columns `TUITFTE`, 40 | `SAT_AVG`, `CONTROL` and your NA-removed mean and sd function. Confirm 41 | by eye that they give the same result and compare speed. You can 42 | benchmark with `times=10`. 43 | 44 | A typical use of aggregate is: 45 | 46 | ``` 47 | aggregate(df[,c("col1","col2")], df[,"grouping"], function(x) ...) 48 | ``` 49 | 50 | # Question 2- doing more with "by" in data.table 51 | 52 | Make a subset of the data, called `scores.sub`, which has complete 53 | data for both `TUITFTE` and `SAT_AVG`. You can look up the `na.omit` 54 | function in data.table. 55 | 56 | Make a plot of `SAT_AVG` over `TUITFTE`, and color the points by 57 | `CONTROL`, with x-limits of [0-40,000] and y-limits of [500-1600]. 
58 | 59 | Now tabulate the number of schools that have tuition per FTE over 60 | 20,000 and/or average SAT over 1200, grouped by ownership 61 | category. Your output should be sorted on the groupings you define, so 62 | the first row should be public, TUITFTE < 20,000 and SAT_AVG < 1200, 63 | and so on for 12 rows. See the Introduction vignette for data.table 64 | for insight on how to perform this operation. Hint: "sorted by" and 65 | "expressions in by". 66 | 67 | # Question 3 - subsets of data 68 | 69 | Use data.table to obtain the tuition per FTE and average SAT for the 70 | two schools with the top average SAT within each ownership 71 | group. Hint: I performed this in two steps, first by ordering 72 | `scores.sub`, and then using "subset of data". Make sure to avoid 73 | returning all of the columns... 74 | 75 | # Question 4 - MovieLens sparse dataset 76 | 77 | As we mentioned in class, one common form of sparse data is when we 78 | have information about individuals and their interaction with a large 79 | set of items (e.g. movies, products, etc.). The interactions may be 80 | ratings or purchases. One publicly available dataset of movie ratings 81 | is *MovieLens*, which has a 1 MB download available here: 82 | 83 | 84 | 85 | Download the `ml-latest-small.zip` dataset. Take a look at each of the 86 | CSV files. How many of the movies have the "Comedy" genre attached to 87 | them? 88 | 89 | Build a sparse matrix of the movies by users, and just put a 1 for if 90 | the user rated the movie (don't actually record the value of the 91 | rating itself). You can do this by specifying `x=1`. In 92 | the abstract, this is a very large matrix, but this is because the 93 | user IDs go up to nearly 200,000. Remove the rows of the sparse matrix 94 | where there are no ratings to produce a sparse matrix that is roughly 95 | ~10,000 by ~600. Use `summary` to investigate the range, quartiles, 96 | etc. of number of movies rated by each user. 97 | 98 | There are multiple ways to compute the SVD of a sparse matrix. If 99 | after manipulating the matrix in its sparse form, it is not too large 100 | (as in this case), one can just run `svd` which will coerce the matrix 101 | into a dense one. Or there are special functions in packages which are 102 | designed to compute (potentially sparse) SVD solutions on sparse 103 | matrices. Two such functions are `sparsesvd::sparsesvd` and 104 | `irlba::ssvd`. You can choose any of these three methods, in either 105 | case you should specify to return only 3 left singular vectors 106 | (`nu=3`, `rank=3`, or `k=3`, respectively). For `ssvd` in the irlba 107 | package, you should specify that the number of nonzero components 108 | in the right singular vectors should be all (the number of rows of x), 109 | which will give a warning that you should ignore. All of these methods 110 | will produce roughly the same decomposition, with arbitrary sign 111 | changes on the singular vectors. The sparse versions are about 1000 112 | times faster, as they do not coerce the matrix into a dense version. 113 | 114 | Compute the SVD of the matrix using one of the methods above. Plot the 115 | columns of the U matrix against each other: 1 vs 2, 2 vs 3, 1 116 | vs 3. Note that column 1 and 3 are correlated, with a long tail of 117 | movies. Investigate the names of these movies. What property can you 118 | infer about the top 6 movies in the tail w.r.t. column 1 and 3? Now 119 | look at the extremes of column 2 of U. 
What difference can you tell 120 | about the movies, between the smallest values and the largest values 121 | in column 2? 122 | 123 | Hint: there are a few movies which are in the `movies.csv` file, but 124 | are not in the `ratings.csv` file. I recommend to subset the list of 125 | movies first, which will help with this problem. 126 | -------------------------------------------------------------------------------- /large/sparse.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sparse data manipulation" 3 | author: "Michael Love" 4 | date: 11/7/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | In this last lecture note on large data manipulation in R, we change 12 | tactics a bit. Previously we discussed fast reading and subsetting 13 | with *data.table*, and the advantages of *SQLite* vs *HDF5* for 14 | storing collections of tables in a single file, and then working with 15 | these in R using the *RSQLite* and *rhdf5* libraries. Here we discuss 16 | an alternative approach for dealing with large arrays in which many 17 | of the features are equal to zero. There are special classes and 18 | methods in R that allow us to work with such data in a memory and 19 | computationally efficient manner. These data are typically referred to 20 | as *sparse* data, in that the non-zero elements of the array are 21 | sparse. We will focus in this lecture note on the classes in the 22 | *Matrix* package, and some functionality in the *glmnet* package for 23 | fitting regularized linear or generalized linear models to sparse 24 | feature matrices. 25 | 26 | # Representing sparse matrices 27 | 28 | Let's dive right into representing sparse matrices. Here we have a 29 | large-ish matrix wherein the non-zero elements make up only ~5% of the 30 | total: 31 | 32 | ```{r} 33 | m <- matrix(rbinom(1e6, 1, .05), ncol=1e3) 34 | m[1:5,1:5] 35 | sum(m) 36 | prod(dim(m)) 37 | ``` 38 | 39 | This matrix takes up about 4 Mb in memory: 40 | 41 | ```{r} 42 | print(object.size(m), units="Mb") 43 | ``` 44 | 45 | That's actually not so big that we encounter problems on a laptop 46 | computer, but if we multiply either or both of the dimensions by a 47 | factor of 1000, we will start to hit a limit in terms of working with 48 | the matrix. 49 | 50 | Let's get a sense of how much space we save if we represent this as a 51 | sparse matrix. 52 | 53 | ```{r} 54 | library(Matrix) 55 | mm <- Matrix(m, sparse=TRUE) 56 | mm[1:5,1:5] 57 | sum(mm) 58 | print(object.size(mm), units="Mb") 59 | as.numeric(object.size(m)/object.size(mm)) 60 | ``` 61 | 62 | The sparse version takes up less than 1/6 of the space of the *dense* 63 | version. 64 | 65 | # How to construct sparse matrices 66 | 67 | This coercion above, of a dense matrix into a sparse one doesn't make 68 | any sense: we would never want to first build the memory-intensive 69 | dense version of the matrix and then convert down to the sparse 70 | version. Instead, we would use the `sparseMatrix` function to build 71 | the matrix by specifying only the non-zero elements, and where they 72 | occur. 73 | 74 | First look up the help page for `sparseMatrix`: 75 | 76 | ```{r eval=FALSE} 77 | ?sparseMatrix 78 | ``` 79 | 80 | The most common way to construct a sparse matrix would be to specify 81 | `i`, `j`, and `x` (this last argument optional, if not included, the 82 | values will be equal to 1). 
83 | 84 | ```{r} 85 | s <- sparseMatrix(i=c(1,3,5),j=c(1,2,3),x=c(4,5,6),dims=c(6,4)) 86 | s 87 | ``` 88 | 89 | This creates an object of type `dgCMatrix`. Take a look at the help 90 | page for this class 91 | 92 | ```{r eval=FALSE} 93 | ?dgCMatrix-class 94 | ``` 95 | 96 | You can see that this class is *column-oriented* which means it should 97 | be faster to index columns of these objects than rows. Likewise, 98 | if we had not specified `x`, it would also be column-oriented by 99 | default, but instead it would be `ngCMatrix`. Let's do a 100 | microbenchmark to see the degree of difference. For this example, 101 | column indexing is about twice as fast. 102 | 103 | ```{r warning=FALSE} 104 | library(microbenchmark) 105 | n <- 1e3 106 | nn <- 1e5 107 | s <- sparseMatrix(i=sample(n,nn,TRUE), 108 | j=sample(n,nn,TRUE), 109 | dims=c(n,n)) 110 | microbenchmark(sum(s[,10]),sum(s[10,])) 111 | ``` 112 | 113 | # Manipulating sparse matrices 114 | 115 | We can do many operations to sparse matrices using specialized 116 | functions which are different than the ones defined for regular 117 | matrices. These are described in `?dgCMatrix-class`, but some of the 118 | important ones are `%*%`, `crossprod`, `tcrossprod`, `solve`, `qr`, 119 | `lu`. Using these operations will preserve the sparsity of the object 120 | (so keeping us under our memory budger), and will perform much faster 121 | than coercion to dense would, if the matrices have a high degree of 122 | sparsity. 123 | 124 | Note that some operations destroy the sparsity, such as adding 1, and 125 | therefore must be avoided (in the case where the dense matrix would 126 | not fit in memory): 127 | 128 | ```{r} 129 | s[1:10,1:10] + 1 130 | ``` 131 | 132 | Other operations maintain the sparsity: 133 | 134 | ```{r} 135 | s[1:10,1:10] * 2 136 | ``` 137 | 138 | We can also plot an `image`, which avoids creating the dense 139 | matrix: 140 | 141 | ```{r} 142 | image(s[1:100,1:100]) 143 | ``` 144 | 145 | # Use of sparse matrices in glmnet 146 | 147 | A common use case of sparse matrices is in prediction of a target, 148 | let's call `y`, using a high-dimensional, sparse matrix of features 149 | `x`. We are often in situation that there are more features in `x` 150 | than there are observations (rows of `x` and length of `y`). In this 151 | case it may make sense to first try linear modeling of `y` on `x`, 152 | and to use some combination of L1 and L2 regularization to stabilize 153 | the regression. The *glmnet* package allows one to fit elastic net 154 | models for such a problem, where the `x` matrix can be sparse, and it 155 | builds off of the sparse matrices defined in the *Matrix* 156 | package. Read over the help file for the main function: 157 | 158 | ```{r echo=FALSE} 159 | library(glmnet) 160 | ``` 161 | 162 | ```{r eval=FALSE} 163 | library(glmnet) 164 | ?glmnet 165 | ``` 166 | 167 | We can mock up some simulated data to show the kinds of models that 168 | can be fit using `glmnet`. Here we simulate 50 columns of `x` with a 169 | coefficient of `1` and the rest of the columns of `x` are not used in 170 | constructing `y`. 171 | 172 | ```{r} 173 | n <- 1e3 174 | nn <- 1e5 175 | x <- sparseMatrix(i=sample(n,nn,TRUE), 176 | j=sample(n,nn,TRUE), 177 | dims=c(n,n)) 178 | beta <- rep(c(1,0),c(50,950)) 179 | y <- x %*% beta + rnorm(n,0,.25) 180 | ``` 181 | 182 | Running `glmnet` gives us our regularization path. Here we set 183 | `alpha=1` which corresponds to only the L1 penalty (lasso). 
Plotting 184 | the regularization path reveals a range of `lambda` where the 50 185 | coefficients have been correctly identified (non-zero coefficients) 186 | while the rest of the coefficients have been shrunk to 0. 187 | 188 | Notably, for this lecture note, we never had to convert `x` into its 189 | dense form, thereby allowing much higher dimensions than would be 190 | possible if `glmnet` only took dense matrices as input. 191 | 192 | ```{r} 193 | fit <- glmnet(x, y, family="gaussian", alpha=1) 194 | plot(fit) 195 | ``` 196 | -------------------------------------------------------------------------------- /large/sqlite.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Working with RSQLite" 3 | author: "Michael Love" 4 | date: 11/7/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Introduction 12 | 13 | In the previous lecture note, we introduced the *data.table* package 14 | and showed how it can be used to read large datasets into R (so 15 | storing the dataset in memory), and then how specialized functions 16 | allow for fast subsetting and grouping/summarization operations. This 17 | works fairly well for many tasks on large tabular data until we hit 18 | the limit in terms of the size of dataset that can be read into 19 | memory. After we hit this memory limit, we can turn instead to on-disk 20 | storage of tables of data, and a convenient format for this is 21 | *SQLite*. A critical design point of *SQLite* (from 22 | the [Wikipedia](https://en.wikipedia.org/wiki/SQLite) page): 23 | 24 | > SQLite stores the entire database (definitions, tables, indices, 25 | > and the data itself) as a single cross-platform file on a host 26 | > machine. It implements this simple design by locking the entire 27 | > database file during writing. SQLite read operations can be 28 | > multitasked, though writes can only be performed sequentially. 29 | 30 | We will jump right in to trying out a connection to a *SQLite* 31 | database. We use the *RSQLite* package which provides an interface to 32 | the *SQLite* library, and the *DBI* package which provides a generic 33 | interface from R to various database backends. The following example 34 | is derived from the example code in `?SQLite`, which has additional 35 | information on working with *RSQLite*. 36 | 37 | The following will connect to a database `myDB.sqlite` and if it does 38 | not exist, it will create the file: 39 | 40 | ```{r} 41 | library(RSQLite) 42 | library(DBI) 43 | con <- dbConnect(SQLite(), "myDB.sqlite") 44 | con 45 | ``` 46 | 47 | If we wanted to try out the *RSQLite* package without writing a file to 48 | disk we could have also used `":memory:"` instead of writing a 49 | filename, which creates an in-memory database. 50 | 51 | # Write tables 52 | 53 | Let's write a table from R to the database. Typically you would most 54 | likely just be *reading* very large databases from *SQLite* rather 55 | than writing tables, but we do so as an example anyway: 56 | 57 | ```{r} 58 | data(mtcars) 59 | dbWriteTable(con, "cars", mtcars) 60 | dbListTables(con) 61 | ``` 62 | 63 | # Queries 64 | 65 | We can then pull rows of data from the table using standard SQL-style 66 | queries. If you've never performed SQL queries before, it's pretty 67 | easy to learn by example, and [w3schools](https://www.w3schools.com/sql/) 68 | has a reference for learning or reviewing if you haven't seen this in 69 | a while.
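To give a quick taste of the syntax before the simpler examples below, an aggregation query against the `cars` table we just created might look like the following sketch (the columns `cyl` and `mpg` come from `mtcars`; `COUNT`, `AVG`, `GROUP BY`, and `ORDER BY` are standard SQL):

```{r}
# average mpg and number of cars for each value of cyl,
# with the grouping and averaging done by SQLite rather than in R
dbGetQuery(con, "SELECT cyl, COUNT(*) AS n, AVG(mpg) AS avg_mpg
                 FROM cars GROUP BY cyl ORDER BY cyl")
```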
70 | 71 | The following pulls all rows from the `cars` table: 72 | 73 | ```{r} 74 | rows <- dbGetQuery(con, "SELECT * FROM cars") 75 | head(rows) 76 | nrow(rows) 77 | ``` 78 | 79 | We can also select subsets of the data easily: 80 | 81 | ```{r} 82 | rows <- dbGetQuery(con, "SELECT * FROM cars WHERE cyl=6") 83 | head(rows) 84 | nrow(rows) 85 | ``` 86 | 87 | # Fetch chunks of data 88 | 89 | However, the whole motivation in this lecture note was that we 90 | potentially have more data than can fit in memory, and so we can also 91 | fetch data from the table in *chunks*. Here we formulate a query `rs`, 92 | and then fetch 10 rows at a time with `dbFetch`: 93 | 94 | ```{r} 95 | rs <- dbSendQuery(con, "SELECT * FROM cars") 96 | d1 <- dbFetch(rs, n=10) 97 | dbHasCompleted(rs) 98 | ``` 99 | 100 | We can continue to fetch batches of 10 (or any number), or we can 101 | extract all remaining data by specifying -1: 102 | 103 | ```{r} 104 | d2 <- dbFetch(rs, n=-1) 105 | dbHasCompleted(rs) 106 | dbClearResult(rs) 107 | ``` 108 | 109 | Finally, we close the connection when we are finished working with the 110 | database: 111 | 112 | ```{r} 113 | dbDisconnect(con) 114 | ``` 115 | 116 | This short lecture note was to give a brief overview to the use of 117 | *RSQLite* as an interface to *SQLite* on-disk databases. These are 118 | very powerful ways to work with large data when the datasets no longer 119 | fit into memory, or generally as a way to share datasets as a single 120 | file and in a format that is incredibly widely used and well 121 | tested. We do not teach SQL queries in this class, as these are fairly 122 | easy to learn on your own through reading over example queries or 123 | trying them out on example datasets as shown here. 124 | 125 | ```{r echo=FALSE} 126 | # this hidden chunk to make the example work from the top... 
127 | con <- dbConnect(SQLite(), "myDB.sqlite") 128 | dbRemoveTable(con, "cars") 129 | dbDisconnect(con) 130 | ``` 131 | -------------------------------------------------------------------------------- /mcmc/alzheimers.dat: -------------------------------------------------------------------------------- 1 | subject month words 2 | 1 1 9 3 | 1 2 12 4 | 1 3 16 5 | 1 4 17 6 | 1 5 18 7 | 2 1 6 8 | 2 2 7 9 | 2 3 10 10 | 2 4 15 11 | 2 5 16 12 | 3 1 13 13 | 3 2 18 14 | 3 3 14 15 | 3 4 21 16 | 3 5 21 17 | 4 1 9 18 | 4 2 10 19 | 4 3 12 20 | 4 4 14 21 | 4 5 15 22 | 5 1 6 23 | 5 2 7 24 | 5 3 8 25 | 5 4 9 26 | 5 5 12 27 | 6 1 11 28 | 6 2 11 29 | 6 3 12 30 | 6 4 14 31 | 6 5 16 32 | 7 1 7 33 | 7 2 10 34 | 7 3 11 35 | 7 4 12 36 | 7 5 14 37 | 8 1 8 38 | 8 2 18 39 | 8 3 19 40 | 8 4 19 41 | 8 5 22 42 | 9 1 3 43 | 9 2 3 44 | 9 3 3 45 | 9 4 7 46 | 9 5 8 47 | 10 1 4 48 | 10 2 10 49 | 10 3 11 50 | 10 4 17 51 | 10 5 18 52 | 11 1 11 53 | 11 2 10 54 | 11 3 10 55 | 11 4 15 56 | 11 5 16 57 | 12 1 1 58 | 12 2 3 59 | 12 3 2 60 | 12 4 4 61 | 12 5 5 62 | 13 1 6 63 | 13 2 7 64 | 13 3 7 65 | 13 4 9 66 | 13 5 10 67 | 14 1 0 68 | 14 2 3 69 | 14 3 3 70 | 14 4 4 71 | 14 5 6 72 | 15 1 18 73 | 15 2 18 74 | 15 3 19 75 | 15 4 22 76 | 15 5 22 77 | 16 1 15 78 | 16 2 15 79 | 16 3 15 80 | 16 4 18 81 | 16 5 19 82 | 17 1 10 83 | 17 2 14 84 | 17 3 16 85 | 17 4 17 86 | 17 5 19 87 | 18 1 6 88 | 18 2 6 89 | 18 3 7 90 | 18 4 9 91 | 18 5 10 92 | 19 1 9 93 | 19 2 9 94 | 19 3 13 95 | 19 4 16 96 | 19 5 20 97 | 20 1 4 98 | 20 2 3 99 | 20 3 4 100 | 20 4 7 101 | 20 5 9 102 | 21 1 4 103 | 21 2 13 104 | 21 3 13 105 | 21 4 16 106 | 21 5 19 107 | 22 1 10 108 | 22 2 11 109 | 22 3 13 110 | 22 4 17 111 | 22 5 21 112 | -------------------------------------------------------------------------------- /mcmc/envelope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biodatascience/statcomp_src/558195d8bb181b582263678914107f12872a4443/mcmc/envelope.png -------------------------------------------------------------------------------- /mcmc/mcmc_HW8.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HW 8 - MCMC" 3 | author: "Naim Rashid" 4 | date: "2/20/2019" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # Importance and Rejection Sampling 13 | 14 | Consider finding $\sigma^2 = E[X^2]$ when $X$ has the density that is proportional to $q(x) =\exp({−|x|^3/3})$. Write the form of the integral pertaining to this expectation below, assuming the normalizing constant for $q(x)$ is $c$. 15 | 16 | Solution: 17 | 18 | 19 | ## Find the estimate for $\sigma^2$ using importance sampling. 20 | 21 | Write the expression for the integral using an importance sampling scheme, specifying your choice for the importance sampling density $g$ and reexpress the integral in terms of $X^2$, $g(x)$ and $q(x)$. Then, write the form of the approximation of this integrand based on the importance samples drawn from $g$. Then, carry out this approximation of the expectation in the code below. 22 | 23 | ```{r} 24 | ## Solution: write any helper functions here (q(x), etc) 25 | 26 | ## End Solution 27 | 28 | # set M 29 | M = 10^5 30 | 31 | ## Solution: place primary code evaluating the expectation here with importance sampling 32 | 33 | ``` 34 | 35 | ## Repeat the procedure using rejection sampling. 
36 | 37 | Similar to before, write out the form of the integral and then its approximation, this time based on sampling from the target density $q(x)$. Then, carry out this approximation of the expectation in the code below. 38 | 39 | Hint: We just need to show that $e$ exceeds the target density everywhere, and that the distribution $g$ that we draw samples from only differs from $e$ by a constant $\alpha$. 40 | 41 | 42 | ```{r} 43 | 44 | # set M 45 | M = 10^5 46 | 47 | ## Solution: place primary code evaluating the expectation here with rejection sampling 48 | 49 | 50 | ## End Solution 51 | 52 | 53 | 54 | ``` 55 | 56 | 57 | -------------------------------------------------------------------------------- /mcmc/mcmc_HW9.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HW 9 - MCMC" 3 | author: "Naim Rashid" 4 | date: "2/20/2019" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # Maximization of poisson GLMM from lecture 13 | 14 | Lets now maximize the poisson GLMM model given in lecture, now using an MCEM approach. In a previous HW, you used numerical integration tools to approximate the likelihood for this model, then applied numerical optimization to obtain the estimates of the model parameters. 15 | 16 | Here we wish to use MCEM, where in lecture we have already gone over a similar implementation using a rejection sampler in the E-step. 17 | 18 | For this HW, please use a Metropolis Hastings Random Walk proposal distribution to approximate the Q-function in the E-step. Specify your proposal distribution. Write functions implementing the E-step, the M-step, and then write the main code for the MCEM algorithm. 19 | 20 | Feel free to reuse/modify the lecture code to do this. However, you can implement the M-step and other parts of the EM algorithm however is most convenient to you. Not required by any means, but it may be helpful from a speed perspective to recode the sampler into Rcpp. 
21 | 22 | ```{r} 23 | ## Solution: place relevant helper functions pertaining to the E step here 24 | 25 | ## End Solution 26 | 27 | 28 | ## Solution: place relevant helper functions pertaining to the M step here 29 | 30 | ## End Solution 31 | 32 | 33 | ## Solution: place primary code for the MCEM algorithm here, calling functions in the above two sections 34 | ## Remember to print your primary results and use the following starting values, and evaluate chain diagnostics for the final model 35 | 36 | # set initial parameters 37 | tol = 10^-5 38 | maxit = 100 39 | iter = 0 40 | eps = 10000 41 | qfunction = -10000 # using Qfunction for convergence 42 | 43 | # starting values, taken from rejection sampling example 44 | beta = c(1.804, 0.165) 45 | s2gamma = 0.000225 + .01 46 | 47 | # Length of chain 48 | M = 10000 49 | 50 | # burn in 51 | burn.in = 2000 52 | 53 | 54 | 55 | ## End Solution 56 | 57 | 58 | ``` 59 | -------------------------------------------------------------------------------- /mcmc/mixture.dat: -------------------------------------------------------------------------------- 1 | y 2 | 7.25325222659913 3 | 6.85652267046824 4 | 7.23643792894966 5 | 7.03343611519664 6 | 6.9186591609056 7 | 6.65649879051228 8 | 6.42308043084932 9 | 7.46636287619574 10 | 10.3497865413661 11 | 6.93593298389149 12 | 6.83974994639286 13 | 10.1477534866707 14 | 7.18844547660898 15 | 8.79161716373787 16 | 6.77135115622428 17 | 9.89206349173715 18 | 10.6292620609587 19 | 6.17109362928208 20 | 9.44878709751433 21 | 7.12422462946795 22 | 6.75066335182976 23 | 7.42808832040163 24 | 9.4949511197615 25 | 6.74956775652862 26 | 9.46445384762244 27 | 7.27348041082583 28 | 6.98896265672564 29 | 7.26262394415349 30 | 6.94244760575449 31 | 7.28846831817204 32 | 9.70904705672207 33 | 10.9878054216487 34 | 7.45111574465272 35 | 6.97036693452533 36 | 6.53291089305878 37 | 6.52220443343591 38 | 6.10163473472885 39 | 10.2820394025033 40 | 8.13866685075031 41 | 9.51099560173583 42 | 7.74154300863383 43 | 6.14372115300404 44 | 6.68548657458669 45 | 10.8689484723994 46 | 6.73064827757487 47 | 10.5866677031468 48 | 9.56384573435206 49 | 6.99562383496413 50 | 6.18576529608999 51 | 10.5254577115642 52 | 9.44970647261562 53 | 9.84118730329914 54 | 7.21312721539275 55 | 7.83136245649827 56 | 10.2825737379552 57 | 6.36363038852991 58 | 6.74285813989089 59 | 6.98035358880607 60 | 6.92964132433787 61 | 6.84202550407557 62 | 7.38016635912841 63 | 9.78362588855716 64 | 7.58508950152404 65 | 9.50912675753626 66 | 6.55132388271122 67 | 6.88852617303433 68 | 7.90209596243866 69 | 7.01301915336949 70 | 9.93871470060288 71 | 7.51086841627442 72 | 6.67441840075985 73 | 6.10225392630878 74 | 7.40059858311095 75 | 7.30520952023732 76 | 7.21041994910681 77 | 10.519655486735 78 | 10.1288125396083 79 | 7.04575576918983 80 | 9.8008750675778 81 | 10.3528228749625 82 | 7.25796001419125 83 | 7.67456458165231 84 | 6.52507043820612 85 | 6.57628671699353 86 | 6.52670898857082 87 | 7.44576437202348 88 | 10.5860658476161 89 | 6.97650854570433 90 | 6.89904378626167 91 | 7.48642531607213 92 | 6.39227547871176 93 | 7.32020055238666 94 | 9.64975498620833 95 | 6.92112490660115 96 | 10.4213997762887 97 | 6.3455538999162 98 | 6.99875637216512 99 | 6.98618105048244 100 | 5.96048736574766 101 | 10.2852677946904 102 | -------------------------------------------------------------------------------- /ml/essentials_HW11.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 11 - Machine learning essentials" 
3 | author: "your name here" 4 | date: "`r format(Sys.time(), '%m/%d/%Y')`" 5 | output: html_document 6 | --- 7 | 8 | # Use of `caret` with various methods 9 | 10 | Run three machine learning models over the following training dataset 11 | with features `x` and labels `y`. You can use default tuning, e.g. 12 | bootstrap based resampling for tuning, as set by `trainControl`. 13 | 14 | * SVM with radial kernel `"svmRadial"` 15 | * Random forest `"rf"` 16 | * Gradient boosting machine `"gbm"` (use `verbose=FALSE`) 17 | 18 | Record the time to train, and the best Kappa value for each method 19 | over the tuning grid (`rf` does not use tuning parameters via 20 | `train` for this dataset). Which method obtains the best Kappa? 21 | 22 | Finally, make a `pointrange` plot (see `geom_pointrange`), with the 23 | optimal Kappa and the SD for the optimal Kappa. Is there a clear 24 | winner, or all the methods mostly overlapping? 25 | 26 | ```{r} 27 | data(faithful) 28 | n <- nrow(faithful) 29 | faithful <- data.frame(lapply(faithful, scale)) 30 | plot(faithful) 31 | faithful$cl <- factor(kmeans(faithful, centers=2)$cluster) 32 | plot(faithful[,1:2], col=faithful$cl) 33 | # make it more challenging 34 | set.seed(1) 35 | faithful[,1] <- faithful[,1] + rt(n,df=5)/2 36 | faithful[,2] <- faithful[,2] + rt(n,df=5)/2 37 | plot(faithful[,1:2], col=faithful$cl) 38 | x <- faithful[,1:2] 39 | y <- faithful[,3] 40 | ``` 41 | 42 | -------------------------------------------------------------------------------- /ml/rf.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Random forests" 3 | author: "Michael Love" 4 | date: 12/14/2018 5 | output: html_document 6 | --- 7 | 8 | In this lecture note we will introduce classification and regression 9 | trees (CART), and *random forests*, which incorporate trees into a 10 | bagging ensemble method. We will first describe the dataset that we 11 | will use in the following section as a motivating example. 12 | 13 | ```{r include=FALSE} 14 | knitr::opts_chunk$set(cache=TRUE) 15 | ``` 16 | 17 | # APS Failure data set 18 | 19 | To begin, we download the 20 | [APS Failure at Scania Trucks Data Set](https://archive.ics.uci.edu/ml/datasets/APS+Failure+at+Scania+Trucks) 21 | from the 22 | [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). 23 | 24 | The dataset description is: 25 | 26 | > The dataset consists of data collected from heavy Scania 27 | > trucks in everyday usage. The system in focus is the 28 | > Air Pressure system (APS) which generates pressurised 29 | > air that are utilized in various functions in a truck, 30 | > such as braking and gear changes. The datasets' 31 | > positive class consists of component failures 32 | > for a specific component of the APS system. 33 | > The negative class consists of trucks with failures 34 | > for components not related to the APS. The data consists 35 | > of a subset of all available data, selected by experts. 36 | 37 | The dataset has additional information associated with it. It is 38 | mentioned that a false positive (cost 1) has a cost of 10, while a 39 | false negative (cost 2) has a cost of 500. So false negatives are 50x 40 | more costly as false positives. 41 | 42 | > In this case Cost 1 refers to the cost that an unnessecary 43 | > check needs to be done by an mechanic at an workshop, while 44 | > Cost 2 refer to the cost of missing a faulty truck, 45 | > which may cause a breakdown. 
46 | 47 | There is an imbalance in the dataset, such that there are 59x more 48 | negative observations than positive observations: 49 | 50 | > The training set contains 60000 examples in total in which 51 | > 59000 belong to the negative class and 1000 positive class. 52 | > The test set contains 16000 examples. 53 | 54 | There are 170 predictors plus the outcome (`class`): 55 | 56 | > The attribute names of the data have been anonymized for 57 | > proprietary reasons. It consists of both single numerical 58 | > counters and histograms consisting of bins with different 59 | > conditions. 60 | > ... 61 | > The attributes are as follows: class, then 62 | > anonymized operational data. The operational data have 63 | > an identifier and a bin id, like 'Identifier_Bin'. 64 | > In total there are 171 attributes, of which 7 are 65 | > histogram variabels. Missing values are denoted by 'na'. 66 | 67 | We begin by exploring the training data. Note that we have variable 68 | amount of missing data across the columns: 69 | 70 | ```{r} 71 | library(readr) 72 | col_types <- paste0(c("c",rep("n",170)),collapse="") 73 | dat <- read_csv("aps_failure_training_set.csv", skip=20, na="na", col_types=col_types) 74 | table(dat$class) 75 | table(sapply(dat[,-1], class)) 76 | summary(sapply(dat[,-1], function(z) sum(is.na(z)))) 77 | ``` 78 | 79 | The very first column has decent discrimination of the outcome: 80 | 81 | ```{r} 82 | with(dat, boxplot(aa_000 ~ class)) 83 | with(dat, boxplot(ab_000 ~ class)) 84 | ``` 85 | 86 | ## Imbalanced data and unequal costs 87 | 88 | There is a technical report, 89 | [Using Random Forest to Learn Imbalanced Data](https://statistics.berkeley.edu/sites/default/files/tech-reports/666.pdf), 90 | with Leo Breiman, the creator of the random forest, as a co-author, 91 | which goes into detail on how to use random forests to deal with 92 | imbalanced data. Here we have many more negative examples than 93 | positives, although we want to have very high sensitivity (false 94 | negatives are very costly, relative to false positives). One of the 95 | strategies mentioned in the technical report is to down-sample the 96 | majority class. For convenience, we will take this approach for this 97 | lecture note, producing equal classes by down-sampling the 98 | negatives. However, given that we have explicit costs associated with 99 | the types of error, the down-sampling may not be sufficient, and a 100 | method with customizable loss function may actually be preferred for 101 | this dataset. 102 | 103 | We note that another approach aside from down-sampling is to use the 104 | `classwt` parameter in the `randomForest` function from the package of 105 | the same name. The report mentions that, of the class weights and the 106 | down-sampling approaches, "We show that both of our methods have 107 | favorable prediction performance." 108 | 109 | ```{r} 110 | set.seed(1) 111 | idx.dat <- c(which(dat$class == "pos"), 112 | sample(which(dat$class == "neg"), 1000)) 113 | dat2 <- dat[idx.dat,] 114 | table(dat2$class) 115 | with(dat2, boxplot(aa_000 ~ class)) 116 | ``` 117 | 118 | Some quick examination of the quantiles of the first variable for each 119 | class: 120 | 121 | ```{r} 122 | by(dat2$aa_000, dat2$class, quantile, .5) 123 | by(dat2$aa_000, dat2$class, quantile, .9) 124 | ``` 125 | 126 | ## Picking a cut point for a single variable 127 | 128 | We can already start to motivate decision trees by looking at this 129 | first variable. 
As it seems to discriminate the groups pretty well, 130 | what if we put a cut point at an arbitrary value, say 75,000. How 131 | would our prediction on the training data turn out? We end up with 132 | about 92% prediction accuracy: 133 | 134 | ```{r} 135 | (tab <- table(obs=dat2$class, pred=dat2$aa_000 > 75000)) 136 | round(prop.table(tab), 2) 137 | (cost <- tab["neg","TRUE"] * 10 + tab["pos","FALSE"] * 500) 138 | ``` 139 | 140 | We can also try a much higher value of the first variable to define a 141 | cut point. This gives us 93% accuracy, but actually the total cost 142 | nearly doubles, because we have more false negatives: 143 | 144 | ```{r} 145 | (tab <- table(obs=dat2$class, pred=dat2$aa_000 > 125000)) 146 | round(prop.table(tab), 2) 147 | (cost <- tab["neg","TRUE"] * 10 + tab["pos","FALSE"] * 500) 148 | ``` 149 | 150 | We will come back to the unequal costs later in the note, by examining 151 | a continuous prediction score and finding a new cut point which 152 | minimizes the cost. But for now, we will focus instead on prediction 153 | accuracy and Cohen's kappa as the metric. If we wanted to also 154 | incorporate unequal costs into our parameter tuning, we could define a 155 | [alternate performance metric](https://topepo.github.io/caret/model-training-and-tuning.html#alternate-performance-metrics). 156 | 157 | ## Imputing missing values 158 | 159 | Before we dive into decision trees and random forests, we need to 160 | clean up the predictors a bit. The *caret* package offers imputation 161 | of missing values using k-nearest neighbors, via the `preProcess` 162 | function. We train a `preProcess` fit similar to how we use `train`, 163 | and then apply it to the training data using `predict`. 164 | 165 | **Note:** because the dataset has many predictors (170 possible), it 166 | would take a long time to run a random forest on the entire set of 167 | features (about half an hour on the 2000 observation training 168 | set). For demonstration, we subset to the first 20 predictors, which 169 | then can be fit in a few minutes. For a real application, one would 170 | instead use all of the predictors. 171 | 172 | ```{r} 173 | library(caret) 174 | x <- as.data.frame(dat2[,2:21]) 175 | y <- factor(dat2$class) 176 | summary(sapply(x, function(z) sum(is.na(z)))) 177 | ppfit <- preProcess(x, method=c("center","scale","knnImpute")) 178 | x <- predict(ppfit, x) 179 | summary(sapply(x, function(z) sum(is.na(z)))) 180 | ``` 181 | 182 | # Decision trees 183 | 184 | Decision trees are a class of classification algorithms which subject 185 | the observations to a series of binary decisions based on the 186 | predictors, with the goal of separating the classes in the *leaves* of 187 | the tree. If all of the samples begin at the root node (typically 188 | drawn at the top), then each internal node represents one of the 189 | binary decisions. The terminal nodes, or leaves, are drawn at the 190 | bottom, and the goal is to achieve high *purity* of the classes in the 191 | leaves. Purity can be measured by multiple metrics, typically using the 192 | [Gini impurity](https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity), or the 193 | [entropy](https://en.wikipedia.org/wiki/Decision_tree_learning#Information_gain). The 194 | *CART* (classification and regression trees) method that we will use 195 | here makes use of the Gini impurity, which is similar to the 196 | definition of Cohen's kappa. 
The Gini impurity is the probability of 197 | misclassification using random labeling (but proportional to the 198 | distribution of labels). If we have *K* classes, each with probability 199 | $p_i$, the Gini impurity can be calculated as: 200 | 201 | $$ GI = \sum_{i=1}^K p_i (1 - p_i) = 1 - \sum_{i=1}^K p_i^2 $$ 202 | 203 | With two classes, this becomes: 204 | 205 | $$ GI = 1 - p_1^2 - (1 - p_1)^2 $$ 206 | 207 | $$ = 2 p_1 - 2 p_1^2 $$ 208 | 209 | ```{r echo=FALSE} 210 | plot(function(p) 2*p - 2*p^2, xlim=c(0,1), ylim=c(0,1), 211 | xlab=expression(p[1]), main="Gini impurity") 212 | ``` 213 | 214 | From this function we can see that an algorithm will try to create 215 | leaves where the probabilities for a given class are close to 0 216 | or 1. A decision tree is built using an algorithm as follows: each 217 | node is recursively partitioned using a splitting rule. To consider 218 | whether to split a node, one can consider the gain (impurity of parent 219 | minus the impurity of the proposed child nodes): 220 | 221 | $$ \textrm{gain} = I(\textrm{parent}) - \sum_{j=1}^2 \frac{N(j)}{N} I(j) $$ 222 | 223 | where *N* is the number of observations at the parent node, $N(j)$ is 224 | the number of observations at the *j*th child node, and $I(j)$ is the 225 | impurity at the *j*th child node. Here, as we are considering binary 226 | splits, we sum over the two proposed child nodes. Instead of 227 | attempting to find the optimal decision tree, a greedy algorithm is 228 | used to find the optimal split using the covariates for each terminal 229 | node. There are various options for stopping criteria. Some of these 230 | can be found in `?rpart.control`: 231 | 232 | > minsplit: the minimum number of observations that must exist in a node 233 | > in order for a split to be attempted. 234 | > 235 | > minbucket: the minimum number of observations in any terminal ‘’ 236 | > node. If only one of ‘minbucket’ or ‘minsplit’ is specified, 237 | > the code either sets ‘minsplit’ to ‘minbucket*3’ or 238 | > ‘minbucket’ to ‘minsplit/3’, as appropriate. 239 | 240 | Here we can demonstrate building a tree with our reduced 20 variable 241 | dataset. We use the `rpart` function from the *rpart* package, and 242 | then plot with a nice function from the *rpart.plot* package. We also 243 | print the "Complexity Parameter" table, which offers a tuning variable 244 | `cp`. The `xerror` column in the table gives the cross-validation 245 | error, where cross-validation is being performed within `rpart`. 246 | 247 | ```{r} 248 | library(rpart) 249 | library(rpart.plot) 250 | df <- data.frame(class=y,x) 251 | rfit <- rpart(class ~ ., data=df) 252 | printcp(rfit) 253 | rpart.plot(rfit) 254 | ``` 255 | 256 | Taking a look at the tree above, each node gives a label, the fraction 257 | of positive observations in the node, and the percent of observations 258 | at that node. Below the node is the decision rule that decides the 259 | split: the variable name and the critical value that was chosen by the 260 | greedy algorithm. A decision tree can use any variable at any step in 261 | the process, so they can generate, for example, interactions between 262 | variables or step-like functions of a single variable. We will see in 263 | a final example how decision trees can approximate any smooth function 264 | as long as the tree is given enough splits and data. 265 | 266 | We can also prune back the tree to one with a higher complexity 267 | parameter (think of CP as a penalty on complex trees). 
268 | 269 | ```{r} 270 | rfit$cptable 271 | # try a higher complexity parameter 272 | cp <- rfit$cptable[2,"CP"] 273 | # this is the minimizing complexity parameter: 274 | # cp <- rfit$cptable[which.min(rfit$cptable[,"xerror"]),"CP"] 275 | pfit <- prune(rfit, cp=cp) 276 | rpart.plot(pfit) 277 | ``` 278 | 279 | This results in a tree with a single split, but which nevertheless has 280 | leaves with high purity. 281 | 282 | # Ensemble methods 283 | 284 | A quick motivation for why ensembles of simple learners like decisions 285 | trees would do well is to consider the law of large numbers. Suppose 286 | we have a decision tree which will mis-classify a given observation 287 | with fixed probability 0.35. And then suppose we build 25 such trees, 288 | *and each tree is independent*. Then using the binomial density, we can 289 | see that a majority vote of the ensemble will do much better than any 290 | individual tree: 291 | 292 | ```{r} 293 | ntree <- 25 294 | p <- 0.35 295 | trees <- function() rbinom(ntree, 1, p) 296 | # one ensemble vote: 297 | sum(trees()) >= ceiling(ntree/2) 298 | # probability the ensemble will mis-classify: 299 | pbinom(floor(ntree/2), ntree, p, lower.tail=FALSE) 300 | ``` 301 | 302 | However, we will not necessarily be able to generate *independent* 303 | classifiers as they use the same covariates and training data. This 304 | motivates the method of random forests as well. 305 | 306 | # Random forests 307 | 308 | Random forests are ensembles of decision trees which are built both by 309 | bootstrapping the observations (bagging) as well as randomly 310 | subsetting the set of predictors that are used to build a tree. The 311 | trees are grown without pruning. And the final prediction is made by a 312 | majority vote of the ensemble of trees. The combination of 313 | bootstrapping the samples, and subsampling the predictors will lead to 314 | reduction in the correlation of the trees, which will help if one 315 | considers the logic in the previous section with wholly *independent* 316 | classifiers. 317 | 318 | Here we will use the *randomForest* package for constructing 319 | trees. From `?randomForest` we can see the rules for how many 320 | predictors are used in each tree: 321 | 322 | > mtry: Number of variables randomly sampled as candidates at each 323 | > split. Note that the default values are different for 324 | > classification (sqrt(p) where p is number of variables in 325 | > ‘x’) and regression (p/3) 326 | 327 | We will call for a random forest to be built using this library, by 328 | specifying `method="rf"` from the *caret* interface. We specify that 329 | we want to retain the predictions. Note that when we run `train` we 330 | are doing two levels of bootstrap resampling: there is the 331 | bootstrapping that `train` performs for all methods (unless 332 | cross-validation or an addition `method` is specified to 333 | `trainControl`), as well as the bootstrapping for individual trees 334 | within the forests. The number of bootstraps is controlled by `number` 335 | in `trainControl` for the former (default is 25) and by `ntree` in the 336 | *randomForest* package for the latter (default is 500). 
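Neither level needs to be changed for the fit below, but to make the two levels of resampling explicit, a call could look like the following sketch (not run here; `method="boot"` and `number` are arguments to `trainControl`, `mtry` is the tuning parameter supplied via `tuneGrid`, and `ntree` is passed through `...` to the underlying `randomForest` function):

```{r eval=FALSE}
# 25 bootstrap resamples for tuning (the caret default, written out),
# a small grid of mtry values, and 500 trees per forest
trCtl <- trainControl(method="boot", number=25, savePredictions=TRUE)
tg <- data.frame(mtry=c(2,5,10))
fit <- train(x, y, method="rf", trControl=trCtl, tuneGrid=tg, ntree=500)
```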
337 | 338 | ```{r} 339 | trCtl <- trainControl(savePredictions=TRUE) 340 | fit <- train(x, y, method="rf", trControl=trCtl) # ~150 s 341 | ``` 342 | 343 | We can see that the forest achieves fairly high accuracy and kappa at 344 | various levels of `mtry`: 345 | 346 | ```{r} 347 | fit$results 348 | ``` 349 | 350 | Random forest comes with a measure of variable importance (described 351 | in `?importance`: 352 | 353 | > For each tree, the prediction error on the out-of-bag portion of the data is 354 | > recorded (error rate for classification, MSE for regression). Then 355 | > the same is done after permuting each predictor variable. The 356 | > difference between the two are then averaged over all trees, and 357 | > normalized by the standard deviation of the differences. If the 358 | > standard deviation of the differences is equal to 0 for a variable, 359 | > the division is not done (but the average is almost always equal to 360 | > 0 in that case). 361 | > 362 | > The second measure is the total decrease in node impurities from 363 | > splitting on the variable, averaged over all trees. For 364 | > classification, the node impurity is measured by the Gini index. For 365 | > regression, it is measured by residual sum of squares. 366 | 367 | We can see that the random forests built by *caret* use the second 368 | measure of importance by default: 369 | 370 | ```{r} 371 | imp <- fit$finalModel$importance 372 | head(imp) 373 | dotplot(tail(sort(imp[,1]),10), xlab="Mean Decrease Gini") 374 | ``` 375 | 376 | # Returning to unequal costs 377 | 378 | We previously used down-sampling as a method to address the fact that 379 | we had many more negative examples than positive examples, although 380 | the false negatives should be counted as 50x more costly than the 381 | false positives. We can now return to this question, to see if the 382 | default prediction by the trained random forest is optimal, or if we 383 | should re-calibrate to minimize total cost further (relative to a 384 | given distribution of positives and negatives). 385 | 386 | Note that, if one were to use a test set as below to re-calibrate a 387 | predictor based on minimizing costs in a given dataset, one would want 388 | to use a final held-out test set to assess cost in a new dataset. 
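As an aside, instead of re-calibrating the cut point after training, one could also tune directly on a cost-based metric by supplying a custom `summaryFunction` to `trainControl`, as mentioned earlier in this note. A rough sketch (not run; `costSummary` is a made-up name, the `obs` and `pred` columns are supplied to the summary function by *caret*, and 10/500 are the costs given with the dataset):

```{r eval=FALSE}
costSummary <- function(data, lev=NULL, model=NULL) {
  # total cost: false positives cost 10, false negatives cost 500
  fp <- sum(data$obs == "neg" & data$pred == "pos")
  fn <- sum(data$obs == "pos" & data$pred == "neg")
  c(Cost = 10 * fp + 500 * fn)
}
trCtl <- trainControl(summaryFunction=costSummary, savePredictions=TRUE)
fit <- train(x, y, method="rf", trControl=trCtl,
             metric="Cost", maximize=FALSE)
```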
389 | 390 | We read in a held-out test set: 391 | 392 | ```{r} 393 | col_types <- paste0(c("c",rep("n",170)),collapse="") 394 | dat.test <- read_csv("aps_failure_test_set.csv", skip=20, na="na", col_types=col_types) 395 | ``` 396 | 397 | And we can again impute the missing values using the previously 398 | generated pre-processing rule: 399 | 400 | ```{r} 401 | summary(sapply(dat.test[,-1], function(z) sum(is.na(z)))) 402 | x.test <- as.data.frame(dat.test[,2:21]) 403 | y.test <- factor(dat.test$class) 404 | table(y.test) 405 | x.test <- predict(ppfit, x.test) 406 | ``` 407 | 408 | We then classify the test set and also generate class probabilities, 409 | by setting `type="prob"`: 410 | 411 | ```{r} 412 | y.pred <- predict(fit, x.test) 413 | y.prob <- predict(fit, x.test, type="prob") 414 | confusionMatrix(data=y.pred, reference=y.test) 415 | ``` 416 | 417 | We can create a function which evaluates the total costs for a given 418 | cut point, and run this over the test set predictions: 419 | 420 | ```{r} 421 | test.pred <- data.frame(obs=y.test, pos=y.prob$pos) 422 | costFn <- function(cut, df) { 423 | tab <- table(obs=df$obs, pred=factor(df$pos >= cut, c("FALSE","TRUE"))) 424 | cost <- tab["neg","TRUE"] * 10 + tab["pos","FALSE"] * 500 425 | cost 426 | } 427 | costFn(cut=.5, test.pred) 428 | s <- seq(from=0.1, to=.9, length=100) 429 | costs <- sapply(s, costFn, df=test.pred) 430 | plot(s, costs, type="l", lwd=3, col="dodgerblue") 431 | ``` 432 | 433 | Note that the default prediction (class probability > 0.5) is doing 434 | pretty well at minimizing costs, although it appears adopting a rule 435 | closer to 0.6 would do a bit better: 436 | 437 | ```{r} 438 | s <- seq(from=0.45, to=0.75, length=100) 439 | costs <- sapply(s, costFn, df=test.pred) 440 | plot(s, costs, type="l", lwd=3, col="dodgerblue") 441 | ``` 442 | 443 | # Regression trees 444 | 445 | Finally, we demonstrate with some simulated data that trees and 446 | forests can be applied to continuous data as well. Instead of focusing 447 | on decreasing impurity in the leaves, the trees are focused on 448 | reducing the variance of a continuous target in each leaf, again 449 | splitting using the predictors. If we try to learn a sine function, we 450 | can see that a tree with very low complexity penalty can learn an 451 | arbitrary shape by splitting along `x` (here a one dimensional 452 | surface, but consider how a multi-dimensional surface could be 453 | approximated as well). 
454 | 455 | ```{r} 456 | library(rpart) 457 | df <- data.frame(x=runif(1000)) 458 | df$y <- sin(2 * pi * df$x) 459 | with(df, plot(x,y)) 460 | rfit <- rpart(y ~ x, data=df, method="anova", cp=.001) 461 | rpart.plot(rfit) 462 | ``` 463 | 464 | ```{r} 465 | pred <- predict(rfit, df) 466 | with(df, plot(x,y)) 467 | points(df$x, pred, col="red") 468 | ``` 469 | 470 | By bagging and subsetting predictors in a random forest, we get a more 471 | complex shape, much closer to the true distribution of the target: 472 | 473 | ```{r} 474 | df <- data.frame(x=runif(200)) 475 | df$y <- sin(2 * pi * df$x) 476 | trCtl <- trainControl(method="cv", number=5, savePredictions=TRUE) 477 | tg <- data.frame(mtry=1) 478 | rf.fit <- train(df["x"], df$y, method="rf", trControl=trCtl, tuneGrid=tg) 479 | with(df, plot(x,y)) 480 | pred <- rf.fit$pred[order(rf.fit$pred$rowIndex),] 481 | points(df$x, pred$pred, col="red") 482 | ``` 483 | 484 | Finally, we note that the random forest approach to predicting this 485 | continuous function may look similar to a k-nearest-neighbors 486 | approach, although the random forest approach will scale to higher 487 | dimensions, while k-nearest-neighbors will begin to breakdown, due to 488 | difficulties posed by 489 | [distance functions](https://en.wikipedia.org/wiki/Curse_of_dimensionality#Distance_functions) 490 | in high dimensions. 491 | 492 | ```{r} 493 | trCtl <- trainControl(method="cv", number=5, savePredictions=TRUE) 494 | kfit <- train(df["x"], df$y, method="knn", trControl=trCtl) 495 | kfit$results 496 | with(df, plot(x,y)) 497 | pred <- kfit$pred[kfit$pred$k == 5,] 498 | pred <- pred[order(pred$rowIndex),] 499 | points(df$x, pred$pred, col="red") 500 | ``` 501 | 502 | * This lecture note borrows from (offline) machine learning course 503 | notes of [Guenther Walther](http://statweb.stanford.edu/~gwalther/). 504 | -------------------------------------------------------------------------------- /ml/svm.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Support vector machines" 3 | author: "Michael Love" 4 | date: 12/15/2018 5 | output: html_document 6 | --- 7 | 8 | Support vector machines (SVM) are a very popular machine learning 9 | method for binary classification, which can be solved efficiently even 10 | for large datasets. SVM have a number of desirable features, 11 | including: ability to perform classification in a non-linear space 12 | using kernels, tend to have good generalization to new data, and work 13 | well in very high dimensional space. 14 | 15 | # First look at SVM 16 | 17 | Here we will begin by demonstrating how SVM differ from other linear 18 | classifiers, such as LDA (linear discriminant analysis) on simulated 19 | data (two dimensional, so we can easily visualize the hyperplane that 20 | defines the classification boundary). We will then show the objective 21 | function that the SVM minimizes (in the completely separable case), 22 | show how this can be solved with quadratic programming. Finally, we 23 | will discuss how various kernels can be used in combination with SVM. 
24 | 25 | First we construct some data that happens to be completely separable 26 | by a hyperplane in two dimensions: 27 | 28 | ```{r include=FALSE} 29 | knitr::opts_chunk$set(fig.width=6, cache=TRUE) 30 | ``` 31 | 32 | ```{r} 33 | set.seed(2) 34 | n <- 100 35 | x1 <- c(rnorm(n),rnorm(n,5)) 36 | x2 <- c(rnorm(n),rnorm(n,5)) 37 | x1 <- scale(x1) 38 | x2 <- scale(x2) 39 | y <- factor(rep(c(-1,1),each=n)) 40 | dat <- data.frame(y,x1,x2) 41 | library(ggplot2) 42 | ggplot(dat, aes(x1,x2,col=y)) + geom_point() 43 | ``` 44 | 45 | Using LDA gives us the following classification boundary. 46 | 47 | 48 | We use code below which generalizes for other cases (including multiple 49 | classes, or non-linear boundaries as we will explore below). As you 50 | can recall, the boundary line for the LDA is determined by the 51 | likelihood ratio where two Gaussian distributions are fit to the data 52 | of the two classes. We draw the center of the Gaussians with a cross: 53 | 54 | ```{r} 55 | library(caret) 56 | x <- data.frame(x1,x2) 57 | lfit <- train(x, y, method="lda") 58 | coefs <- lfit$finalModel$scaling 59 | means <- data.frame(lfit$finalModel$means, y=factor(c(-1,1))) 60 | s <- seq(from=-2,to=2,length=400) 61 | grid <- expand.grid(x1=s,x2=s) 62 | grid$y <- as.numeric(predict(lfit, newdata=grid)) 63 | ggplot(dat, aes(x1,x2,col=y)) + geom_point() + 64 | geom_point(data=means,shape=10,size=10,stroke=2,alpha=.5,show.legend=FALSE) + 65 | geom_contour(data=grid, aes(x1,x2,z=y), breaks=1.5, col="black") 66 | ``` 67 | 68 | We will go ahead a fit a *linear kernel* SVM (to be explained later) 69 | to the same data. We will see a boundary as above, but the slope is a 70 | bit different. 71 | 72 | ```{r} 73 | # for standard SVM usage, do not set this `C` parameter so high 74 | # this will be discussed later when we talk about "soft margin" SVM 75 | tg <- data.frame(C=100) 76 | fit <- train(x, y, method="svmLinear", tuneGrid=tg) 77 | alpha <- fit$finalModel@alpha[[1]] 78 | sv <- as.data.frame(x[fit$finalModel@SVindex,]) # the "support vectors" 79 | sv.y <- 2 * (as.numeric(y[fit$finalModel@SVindex]) - 1.5) 80 | w <- colSums(alpha * sv.y * as.matrix(sv)) 81 | b <- fit$finalModel@b 82 | grid <- expand.grid(x1=s,x2=s) 83 | grid$y.cont <- (as.matrix(grid[,1:2]) %*% w - b)[,1] 84 | ggplot(dat, aes(x1,x2,col=y)) + geom_point() + 85 | geom_point(data=sv, col="black", size=5, shape=21) + 86 | geom_contour(data=grid, aes(x1,x2,z=y.cont), breaks=c(-1,0,1), col="black") 87 | ``` 88 | 89 | The key difference is that, instead of modeling the data as two 90 | Gaussians, the SVM has attempted to put the widest margin between the 91 | two groups of samples. This ends up being equivalent to finding a set 92 | of points which define the boundary between the two groups, and 93 | putting a wide band between those sets of points. The code may not 94 | make much sense now, but it is extracting the key parameters *w* and 95 | *b* which define the following rules: 96 | 97 | $$ w^T x - b = 1 $$ 98 | 99 | $$ w^T x - b = -1 $$ 100 | 101 | Anything on or above the line defined by the first equation will be 102 | classified as +1, while anything on or below the line in the second 103 | equation will be classified as -1. We then draw the lines for 1, 0, 104 | and -1 to show the boundaries and center of the margin dividing the 105 | two groups. The lines pass through a set of data points, these are 106 | called the *support vectors*. 
It is the nature of the constrained 107 | optimization of the SVM that a subset (sometimes small) of the 108 | training dataset ends up defining the decision boundary. 109 | 110 | And just to show how SVM can be used to do more interesting things 111 | than finding a line between two sets of points, we show how by simply 112 | swapping out the *linear kernel* (so whenever we compute the dot 113 | product between two observations), 114 | 115 | $$ K\left(x,x'\right) = \left\langle x, x' \right\rangle, $$ 116 | 117 | for a *radial basis function kernel*, that is, 118 | 119 | $$ K\left(x,x'\right) = \exp\left(-\gamma \left\|x-x'\right\|^2 \right), $$ 120 | 121 | we can use the same SVM routine to find a different set of support 122 | vectors (defining the boundary of points from all sides), and a very 123 | different classification boundary. Again, we will discuss how kernels 124 | are relevant to SVM in a section below. 125 | 126 | ```{r} 127 | rfit <- train(x, y, method="svmRadial") 128 | rsv <- as.data.frame(x[rfit$finalModel@SVindex,]) 129 | grid <- expand.grid(x1=s,x2=s) 130 | grid$y <- predict(rfit, newdata=grid) 131 | grid$yy <- 2*(as.numeric(grid$y) - 1.5) 132 | ggplot(dat, aes(x1,x2,col=y)) + geom_point() + 133 | geom_point(data=rsv, col="black", size=5, shape=21) + 134 | geom_contour(data=grid, aes(x1,x2,z=yy), breaks=0, col="black") + 135 | geom_raster(data=grid, aes(x1,x2,fill=y), alpha=.2) 136 | ``` 137 | 138 | # Motivation behind the SVM solution 139 | 140 | First we will give some motivation to how we solve for *w* and *b* 141 | above. The notes below follow closely 142 | [Andrew Ng's notes on SVM](http://cs229.stanford.edu/summer2020/cs229-notes3.pdf), 143 | which I recommend for more in depth derivation and details on the 144 | algorithms which solve the SVM. 145 | 146 | Again, supposing we have two linearly separable sets of points, 147 | we want to find *w* and *b* so that the data are correctly classified, 148 | that is, $w^T x - b \ge 1$ for all the data with $y=1$ and $w^T x - b \le 149 | -1$ for all the data with $y=-1$. The distance between these two hyperplanes 150 | is given by: 151 | 152 | $$ \frac{(1 + b) - (-1 + b)}{\|w\|} = \frac{2}{\|w\|} $$ 153 | 154 | and so to make the margin as wide as possible corresponds to 155 | minimizing $\|w\|$. The constrained optimization is then: 156 | 157 | $$ 158 | \begin{aligned} 159 | & \underset{w,b}{\text{min}} 160 | & & \|w\| \\ 161 | & \text{s.t.} 162 | & & w^T x_i - b \ge 1 : y_i = 1 \\ 163 | & & & w^T x_i - b \le -1 : y_i = -1 164 | \end{aligned} 165 | $$ 166 | 167 | Note that multiplying both of the constraints by $y_i$ then gives a 168 | cleaner form: 169 | 170 | $$ 171 | \begin{aligned} 172 | & \underset{w,b}{\text{min}} 173 | & & \|w\| \\ 174 | & \text{s.t.} 175 | & & y_i(w^T x_i - b) \ge 1,\quad i=1,\dots,n \\ 176 | \end{aligned} 177 | $$ 178 | 179 | And we can square the norm and multiply by one half to make the 180 | optimization even easier, because we will have a quadratic objective 181 | to minimize, and linear constraints. 182 | 183 | $$ 184 | \begin{aligned} 185 | & \underset{w,b}{\text{min}} 186 | & & \tfrac{1}{2} w^T w \\ 187 | & \text{s.t.} 188 | & & y_i(w^T x_i - b) \ge 1,\quad i=1,\dots,n \\ 189 | \end{aligned} 190 | $$ 191 | 192 | # SVM objective solved with quadratic programming 193 | 194 | We can take the above constrained optimization formulation and 195 | directly plug it into a quadratic programming package to find the 196 | optimal margin for the training data. 
The *quadprog* package in R 197 | offers optimization for problems of the form: 198 | 199 | $$ 200 | \begin{aligned} 201 | & \underset{b}{\text{min}} 202 | & & -d^T b + \tfrac{1}{2} b^T D b \\ 203 | & \text{s.t.} 204 | & & A^T b \ge b_0 \\ 205 | \end{aligned} 206 | $$ 207 | 208 | Unfortunately, they have used a *b* as well as the typically *b* that 209 | is used in the SVM problem. We will refer to their *b* as 210 | $b'$. Nevertheless, we can map our problem into their notation, by 211 | setting $d=0$, $b' = [w,b]$, 212 | $D = \bigl(\begin{smallmatrix}I & 0 \\ 0 & 0\end{smallmatrix}\bigr)$, 213 | $A^T = [y x^1, y x^2, \dots, y x^p, -y]$, and $b_0 = 214 | [1,\dots,1]$. Here I have used $y x^j$ to refer to a column vector 215 | where each $y_i$ is multiplied by sample i's value for the j-th 216 | predictor, $x_i^j$. 217 | 218 | Converting our SVM notation to the notation of *quadprog* gives: 219 | 220 | ```{r} 221 | library(quadprog) 222 | # min_w,b wT w s.t. y_i (w x_i - b) >= 1 223 | # quadprog gives: 224 | # min_b 1/2 bT D b s.t. AT b >= b0 225 | yy <- 2 * (as.numeric(y) - 1.5) # {-1,1} 226 | n <- length(y) 227 | p <- ncol(x) 228 | D <- matrix(0, nrow=p+1, ncol=p+1) 229 | diag(D) <- 1 230 | D[p+1,p+1] <- 1e-6 # ensure D is positive def 231 | d <- numeric(p+1) 232 | AT <- cbind(as.matrix(x), rep(-1, n)) 233 | A <- t(AT * yy) 234 | b0 <- rep(1, n) 235 | wb <- solve.QP(D,d,A,b0) 236 | ``` 237 | 238 | We can then pull out our fitted *w* and *b*, and plot them against the 239 | training data: 240 | 241 | ```{r} 242 | w <- wb$solution[1:p] 243 | b <- wb$solution[p+1] 244 | ggplot(dat, aes(x1,x2,col=y)) + geom_point() + 245 | geom_abline(intercept=(b+1)/w[2],slope=-w[1]/w[2],alpha=.2,linetype=2) + 246 | geom_abline(intercept=(b-1)/w[2],slope=-w[1]/w[2],alpha=.2,linetype=2) + 247 | geom_abline(intercept=b/w[2],slope=-w[1]/w[2],linetype=3) 248 | ``` 249 | 250 | # Non separable case and soft margins 251 | 252 | The SVM would be a very brittle method if it required separability in 253 | all cases, as it would both fail for many datasets, and it wouldn't be 254 | very robust to outliers. Since the constraints are hard, a single 255 | data point could tilt the margin. So, the SVM which is actually used 256 | in practice is not the one defined above, with a "hard margin", but 257 | instead one with a "soft margin", where points in the training set can 258 | cross the margin, and even cross the classification boundary and 259 | therefore be misclassified, with the advantage that the method will 260 | now work on all datasets and be more robust and have lower 261 | generalization error. 262 | 263 | The soft margin SVM is accomplished by softening the constraints, 264 | while adding a penalty for points which are above the margin. The 265 | tradeoff between the main objective of making a wide margin and the 266 | penalty for points crossing the margin will be controlled with a 267 | tuning parameter *C*. The soft margin constrained optimization then 268 | looks like: 269 | 270 | $$ 271 | \begin{aligned} 272 | & \underset{w,b}{\text{min}} 273 | & & \tfrac{1}{2} w^T w + C \sum_{i=1}^n \xi_i \\ 274 | & \text{s.t.} 275 | & & y_i(w^T x_i - b) \ge 1 - \xi_i,\quad i=1,\dots,n \\ 276 | & & & \xi_i \ge 0 \\ 277 | \end{aligned} 278 | $$ 279 | 280 | If all the points can be kept outside the margin and correctly 281 | classified then this means all the $\xi_i = 0$. This can also be 282 | re-formulated as a "hinge loss". 
The above optimization is equivalent 283 | to trying to minimize: 284 | 285 | $$ \tfrac{1}{2} w^T w + C \left( \sum_{i=1}^n \max(0, 1 - y_i (w^T x_i - b) ) \right) $$ 286 | 287 | where the piece inside the large parentheses is equal to $\xi_i$. This 288 | is called a hinge loss, because, again, the $\xi_i = 0$ for those *i* 289 | that are outside of the margin and correctly classified, and only 290 | engages as a loss when a boundary is crossed. You may have noticed 291 | above that we set *C* very large when we ran the linear kernel 292 | SVM. This is so that we would obtain the hard margin classifier. We 293 | can re-run with a more typical value of $C=1$, and notice how the 294 | margin changes. Now more points emerge as support vectors, and some of 295 | them are within the margin. 296 | 297 | ```{r} 298 | fit <- train(x, y, method="svmLinear") 299 | fit$results 300 | alpha <- fit$finalModel@alpha[[1]] 301 | sv <- as.data.frame(x[fit$finalModel@SVindex,]) # the "support vectors" 302 | sv.y <- 2 * (as.numeric(y[fit$finalModel@SVindex]) - 1.5) 303 | w <- colSums(alpha * sv.y * as.matrix(sv)) 304 | b <- fit$finalModel@b 305 | grid <- expand.grid(x1=s,x2=s) 306 | grid$y.cont <- (as.matrix(grid[,1:2]) %*% w - b)[,1] 307 | ggplot(dat, aes(x1,x2,col=y)) + geom_point() + 308 | geom_point(data=sv, col="black", size=5, shape=21) + 309 | geom_contour(data=grid, aes(x1,x2,z=y.cont), breaks=c(-1,0,1), col="black") 310 | ``` 311 | 312 | # Kernel trick 313 | 314 | Finally, we explain what we meant earlier by saying that various 315 | *kernels* can be used with SVM. So far we have mostly looked at 316 | *linear kernel* SVM, except for a sneak peak of a *radial basis 317 | function kernel* where we saw very different behavior of the support 318 | vectors and the classification boundary. To explain how kernels come 319 | into play, we need to show the Lagrangian dual form of the constrained 320 | optimization we have been showing. We will go back to the hard margin 321 | SVM but the logic applies equally to the soft margin as well. 322 | 323 | Our optimization can be written: 324 | 325 | $$ 326 | \begin{aligned} 327 | & \underset{w,b}{\text{min}} 328 | & & \tfrac{1}{2} w^T w \\ 329 | & \text{s.t.} 330 | & & -y_i(w^T x_i - b) + 1 \le 0,\quad i=1,\dots,n \\ 331 | \end{aligned} 332 | $$ 333 | 334 | And the Lagrange function, with multipliers $\alpha_i$ is: 335 | 336 | $$ \mathcal{L}(w,b,\alpha) = \tfrac{1}{2} w^T w - \sum_{i=1}^n \alpha_i \left( y_i (w^T x_i - b) - 1 \right) $$ 337 | 338 | Taking the gradient with respect to *w* and setting equal to zero gives: 339 | 340 | $$ w - \sum_{i=1}^n \alpha_i y_i x_i = 0$$ 341 | 342 | $$ w = \sum_{i=1}^n \alpha_i y_i x_i $$ 343 | 344 | Repeating the same for *b* gives: 345 | 346 | $$ \sum_{i=1}^n \alpha_i y_i = 0 $$ 347 | 348 | Then we can rewrite the Langrange function, using these two equations, as: 349 | 350 | $$ \mathcal{L}(w,b,\alpha) = \tfrac{1}{2} A - A - \sum_{i=1}^n \alpha_i y_i b + \sum_{i=1}^n \alpha_i $$ 351 | 352 | where $A = \sum_{i,j=1}^n y_i y_j \alpha_i \alpha_j x_i^T x_j$. 
Re-arranging this gives what is typically shown 353 | for the Lagrangian 354 | dual of the SVM: 355 | 356 | $$ 357 | \begin{aligned} 358 | & \underset{\alpha}{\text{max}} 359 | & & \sum_{i=1}^n \alpha_i - \tfrac{1}{2} \sum_{i,j=1}^n y_i y_j \alpha_i \alpha_j \langle x_i, x_j \rangle \\ 360 | & \text{s.t.} 361 | & & \alpha_i \ge 0, \quad i=1,\dots,n \\ 362 | & & & \sum_{i=1}^n \alpha_i y_i = 0 363 | \end{aligned} 364 | $$ 365 | 366 | One note while looking at the first line of the Lagrangian dual: 367 | 368 | $$ \max_\alpha \sum_{i=1}^n \alpha_i - \tfrac{1}{2} \sum_{i,j=1}^n y_i y_j \alpha_i \alpha_j \langle x_i, x_j \rangle $$ 369 | 370 | We want to maximize this quantity, which has a $-\tfrac{1}{2}$ and then 371 | a dot product between all $x_i$ and $x_j$. If sample *i* and *j* are 372 | in the same class, then $y_i y_j$ will be positive. If they are near 373 | each other, then the dot product will be large. For this reason, for 374 | many of the samples in the interior of a group of samples, the 375 | maximization will "want" to set $\alpha_i, \alpha_j = 0$. This is some 376 | loose motivation for why we will end up with a sparse solution, where 377 | only a few of the $\alpha_i$ are non-zero, and these will be the 378 | *support vectors* that define the hyperplane. 379 | 380 | As we have defined $w = \sum_{i=1}^n \alpha_i y_i x_i$, once we 381 | have fit a model to the training data, calculating the label for a new 382 | data point $x'$ involves calculating: 383 | 384 | $$ w^T x' - b = \sum_{i=1}^n \alpha_i y_i \langle x_i, x' \rangle - b, $$ 385 | 386 | but since most of the $\alpha_i$ will be equal to zero, we only need 387 | to perform the dot product for the support vectors. This is one of the 388 | efficiencies that allows SVM good performance for large, high 389 | dimensional datasets. 390 | 391 | We showed that both finding the solution to the SVM on the training 392 | data, and calculating the label for a new point, involves a dot 393 | product $\langle x_i, x_j \rangle$ for all the samples *i*, *j* in the 394 | dataset. This observation motivates the following trick: if we wanted 395 | to work in a transformed space on the original *x*, we could apply a 396 | function *f* throughout all of our equations involving *x*. However, we 397 | just saw that the solution and prediction of labels for new points 398 | requires only $\langle x_i, x_j \rangle$, and so working in the 399 | transformed space implies replacing this with $\langle f(x_i), f(x_j) 400 | \rangle$. This is what is referred to as a kernel, and our 401 | transformation could be written: 402 | 403 | $$ K(x_i, x_j) = \langle f(x_i), f(x_j) \rangle $$ 404 | 405 | This property of the SVM both allows arbitrary transformations, and 406 | means that we can work in arbitrarily large spaces, as long as we can 407 | calculate $K(x_i, x_j)$. We can avoid actually calculating $x_i$ or 408 | $f(x_i)$. To give an example, suppose we are interested in classifying 409 | documents that contain words. We could count each word as a feature, 410 | and give a 1 if the document contains the word, or a 0 if the document 411 | does not contain a word. This is a very high dimensional space, if we 412 | start to enumerate all the words in a corpus of documents. However, we 413 | only need the dot product between two documents, which amounts to 414 | counting the number of words that appear in common, and we never need 415 | to enumerate the $x_i$ themselves.
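To make this concrete, here is a toy sketch of such a kernel (the two documents and the function name `doc_kernel` are made up): each document is reduced to its set of words, and the "dot product" is just the number of words in common, so the enormous 0/1 feature vectors never have to be built:

```{r}
# toy "shared words" kernel between two documents
doc_kernel <- function(d1, d2) {
  w1 <- unique(strsplit(tolower(d1), "\\s+")[[1]])
  w2 <- unique(strsplit(tolower(d2), "\\s+")[[1]])
  length(intersect(w1, w2))
}
doc_kernel("support vector machines are fun",
           "support vector regression machines")
```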
416 | 417 | The radial basis function kernel we used above gives a high value for 418 | $K(x_i, x_j)$ if the points are close together, and a value that 419 | quickly drops off to zero if the points are not close. 420 | 421 | 422 | 423 | 424 | 425 | # Extensions to regression 426 | 427 | We won't cover these, but I wanted to provide pointers to literature 428 | which extends the SVM model to predict continuous covariates. Two such 429 | references are: 430 | 431 | * Drucker et al. [Support Vector Regression Machines](https://papers.nips.cc/paper/1238-support-vector-regression-machines) (1996) 432 | * Suykens and Vandewalle [Least Squares Support Vector Machine Classifiers](https://link.springer.com/content/pdf/10.1023/A:1018628609742.pdf) (1999) 433 | 434 | These can be accessed within *caret* by selecting one of the 435 | [SVM models](https://topepo.github.io/caret/train-models-by-tag.html#support-vector-machines) 436 | with a *regression* tag. 437 | 438 | # Additional reference for these notes 439 | 440 | * [Andrew Ng's notes on SVM](http://cs229.stanford.edu/summer2020/cs229-notes3.pdf) 441 | 442 | -------------------------------------------------------------------------------- /ml/svm_HW12.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 12 - Support Vector Machines" 3 | author: "your name here" 4 | date: "`r format(Sys.time(), '%m/%d/%Y')`" 5 | output: html_document 6 | --- 7 | 8 | # Credit card dataset 9 | 10 | We will be working in this homework with a popular machine learning 11 | dataset about credit card fraud. Go to the following link and download 12 | the CSV: 13 | 14 | 15 | 16 | The data description is 17 | 18 | > The datasets contains transactions made by credit cards in September 19 | > 2013 by european cardholders. This dataset present transactions that 20 | > occurred in two days, where we have 492 frauds out of 284,807 21 | > transactions. The dataset is highly unbalanced, the positive class 22 | > (frauds) account for 0.172% of all transactions. 23 | 24 | Now we begin by reading into R: 25 | 26 | ```{r} 27 | 28 | library(readr) 29 | z <- read_csv("phpKo8OWT.csv") 30 | dim(z) 31 | table(z$Class) 32 | y <- gsub("\\'","",z$Class) 33 | x <- as.data.frame(z[,-31]) 34 | ``` 35 | 36 | We will deal with the class imbalance for this homework just by 37 | downsampling the non-fraud cases. As we saw in the random forest 38 | homework there are other approaches including custom cost functions. 39 | 40 | ```{r} 41 | set.seed(1) 42 | idx <- c(sample(which(y == "0"), sum(y == "1")), which(y == "1")) 43 | y <- y[idx] 44 | x <- as.data.frame(scale(x[idx,])) 45 | table(y) 46 | ``` 47 | 48 | The homework assignment is to run linear and radial basis function SVM 49 | on the dataset, and report the Kappa for both models. For RBF, you 50 | should plot the Kappa over the different values for the cost 51 | function (`metric="Kappa"`). 52 | 53 | Now, suppose we want to examine plots of the decision boundary in the 54 | feature space. We can only look at two features at a time in a scatter 55 | plot. What are the two most important variables for the SVMs (they are 56 | the same for both SVMs)? 57 | 58 | Make a scatterplot for each method that includes: the data points in this two 59 | dimensional space, colored by the "0" and "1" prediction, and the decision 60 | boundary. In class, we simply used `expand.grid` to build the 61 | `newdata` that was fed to `predict`. 
Start with this approach, using a 62 | grid of 40 points from -4 to 4 for the two most important variables, 63 | but before you attempt to run `predict` (which would give an error), read further: 64 | 65 | In this case, we have to worry about the other 30 - 2 = 28 66 | variables. If we put in 0's, this would not be typical observations, 67 | and we will get strange results. 68 | 69 | Instead, you should put `NA` for the other variables, and use 70 | `preProcess` with KNN imputation (alone, don't re-scale), to impute 71 | the other values. Then use this data to run `predict` and define the 72 | decision boundary. This is a simpler approach compared to the 73 | integration approach taken by `plot.gbm` to produce marginal plots 74 | that we saw when we looked at boosting, but it is sufficient to get a 75 | sense of the decision boundary in 2D for "typical" values of the other 76 | covariates. 77 | 78 | Do you see a big difference in the decision boundary for linear vs RBF 79 | SVM? 80 | -------------------------------------------------------------------------------- /numint/alzheimers.dat: -------------------------------------------------------------------------------- 1 | subject month words 2 | 1 1 9 3 | 1 2 12 4 | 1 3 16 5 | 1 4 17 6 | 1 5 18 7 | 2 1 6 8 | 2 2 7 9 | 2 3 10 10 | 2 4 15 11 | 2 5 16 12 | 3 1 13 13 | 3 2 18 14 | 3 3 14 15 | 3 4 21 16 | 3 5 21 17 | 4 1 9 18 | 4 2 10 19 | 4 3 12 20 | 4 4 14 21 | 4 5 15 22 | 5 1 6 23 | 5 2 7 24 | 5 3 8 25 | 5 4 9 26 | 5 5 12 27 | 6 1 11 28 | 6 2 11 29 | 6 3 12 30 | 6 4 14 31 | 6 5 16 32 | 7 1 7 33 | 7 2 10 34 | 7 3 11 35 | 7 4 12 36 | 7 5 14 37 | 8 1 8 38 | 8 2 18 39 | 8 3 19 40 | 8 4 19 41 | 8 5 22 42 | 9 1 3 43 | 9 2 3 44 | 9 3 3 45 | 9 4 7 46 | 9 5 8 47 | 10 1 4 48 | 10 2 10 49 | 10 3 11 50 | 10 4 17 51 | 10 5 18 52 | 11 1 11 53 | 11 2 10 54 | 11 3 10 55 | 11 4 15 56 | 11 5 16 57 | 12 1 1 58 | 12 2 3 59 | 12 3 2 60 | 12 4 4 61 | 12 5 5 62 | 13 1 6 63 | 13 2 7 64 | 13 3 7 65 | 13 4 9 66 | 13 5 10 67 | 14 1 0 68 | 14 2 3 69 | 14 3 3 70 | 14 4 4 71 | 14 5 6 72 | 15 1 18 73 | 15 2 18 74 | 15 3 19 75 | 15 4 22 76 | 15 5 22 77 | 16 1 15 78 | 16 2 15 79 | 16 3 15 80 | 16 4 18 81 | 16 5 19 82 | 17 1 10 83 | 17 2 14 84 | 17 3 16 85 | 17 4 17 86 | 17 5 19 87 | 18 1 6 88 | 18 2 6 89 | 18 3 7 90 | 18 4 9 91 | 18 5 10 92 | 19 1 9 93 | 19 2 9 94 | 19 3 13 95 | 19 4 16 96 | 19 5 20 97 | 20 1 4 98 | 20 2 3 99 | 20 3 4 100 | 20 4 7 101 | 20 5 9 102 | 21 1 4 103 | 21 2 13 104 | 21 3 13 105 | 21 4 16 106 | 21 5 19 107 | 22 1 10 108 | 22 2 11 109 | 22 3 13 110 | 22 4 17 111 | 22 5 21 112 | -------------------------------------------------------------------------------- /numint/chap6figrules.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biodatascience/statcomp_src/558195d8bb181b582263678914107f12872a4443/numint/chap6figrules.jpg -------------------------------------------------------------------------------- /numint/numint_HW7.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HW 7 - Numerical Integration" 3 | author: "Naim Rashid" 4 | date: "2/20/2019" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # Maximization of poisson GLMM from lecture 13 | 14 | Now that we have discussed several approaches for numerical integration, lets now maximize the model given in lecture. 
You may choose any maximization approach, as well as any numerical integration procedure from lecture, to obtain the MLE's for $\boldsymbol{\beta}$ and $\sigma_{\gamma}^2$. 15 | 16 | Hint: You should evaluate a number of intervals/nodes, decreasing the convergence threshold, etc and evaluate its impact before reporting your final result. We have shown how to perform, for example, AGQ and IS to obtain the likelihood pertaining to the first subject from class. 17 | 18 | ```{r} 19 | ## Solution: place relevant helper functions pertaining to integration here 20 | 21 | ## End Solution 22 | 23 | 24 | 25 | ## Solution: place relevant helper functions pertaining to maximization here (likelihood etc) 26 | 27 | ## End Solution 28 | 29 | 30 | 31 | ## Solution: place primary code for maximization here, calling functions in the above two sections 32 | ## Remember to print your primary results and use the following starting values 33 | beta = c(1.804, 0.165) 34 | s2gamma = 0.000225 35 | alz = read.table("alzheimers.dat", header = T) 36 | 37 | 38 | ## End Solution 39 | 40 | 41 | 42 | ``` 43 | 44 | # Plot 45 | 46 | Now, plot the fitted line from the fitted GLMM on the spaghetti plot from lecture 47 | 48 | ```{r} 49 | ## solution 50 | 51 | ## end solution 52 | ``` 53 | -------------------------------------------------------------------------------- /optim/chap2figgoodnewt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biodatascience/statcomp_src/558195d8bb181b582263678914107f12872a4443/optim/chap2figgoodnewt.jpg -------------------------------------------------------------------------------- /optim/chap2figgoodnewt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biodatascience/statcomp_src/558195d8bb181b582263678914107f12872a4443/optim/chap2figgoodnewt.png -------------------------------------------------------------------------------- /optim/optim_HW5.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 5 - Optimization" 3 | author: "Naim Rashid" 4 | date: "2/7/2019" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # Question 1: Simple Univariate Optimization 13 | 14 | Use Newton-Raphson to maximize the following function: $$f(x) = 7\log(x) + 3\log(1-x).$$ Fill in the R functions below pertaining to $f(x)$, $f'(x)$ and $f''(x)$. Utilize the values below to initialize and run the algorithm (analogous to how they were defined in class). Repeat the method for several starting values for $x$ given below, picking the best $\hat{x}$ in terms of the final objective function value for each. Make sure you print your final value for $\hat{x}$ 15 | 16 | ```{r} 17 | # f(x) 18 | f = function(x){ 19 | ## solution 20 | 21 | ## end solution 22 | } 23 | 24 | # first derivative 25 | f1 = function(x){ 26 | ## solution 27 | 28 | ## end solution 29 | } 30 | 31 | # second derivative 32 | f2 = function(x){ 33 | ## solution 34 | 35 | ## end solution 36 | } 37 | 38 | # to start the model 39 | tol = 10^-4 40 | x = 0.01 # also try 0.5 and 0.99 41 | maxit = 50 42 | iter = 0 43 | eps = Inf 44 | 45 | ## solution 46 | 47 | ## end solution 48 | ``` 49 | 50 | Bonus: $f(x)$ pertains to the likelihood/PDF for which distribution(s)? Two answers are possible. Given this, what would be a closed form estimate for $\hat{x}$? 
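For reference, the Newton-Raphson update that both questions rely on is just the following loop. This is a minimal sketch, assuming `f`, `f1`, and `f2` have been filled in above and using the change in the objective as one possible convergence criterion; it is not a complete solution:

```{r, eval = F}
# generic univariate Newton-Raphson iteration
while (eps > tol && iter < maxit) {
  x_new <- x - f1(x) / f2(x)      # NR step
  eps   <- abs(f(x_new) - f(x))   # change in the objective
  x     <- x_new
  iter  <- iter + 1
}
```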
51 | 52 | 53 | # Question 2: Not So Simple Univariate Optimization 54 | 55 | Repeat question 1 for the function below, using the same template for your answer. Choose a range of starting values between 0 and 5 and report the best $\hat{x}$ based on the final objective function value: 56 | 57 | $$f(x) = 1.95 - e^{-2/x} - 2e^{-x^4}.$$ 58 | 59 | ```{r} 60 | # f(x) 61 | f = function(x){ 62 | ## solution 63 | 64 | ## end solution 65 | } 66 | 67 | # first derivative 68 | f1 = function(x){ 69 | ## solution 70 | 71 | ## end solution 72 | } 73 | 74 | # second derivative 75 | f2 = function(x){ 76 | ## solution 77 | 78 | ## end solution 79 | } 80 | 81 | # to start the model 82 | tol = 10^-4 83 | x = 1.2 # also try other starting values between 0 and 5 84 | maxit = 50 85 | iter = 0 86 | eps = Inf 87 | 88 | ## solution 89 | ## end solution 90 | ``` 91 | 92 | What does this say about the stability of NR, especially in this case? Plot the second derivative of this function and comment on why or why not this supports your observations. 93 | 94 | ```{r} 95 | ## solution for plot 96 | 97 | ## end solution 98 | ``` 99 | 100 | 101 | ## Multivariate optimization: Zero-inflated Poisson 102 | 103 | Following a chemical spill from a local industrial plant, the city government is attempting to characterize the impact of the spill on recreational fishing at a nearby lake. Over the course of one month following the spill, park rangers asked each adult leaving the park how many fish they had caught that day. At the end of the month they had collected responses for 4,075 individuals, where the number of fish caught for each individual is summarized in the table below: 104 | 105 | ```{r, echo=F} 106 | library(knitr) 107 | kable(matrix(c(3062, 587, 284, 103, 33, 4, 2), nrow = 1, byrow = T), col.names = as.character(0:6), format = "html", table.attr = "style='width:30%;'") 108 | ``` 109 | 110 | Based on an initial examination of this distribution, there appears to be many more zeros than expected. Upon speaking to the park rangers, they were embarrassed to state that after recording the number of fish each adult had caught, they did not ask or record whether each adult had gone to the park with the intention of fishing in the first place. 111 | 112 | The city statistician surmised that the observed distribution in the table above may be the result of a mixture of two populations of subjects. The first population pertains to the subset of visitors that arrived without any intention of fishing (exactly zero fish caught by each member of this population). The second population pertains to the set of visitors that arrived with the intention of fishing (0 or more fish potentially caught in this population). Therefore, if we fit a standard Poisson model to this data, the estimate for $\lambda$ would be biased downwards. 113 | 114 | To account for the excess zeros in the observed data, the statistician decided to fit a zero-inflated Poisson model.
To simplify things the log likelihood is given below, and we utilized the tabulated values from the table above in place of individual observations: 115 | 116 | $$ 117 | \mathcal{l}(\boldsymbol{\theta}) = n_0\log(\pi + (1-\pi)e^{-\lambda}) + (N-n_0)[\log(1-\pi) - \lambda] + \sum_{i=1}^{\infty}in_i\log(\lambda) 118 | $$ 119 | 120 | where $\boldsymbol{\theta} = (\pi, \lambda)$, $n_i$ pertains to the number of individuals that caught $i$ fish ($n_i=0$ for $i>6$ here), $N = 4075$, $\pi$ is the probability that an individual did not show up to fish, and $\lambda$ is the mean number of fish caught by those individuals that intended to fish at the park. From this log-likelihood, we can see that for the individuals that caught 0 fish, we assume those observations come from a mixture of individuals that did not show up to fish ($\pi$ proportion of the time) and those that showed up to fish but did not catch anything ($1-\pi$ proportion of the time with 0 for their catch count). The remaining individuals that caught more than zero fish are also in the log likelihood and are similarly weighted by $(1-\pi)$. 121 | 122 | 123 | Lets go ahead and fit this model with Newton-Raphson. Similar to the previous problems, fill in the code below to fit this model. Then, fill in the function for the log likelihood and its 1st derivative $$\left(\frac{\partial\mathcal{l}(\boldsymbol{\theta})}{\partial\pi}, \frac{\partial\mathcal{l}(\boldsymbol{\theta})}{\partial\lambda}\right)$$ and second derivative matrix 124 | 125 | $$\left[\begin{array} 126 | {rr} 127 | \frac{\partial^2\mathcal{l}(\boldsymbol{\theta})}{\partial\pi^2} & \frac{\partial^2\mathcal{l}(\boldsymbol{\theta})}{\partial\pi \partial\lambda} \\ 128 | \frac{\partial^2\mathcal{l}(\boldsymbol{\theta})}{\partial\lambda \partial\pi } & \frac{\partial^2\mathcal{l}(\boldsymbol{\theta})}{\partial\lambda^2} 129 | \end{array}\right] $$ 130 | 131 | Given the estimates, interpret the results. Accounting for the excess zeros, on average how many fish were caught among those who intended to go fishing? What proportion of subjects did not even consider to fish? 132 | 133 | 134 | ```{r} 135 | # f(x) 136 | logLik = function(theta, y, ny){ 137 | ## solution 138 | # returns scalar 139 | 140 | ## end solution 141 | } 142 | 143 | # first derivative 144 | f1 = function(theta,y, ny){ 145 | ## solution 146 | # returns 2x1 vector 147 | 148 | ## end solution 149 | } 150 | 151 | # second derivative 152 | f2 = function(theta,y, ny){ 153 | ## solution 154 | # returns 2x2 matrix 155 | 156 | ## end solution 157 | } 158 | 159 | # data 160 | y = 0:6 161 | ny = c(3062, 587, 284, 103, 33, 4, 2) 162 | 163 | # to start the model 164 | tol = 10^-4 165 | theta = c( 166 | ny[y==0]/sum(ny), # initial value for pi, prop of total 0's 167 | sum(ny*y)/sum(ny) # intial value for lambda, mean of y's 168 | ) 169 | maxit = 50 170 | iter = 0 171 | eps = Inf 172 | 173 | ## solution 174 | 175 | ## end solution 176 | ``` 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | -------------------------------------------------------------------------------- /rpkg/build.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Building R packages" 3 | author: "Michael Love" 4 | date: 10/25/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Why put your code into a package? 
12 | 13 | There are two compelling reasons to move your code from a set of 14 | scripts to a package: 15 | 16 | * Versioning 17 | * Documentation 18 | 19 | During a project, you may begin to accumulate a large code base, you 20 | can get by organizing functions into files, and using `source`, for 21 | example: 22 | 23 | ```{r eval=FALSE} 24 | # hypothetical example: 25 | load("data.rda") 26 | source("normalize_functions.R") 27 | source("EM_functions.R") 28 | source("plot_functions.R") 29 | ``` 30 | 31 | This may be sufficient for a while, but it is not sufficient for 32 | sharing the code with others, either for reproducibility or for having 33 | others use your methods. Using `source` as above doesn't let you or 34 | someone else know which *version* of the functions are being used. The 35 | way to do this in R is to put the R scripts into a *package structure* 36 | which has a version attached to it. The version is simply recorded as 37 | a line in a text file, which we will describe in these notes. 38 | 39 | A potential user also cannot find out about the purpose of the 40 | functions, details about the arguments or output of the function if 41 | they are simply brought into R with `source`. You could provide 42 | comments in the R files, but these comments are not accessible from 43 | the command line while a potential user is about to use your 44 | function. Help for R functions is provided in a specifically formatted 45 | file (ending with `.Rd`) that goes into the `man` directory of an R 46 | package. I strongly recommend to not write these files by hand but 47 | instead use in-line documentation called *roxygen* and help functions 48 | from the *devtools* package to automatically create these help 49 | files. This will be covered in detail in another lecture note. 50 | 51 | Two additional references for building R packages are: 52 | 53 | * [R Packages](http://r-pkgs.had.co.nz/) by Hadley Wickham (currently 54 | out of date with respect to the addition of the *usethis* package) 55 | * [Writing R Extensions](https://cran.r-project.org/doc/manuals/R-exts.html) 56 | the official guide from CRAN on how to create an R package 57 | 58 | # Minimal package skeleton 59 | 60 | The bare minimum for an R package is a directory with the following: 61 | 62 | * a `DESCRIPTION` text file, which gives information about the package 63 | * a `NAMESPACE` text file, which tells which functions are imported or 64 | exported by the package (best written automatically, not manually) 65 | * a directory called `R` with the R scripts inside 66 | 67 | We can get started by using the `create_package()` function from the *usethis* 68 | package. Suppose we have an R script with the following function: 69 | 70 | ```{r} 71 | add <- function(x,y,negative=FALSE) { 72 | z <- x + y 73 | if (negative) { 74 | z <- -1 * z 75 | } 76 | z 77 | } 78 | ``` 79 | 80 | We will assume that we are running the following code on the class virtual machine, which uses R version 4.1.2. Let's say we will create an R package called *foo*. We can start with 81 | the following: 82 | 83 | ```{r eval=FALSE} 84 | library(usethis) 85 | create_package("foo", roxygen=FALSE) 86 | ``` 87 | 88 | This will make a directory called `foo` in you *current working 89 | directory*, and will then will open a new Rstudio instance where the working directory to `foo`. For now, we set Roxygen equal to FALSE since we have not covered Roxygen yet. This option allows for automatic exporting of functions to the namespace when FALSE. 
If set to TRUE (default), the function assumes you will be using Roxygen to document functions and will export functions in accordance to your Roxygen documentation, leaving the NAMESPACE file blank upon creation. 90 | 91 | In your current working directory, we can print the contents of the foo directory 92 | 93 | ```{r, eval = F} 94 | list.files("foo") 95 | ``` 96 | 97 | 98 | You should see the following: 99 | 100 | ``` 101 | [1] "DESCRIPTION" "foo.Rproj" "NAMESPACE" "R" 102 | ``` 103 | 104 | It will start with a DESCRIPTION file that looks like: 105 | 106 | ``` 107 | Package: foo 108 | Title: What the Package Does (One Line, Title Case) 109 | Version: 0.0.0.9000 110 | Authors@R: 111 | person("First", "Last", , "first.last@example.com", role = c("aut", "cre"), 112 | comment = c(ORCID = "YOUR-ORCID-ID")) 113 | Description: What the package does (one paragraph). 114 | License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a 115 | license 116 | Encoding: UTF-8 117 | ``` 118 | 119 | The DESCRIPTION file has a number of fields that we need to 120 | populate. We may start editing this file: 121 | 122 | ``` 123 | Package: foo 124 | Title: Functions for doing great things 125 | Version: 0.0.1 126 | Authors@R: person("Jane","Doe",email="jdoe@uni.edu",role=c("aut","cre")) 127 | Description: Contains amazing functions for doing amazing things. 128 | License: GPL-2 129 | Encoding: UTF-8 130 | ``` 131 | 132 | There are a number of licenses that are possible and these are 133 | important to read about and consider before releasing code. Here is a 134 | list of licenses that are in use for R: 135 | 136 | 137 | 138 | R itself is licensed under `GPL-2 | GPL-3`. 139 | 140 | If we simply move the R script above into the `R` directory, we are 141 | done with the bare minimum for our first R package, because the 142 | `NAMESPACE` file already says to export all functions that do not 143 | begin with a period: 144 | 145 | ``` 146 | exportPattern("^[^\\.]") 147 | ``` 148 | 149 | A complication is if we want to use functions from another package in 150 | our package. The way to do this is to *import* or *depend* on other 151 | packages. The easiest way to handle importing functions from another package 152 | is through the *roxygen* documentation format, so we will hold off on 153 | discussing this in the next lecture note. We have some description of 154 | the different kinds of package dependencies at the bottom of this 155 | lecture note. 156 | 157 | We can build a shareable package "tarball" using the `build` function: 158 | 159 | ```{r eval=FALSE} 160 | library(devtools) 161 | build("foo") 162 | ``` 163 | 164 | Alternatively, this can be done on the command line, from one 165 | directory above the package directory, with `R CMD build foo`. 166 | 167 | The `build` command prints out the following message and returns the 168 | location of the package tarball. 169 | 170 | ``` 171 | √ checking for file 'C:\statcomp_src\foo/DESCRIPTION' (2.3s) 172 | - preparing 'foo': 173 | √ checking DESCRIPTION meta-information 174 | - checking for LF line-endings in source and make files and shell scripts 175 | - checking for empty or unneeded directories 176 | - building 'foo_0.0.1.tar.gz' 177 | 178 | [1] "C:/statcomp_src/foo_0.0.1.tar.gz" 179 | ``` 180 | 181 | Now if we want to share our function `add` with a collaborator, we can 182 | send them the file `foo_0.0.1.tar.gz`. 183 | 184 | # Version numbers 185 | 186 | The most important thing about version numbers is that they are 187 | *free*. 
You should "bump" the version number any time you make a 188 | change that you will push out the world (e.g. anytime you push your 189 | changes to GitHub or other repository). Even if you are working 190 | entirely by yourself, it is useful to be able to pinpoint differences 191 | in code output based on the version string (which will appear if you 192 | append the session information to the output of every script or 193 | Rmarkdown file you run). 194 | 195 | There are three parts to most R package version numbers: 196 | 197 | `x.y.z` 198 | 199 | Roughly: 200 | 201 | * The "x" part is for major releases 202 | * The "y" part is for minor releases 203 | * The "z" part is for any change to the package 204 | 205 | In *Bioconductor*, there is some additional information and restrictions 206 | on these numbers. "y" is even for release versions and odd for 207 | development versions. They have additional information on the system 208 | for [Bioconductor package version numbers](http://bioconductor.org/developers/how-to/version-numbering/). 209 | 210 | The first time a package is started, you may choose `0.0.1` and then 211 | add 1 to the "z" digit every time you make a change to the package 212 | code. It would be typical to use `1.0.0` when you first "release" the 213 | package, which you would do yourself if you are planning to host on 214 | GitHub or on CRAN. If you still consider the package a working "beta", 215 | then you might only use `0.1.0`, until you feel it is ready for 216 | `1.0.0`. For Bioconductor packages, you would submit 217 | `0.99.z` to the Bioconductor repository, and the Bioconductor machines 218 | would increment to `1.0.0` on your behalf for the first release. 219 | There is some additional information about version numbers and 220 | releasing a package from [Hadley Wickham](http://r-pkgs.had.co.nz/release.html). 221 | 222 | In R, package versions are a special class and you can compare package 223 | versions like so: 224 | 225 | ```{r} 226 | packageVersion("stats") 227 | packageVersion("stats") >= "3.0.0" 228 | ``` 229 | 230 | If you want a R script to produce an error if the package is out of 231 | data you can use: 232 | 233 | ```{r} 234 | stopifnot(packageVersion("stats") >= "3.0.0") 235 | ``` 236 | 237 | For an R package, if you need a specific version of another package, 238 | you should include a string in parentheses after its name in the 239 | Depends, Imports or Suggests field, for example: 240 | 241 | ``` 242 | Imports: foo (>= 1.2.0) 243 | ``` 244 | 245 | We will discuss these fields a bit more in the documentation notes. 246 | 247 | # Loading and sharing packages 248 | 249 | By far the easiest way to load the package into R while you are 250 | developing it is to use the `load_all` function from the *devtools* 251 | package. This does not require the creation of the package tarball 252 | with the `.tar.gz` ending, but simply mimics an installation of the 253 | package. From within the package you can simply call `load_all()`, or 254 | you can also specify the path to the package: 255 | 256 | ```{r eval=FALSE} 257 | load_all("/path/to/foo") 258 | ``` 259 | 260 | You also bypass having to call `library(foo)` as the package will 261 | already be loaded. 262 | 263 | Another way to load the package is the standard `install.packages` 264 | function but specifying that the package tarball is a local file, not 265 | a name of a package in a remote repository. This then requires an 266 | explicit `library(foo)` call afterward. 
267 | 268 | ```{r eval=FALSE} 269 | install.packages("foo_0.0.1.tar.gz", repos=NULL) 270 | ``` 271 | 272 | You may need to restart your R session, or try this in another R session, if you have already loaded the foo package in certain versions of R. Another easy way to share your package is to put all of the files into a GitHub repository. Then others can install the package on their 273 | machines simply with `install_github("username/foo")` using the 274 | *devtools* package. Again, this requires a `library(foo)` call 275 | afterward to load the package. 276 | 277 | We still haven't written any documentation, so when we try to load 278 | help we are told it is missing: 279 | 280 | ```{r, eval = F} 281 | library(foo) 282 | help(topic="add", package="foo") 283 | ``` 284 | 285 | Either through using `load_all` or installing and then loading with 286 | `library`, we can now use our function: 287 | 288 | ```{r} 289 | add(3,4) 290 | add(3,4,negative=TRUE) 291 | ``` 292 | 293 | And for reproducibility's sake, we can ask about the package version: 294 | 295 | ```{r} 296 | packageVersion("foo") 297 | ``` 298 | 299 | And we can include the following at the end of scripts to include all 300 | package versions (and other important details): 301 | 302 | ```{r} 303 | library(devtools) 304 | session_info() 305 | ``` 306 | 307 | # Types of package dependencies 308 | 309 | There are three main types of formal package dependencies in R: 310 | 311 | * Depends 312 | * Imports 313 | * Suggests 314 | 315 | Each of these can be listed in the `DESCRIPTION` file. We will show in 316 | the documentation lecture note how to import specific functions from 317 | other packages, and how to add these packages to the `Imports:` line 318 | in the `DESCRIPTION` file. I recommend to import specific functions 319 | from other packages, rather than the entire package, but it is also 320 | possible to import the entire package. Note that the packages that 321 | you import functions from are not *attached to the search path* when 322 | you load your package with `library(foo)`. This means that, even if 323 | you import one or more functions from another package within your 324 | package, those functions are not available to the user unless they 325 | also load the package with another call to `library`. 326 | 327 | The `Suggests:` field of the `DESCRIPTION` file is for packages which 328 | are not absolutely required for your package to work, but which your 329 | package can make use of, or which are used in examples or the vignette 330 | of your package. I recommend to list a package under Suggests also in 331 | the case that you have a function which can make use of another 332 | package, but has a fallback implementation which does not require the 333 | other package. Or if there is no fallback, the function should explain 334 | that it cannot be run because the other package is not available. This 335 | testing to see if the other package is available can be performed using 336 | `requireNamespace("cool", quietly=TRUE)`, which will return `TRUE` if the 337 | package is installed and `FALSE` if not. You can then use the function 338 | from the other package with `::` within your package, for example: 339 | `cool::fancy` can be used to call the `fancy` function from the *cool* 340 | package. 341 | 342 | This leaves Depends to be explained.
If you read Hadley Wickham's 343 | R Package reference and particular the section on 344 | [Namespace](http://r-pkgs.had.co.nz/namespace.html), you'll see that 345 | he does not recommend the use of Depends at all. I agree with his 346 | arguments laid out there and simply quote the key sentences here: 347 | 348 | > The main difference is that where Imports just loads the package, 349 | > Depends attaches it. There are no other differences. 350 | > ... Unless there is a good reason otherwise, you 351 | > should always list packages in Imports not Depends. That’s because a 352 | > good package is self-contained, and minimises changes to the global 353 | > environment (including the search path). 354 | 355 | # Package checking 356 | 357 | R has a very useful, but also very picky checking software associated 358 | with it. It can be run with the `check` function from the *devtools* 359 | package, or by running `R CMD check foo_0.0.1.tar.gz` from the command 360 | line. Passing all of the R package checks is a good idea if you plan 361 | to share your code with others. In addition, there may be 362 | [other checks](https://bioconductor.org/packages/devel/bioc/vignettes/BiocCheck/inst/doc/BiocCheck.html) 363 | that you will need to pass if you want to submit to Bioconductor. We 364 | won't cover R package checking, as we only have limited time, but the 365 | reference on [R Packages](http://r-pkgs.had.co.nz/check.html) from 366 | Hadley Wickham discusses this. 367 | 368 | -------------------------------------------------------------------------------- /rpkg/debug.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Debugging in R" 3 | author: "Michael Love" 4 | date: 10/25/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Many approaches to debug code in R 12 | 13 | There are a number of different ways to do debugging of R code, but as 14 | with documentation and testing, I will show here what I think is the 15 | simplest way to debug a complex situation. Suppose we have a function 16 | that is called in the middle of a larger operation and we want to 17 | inspect its arguments and step through what is happening. My preferred 18 | method for this is to insert the `browser()` command into the function 19 | at a relevant level, and then to run the function in the setting that 20 | produces the behavior I want to investigate. Note that this approach 21 | can also be used in general, not just for debugging, but for when 22 | introducing new features that may alter core components of your 23 | current package infrastructure. 24 | 25 | If I want to investigate an error or behavior in a package that I do 26 | not maintain, I will typically obtain the latest source code for the 27 | package, and apply this same procedure of inserting `browser()` at 28 | critical points that I want to investigate. `load_all()` can be run to 29 | load the version of the package that contains the `browser()` call 30 | instead of the original package source. 31 | 32 | I will now create a series of inter-dependent functions, so that we 33 | can examine how different strategies of debugging look in R. 
34 | 35 | ```{r} 36 | # a high level function 37 | high <- function(obj) { 38 | obj.sub <- subset(obj) 39 | obj.sub <- analyze(obj.sub) 40 | return(obj.sub) 41 | } 42 | # some subsetting operation 43 | subset <- function(obj) { 44 | # every other element 45 | idx <- seq_len(length(obj$x)/2)*2 - 1 46 | list(x=obj$x[idx], 47 | y=obj$y[idx], 48 | foo=obj$foo) 49 | } 50 | # another sub-function 51 | analyze <- function(obj) { 52 | if (obj$foo == "no") { 53 | z <- obj$x + obj$y 54 | } else if (obj$foo == "yes") { 55 | z <- obj$x + obj$y + special(obj$x) 56 | } 57 | obj$z <- z 58 | obj 59 | } 60 | # a low-level function 61 | special <- function(x) { 62 | sum(x) + "hi" 63 | } 64 | ``` 65 | 66 | If we run this with `foo="yes"` we will run into some problems. 67 | 68 | ```{r error=TRUE} 69 | obj <- list(x=1:10, 70 | y=11:20, 71 | foo="yes") 72 | high(obj) 73 | ``` 74 | 75 | # Traceback 76 | 77 | We can call `traceback()` which gives the series of function calls 78 | that produced the latest error: 79 | 80 | ```{r eval=FALSE} 81 | traceback() 82 | ``` 83 | 84 | This gives: 85 | 86 | ``` 87 | 3: special(obj$x) at #5 88 | 2: analyze(obj.sub) at #3 89 | 1: high(obj) 90 | ``` 91 | 92 | # Browser 93 | 94 | My favorite strategy is to put `browser()` inside of one of the 95 | functions, for example, here: 96 | 97 | ```{r eval=FALSE} 98 | special <- function(x) { 99 | browser() 100 | sum(x) + "hi" 101 | } 102 | ``` 103 | 104 | Then rerunning: 105 | 106 | ```{r eval=FALSE} 107 | high(obj) 108 | ``` 109 | 110 | We are given a special `Browse` prompt where we can see what is in the 111 | environment with `ls()`, and investigate the contents: 112 | 113 | ``` 114 | Called from: special(obj$x) 115 | Browse[1]> 116 | debug at #3: sum(x) + "hi" 117 | Browse[2]> ls() 118 | [1] "x" 119 | Browse[2]> x 120 | [1] 1 3 5 7 9 121 | Browse[2]> sum(x) 122 | [1] 25 123 | Browse[2]> 124 | ``` 125 | 126 | To exit, type `Q`. 127 | 128 | # Recover on error 129 | 130 | Another option to debug on error is to set a global option: 131 | 132 | ```{r eval=FALSE} 133 | options(error=recover) 134 | ``` 135 | 136 | Then when running a function that triggers an error, R will launch a 137 | special browser prompt, but first asks us what level of the stack of 138 | calls we would like to enter: 139 | 140 | ``` 141 | Error in sum(x) + "hi" (from #2) : non-numeric argument to binary operator 142 | Calls: high -> analyze -> special 143 | 144 | Enter a frame number, or 0 to exit 145 | 146 | 1: high(obj) 147 | 2: #3: analyze(obj.sub) 148 | 3: #5: special(obj$x) 149 | 150 | Selection: 3 151 | ``` 152 | 153 | Here we pick to enter frame number 3. 154 | 155 | ``` 156 | Called from: analyze(obj.sub) 157 | Browse[1]> ls() 158 | [1] "x" 159 | Browse[1]> x 160 | [1] 1 3 5 7 9 161 | Browse[1]> sum(x) 162 | [1] 25 163 | ``` 164 | 165 | When you are finished, you can return with `Q` and you can stop the 166 | browser prompt upon error with: 167 | 168 | ```{r eval=FALSE} 169 | options(error=NULL) 170 | ``` 171 | 172 | # Debug and undebug 173 | 174 | Other options for debugging in R are the `debug`/`undebug` or `debugonce` 175 | functions. These are similar to `browser` but they step through a function 176 | line-by-line, and so can be tedious for large functions. However they 177 | do not require having the source code of the program on hand for 178 | editing. 
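As a quick illustrative sketch (not evaluated here), `debugonce` can be applied to the functions defined above:

```{r eval=FALSE}
# step through special() line-by-line for the next call only;
# debugging is then switched off automatically, so no undebug() is needed
debugonce(special)
high(obj)
# debug(special) would instead stop at every subsequent call
# until undebug(special) is run
```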
179 | -------------------------------------------------------------------------------- /rpkg/document.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Documenting an R package" 3 | author: "Michael Love" 4 | date: 10/25/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Types of documentation in R 12 | 13 | There are two main locations in an R package which contain 14 | documentation: 15 | 16 | * help pages (sometimes called "man pages") for each function 17 | * vignettes (longer form with R code chunks embedded in text) 18 | 19 | We will show in this lecture note how to generate documentation for 20 | both. For the function help pages, I strongly recommend to use 21 | *roxygen* formatting and the `document` function from the *devtools* 22 | package, which will be covered here. This will automatically generate 23 | both the `.Rd` files that go in the `man` directory, and it will 24 | populate the `NAMESPACE` file which specifies what is exported and 25 | what is imported by your package. 26 | 27 | Since we did not use Roxygen in the last lecture, lets recreate our foo package with Roxygen in mind from the start. In newer versions of R, this helps to avoid issues not overwriting the existing NAMESPACE file when we use the document() function later. 28 | 29 | Lets run the following to prep our foo package. If you do not want to rewrite your DESCRIPTION file, you can elect not to overwrite it when asked below. But, definitely choose to overwrite your existing NAMESPACE file. 30 | 31 | Alternatively, you can delete your existing foo folder and start from scratch. Just make sure to add the add function to the R/ directory in an R script file. 32 | 33 | ```{r, eval = F} 34 | library(usethis) 35 | create_package("foo", roxygen=TRUE) 36 | ``` 37 | 38 | # Writing help for functions 39 | 40 | Suppose we have a function in one of our R scripts, which is located 41 | in the `R` directory of our package. 42 | 43 | ```{r} 44 | add <- function(x,y,negative=FALSE) { 45 | z <- x + y 46 | if (negative) { 47 | z <- -1 * z 48 | } 49 | z 50 | } 51 | ``` 52 | 53 | We need to tell our prospective users what the function does, what 54 | kind of arguments it takes, and what is the output of the function. It 55 | is useful to also supply references to literature or to specific 56 | manuscripts associated with the code. 57 | 58 | Take a look at the help for the `quantile` function and the `sapply` 59 | function: 60 | 61 | ```{r eval=FALSE} 62 | ?quantile 63 | ?sapply 64 | ``` 65 | 66 | You can see there is some variation among function help pages, but key 67 | elements are repeated. Here we will cover some basics about 68 | documenting functions, and for further reference you can look up 69 | the notes from [Hadley Wickham](http://r-pkgs.had.co.nz/man.html) on 70 | writing documentation. 71 | 72 | # roxygen 73 | 74 | To start documenting this function with *roxygen*, you type a pound 75 | key and a single quote mark `#'` in a line above the function, and 76 | then add a title to the function 77 | 78 | ``` 79 | #' Sum of two vectors of numbers 80 | add <- function(x,y,negative=FALSE) { 81 | ... 82 | ``` 83 | 84 | If you press Enter for a new line from the title line, in most R-aware 85 | editors (Rstudio, Emacs + ESS, although not the R GUI) you will get a 86 | new line starting with `#'`. 
87 | 88 | The first line becomes the Title of the help page, the second line 89 | becomes the Description (briefly describe what the function does), and 90 | further lines will populate the Details section of the help page. For 91 | example: 92 | 93 | ``` 94 | #' Sum of two vectors of numbers 95 | #' 96 | #' This function sums two vectors of numbers and optionally 97 | #' allows for the negative of the sum to be returned. 98 | #' 99 | #' This paragraph will start the Details section... 100 | #' 101 | #' This will be the second paragraph of the Details section... 102 | add <- function(x,y,negative=FALSE) { 103 | ... 104 | ``` 105 | 106 | If it is desired to refer to the function itself or another function, 107 | one can use the following, `\code{add}`, which will be rendered as 108 | monospace type, `add`, in the HTML and PDF version of the help pages and 109 | as `'add'` with single-quotes in the R console version of the help pages. 110 | 111 | # Arguments 112 | 113 | Next we will document the arguments, which get a special tag to 114 | indicate their lines `@param`. One may ask why the arguments are 115 | tagged with `@param` as they will show up in the help page under 116 | **Arguments** and I believe the answer is that *roxygen* for 117 | documenting R code is patterned historically on 118 | [Doxygen](https://en.wikipedia.org/wiki/Doxygen) which uses the tag 119 | `@param`. 120 | 121 | Here we take out the extra Details paragraphs from above, and just 122 | focus on the Title, Description and Arguments: 123 | 124 | ``` 125 | #' Sum of two vectors of numbers 126 | #' 127 | #' This function sums two vectors of numbers and optionally 128 | #' allows for the negative of the sum to be returned. 129 | #' 130 | #' @param x a vector of numbers 131 | #' @param y a vector of numbers 132 | #' @param negative logical, whether to flip the sign of the sum 133 | add <- function(x,y,negative=FALSE) { 134 | ... 135 | ``` 136 | 137 | The format is: `@param name description`. The description is a bit 138 | personal preference, and I tend to put the expected type of the 139 | argument (e.g. logical) in the front for certain arguments, and 140 | sometimes also the default value: "logical, whether to flip the sign 141 | of the sum (default is FALSE)". The default value will also be printed 142 | in the Usage section, which is generated by default, so it's not 143 | strictly necessary. 144 | 145 | # Returned values 146 | 147 | It's also important to add the Value that is returned by the 148 | function, which I tend to put below the `@param` lines. In this 149 | trivial example, it's not very revealing, but some functions have 150 | complex outputs, e.g. a list with different elements, or a complex 151 | object, in which case it is useful to describe exactly what is being 152 | returned. If there is any ambiguity about the returned values, 153 | *please* indicate it in the help file, for example if the values are 154 | on the log, log2 or log10 scale, this would be the place to describe 155 | it. 156 | 157 | ``` 158 | ... 159 | #' @param negative logical, whether to flip the sign of the sum 160 | #' 161 | #' @return the sum of the two vectors 162 | add <- function(x,y,negative=FALSE) { 163 | ... 
164 | ``` 165 | 166 | If a list-like object is being returned, one can use the following 167 | paradigm to describe each piece of the returned object: 168 | 169 | ``` 170 | #' @return a list with the following elements: 171 | #' \itemize{ 172 | #' \item{...} 173 | #' \item{...} 174 | #' } 175 | ``` 176 | 177 | This will become a bulleted list in the help page. This paradigm can 178 | be used in the other sections, e.g. arguments as well. 179 | 180 | # Examples 181 | 182 | In the Examples field, you can provide small examples of using the 183 | function *that will run when you check the package*. These are 184 | required by Bioconductor. Ideally the examples should take no more 185 | than a few seconds. The R code directly follows the line with the 186 | tag `@examples`: 187 | 188 | ``` 189 | ... 190 | #' @return the sum of the two vectors 191 | #' 192 | #' @examples 193 | #' 194 | #' add(1:5, 6:10) 195 | #' add(1:5, 6:10, negative=TRUE) 196 | #' 197 | add <- function(x,y,negative=FALSE) { 198 | ... 199 | ``` 200 | 201 | # Import and export 202 | 203 | While *roxygen* is not the only way to deal with function import and 204 | export, I find it the easiest by far. Any function which you desire to 205 | export (to make visible to users who load your package), you add the 206 | `@export` tag in the roxygen code. I tend to add it directly above the 207 | function definition: 208 | 209 | ``` 210 | ... 211 | #' @export 212 | add <- function(x,y,negative=FALSE) { 213 | ... 214 | ``` 215 | 216 | Imports are only slightly more complicated. If you want to import a 217 | specific function from a package (and I recommend this, rather than 218 | importing entire packages), you can add the `@importFrom` tag in the 219 | *roxygen* code block. The format is `@importFrom package function`. I 220 | also recommend to use `package::function` in your R code when you use 221 | a function from another package. This clarifies that the function is 222 | not defined in your package and will avoid errors when you run R's 223 | package check. 224 | 225 | ``` 226 | ... 227 | #' @importFrom gtools rdirichlet 228 | #' 229 | #' @export 230 | add <- function(x,y,negative=FALSE) { 231 | d <- gtools::rdirichlet(1, alpha=c(1,2,3)) 232 | z <- x + y 233 | ... 234 | ``` 235 | 236 | # Actually making the Rd files 237 | 238 | Now that we have created the following *roxygen* code block above our 239 | file, we can run the `document` function from the *devtools* package 240 | to create the `.Rd` files for our package. 241 | 242 | Altogether we have: 243 | 244 | ``` 245 | #' Sum of two vectors of numbers 246 | #' 247 | #' This function sums two vectors of numbers and optionally 248 | #' allows for the negative of the sum to be returned. 249 | #' 250 | #' @param x a vector of numbers 251 | #' @param y a vector of numbers 252 | #' @param negative logical, whether to flip the sign of the sum 253 | #' 254 | #' @return the sum of the two vectors 255 | #' 256 | #' @examples 257 | #' 258 | #' add(1:5, 6:10) 259 | #' add(1:5, 6:10, negative=TRUE) 260 | #' 261 | #' @importFrom gtools rdirichlet 262 | #' 263 | #' @export 264 | add <- function(x,y,negative=FALSE) { 265 | d <- gtools::rdirichlet(1, alpha=c(1,2,3)) 266 | z <- x + y 267 | if (negative) { 268 | z <- -1 * z 269 | } 270 | z 271 | } 272 | ``` 273 | 274 | When we run `document()` the help pages with `.Rd` ending are created 275 | and also the `NAMESPACE` file is re-written to indicate the imports and 276 | exports. 
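For example (not evaluated here), from the directory containing the `foo` package:

```{r eval=FALSE}
library(devtools)
document("foo")   # writes man/add.Rd and regenerates NAMESPACE
```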
The `DESCRIPTION` file may also be updated with a single 277 | field `RoxygenNote: x.y.z` denoting the version of *roxygen2*, the R 278 | package that handles reading *roxygen* code chunks and writing `.Rd` 279 | files. 280 | 281 | The following file is created in `man/add.Rd`. Note that it says the 282 | file is generated by the *roxygen2* package and so **should not be 283 | edited by hand**. This is because the source of the help page is in 284 | the R file, and so any edits to this file will be wiped out the next 285 | time `document()` is run. 286 | 287 | ``` 288 | % Generated by roxygen2: do not edit by hand 289 | % Please edit documentation in R/foo.R 290 | \name{add} 291 | \alias{add} 292 | \title{Sum of two vectors of numbers} 293 | \usage{ 294 | add(x, y, negative = FALSE) 295 | } 296 | \arguments{ 297 | \item{x}{a vector of numbers} 298 | 299 | \item{y}{a vector of numbers} 300 | 301 | \item{negative}{logical, whether to flip the sign of the sum} 302 | } 303 | \value{ 304 | the sum of the two vectors 305 | } 306 | \description{ 307 | This function sums two vectors of numbers and optionally 308 | allows for the negative of the sum to be returned. 309 | } 310 | \examples{ 311 | 312 | add(1:5, 6:10) 313 | add(1:5, 6:10, negative=TRUE) 314 | 315 | } 316 | ``` 317 | 318 | Note that, if you are keeping your package on a repository such as 319 | GitHub, you will need to explicitly add `man/foo.Rd` to the repository 320 | and push it to the origin, so that others will have access to the 321 | documentation when they load your package. 322 | 323 | Another note: if you just want to preview the help file, you can 324 | simply call `load_all()` and then type out `?foo`, which will bring up 325 | your most recent edits. 326 | 327 | The NAMESPACE file is updated from exporting all functions that do not 328 | begin with a period, to only exporting the `add` function: 329 | 330 | ``` 331 | # Generated by roxygen2: do not edit by hand 332 | 333 | export(add) 334 | importFrom(gtools,rdirichlet) 335 | ``` 336 | 337 | There is one thing left to do to manually, which is to add the 338 | following line to the `DESCRIPTION` file, which indicates that we are 339 | importing at least one function from the *gtools* package: 340 | 341 | ``` 342 | Imports: gtools 343 | ``` 344 | 345 | # How to add a package vignette 346 | 347 | We mentioned at the beginning that there are two types of 348 | documentation for R package, the help/man pages which we have showed 349 | how to build with *roxygen*, and the package vignettes, which are 350 | longer form discursive examples of using the functions in the package, 351 | and perhaps also including some of the motivation or theory behind the 352 | methods in the package. 353 | 354 | Adding vignettes to a package is very simple, technically much simpler 355 | than writing the function help pages, but it takes a lot of time to 356 | hone the message of a vignette. First we will show the technical 357 | aspects of adding a vignette. 358 | 359 | I recommend to use Rmarkdown (`.Rmd`) as the format of the vignette, 360 | although the Sweave format (.`Rnw`) is also possible (Sweave predates 361 | Rmd, producing PDF output). Rmarkdown can produce both HTML and PDF 362 | output, but typically it is used to produce HTML output. My reasoning 363 | for preferring Rmarkdown/HTML is that it allows the documentation to 364 | be easily viewed and scrolled through on computers, laptops and 365 | phones, whereas PDF is optimal for printing. 
I don't think many users 366 | are printing vignettes for statistical software packages, but are 367 | instead probably often reading the documentation and vignettes 368 | *on-demand* while they are also in a separate window perhaps working 369 | on their own particular dataset and analysis. This is just my opinion, 370 | but I've heard from users that they appreciate HTML vignettes, which 371 | are just as easy to create as PDF vignettes. 372 | 373 | We don't teach Rmarkdown in this class explicitly, as the class 374 | teaching computing at a high level and builds on other classes which 375 | leverage Rmarkdown. It is a very easy format to learn if you haven't 376 | written it yet (as you know, these lecture notes are written in 377 | Rmarkdown). A guide to writing Rmarkdown can be 378 | found [here](https://rmarkdown.rstudio.com/lesson-15.html), or by 379 | Googling "Rmarkdown cheat sheet". 380 | 381 | There can be multiple vignettes for a package, but it's probably most 382 | common that a package would have one vignette. The vignettes are then 383 | files with `.Rmd` endings in the `vignettes` directory of the package. 384 | 385 | The preamble to a software vignette can look like the following (not 386 | all aspects here are necessary, but I find this preamble useful): 387 | 388 | ``` 389 | --- 390 | title: "A great title for a vignette" 391 | date: "`r format(Sys.Date(), '%m/%d/%Y')`" 392 | author: "Jane Doe, John Doe" 393 | output: rmarkdown::html_document 394 | abstract: | 395 | Lorem ipsum lorem ipsum lorem ipsum lorem ipsum lorem ipsum 396 | lorem ipsum lorem ipsum lorem ipsum lorem ipsum lorem ipsum 397 | bibliography: library.bib 398 | vignette: | 399 | %\VignetteIndexEntry{A great title for a vignette} 400 | %\VignetteEngine{knitr::rmarkdown} 401 | %\VignetteEncoding{UTF-8} 402 | --- 403 | ``` 404 | 405 | In order to have the package recognize and compile this vignette, we 406 | also need to do two things: (1) add *knitr* and *rmarkdown* to the 407 | Suggests field in `DESCRIPTION` and (2) add the following new field to 408 | the `DESCRIPTION`: 409 | 410 | ``` 411 | VignetteBuilder: knitr, rmarkdown 412 | ``` 413 | 414 | That concludes the technical aspects of adding an Rmarkdown vignette 415 | to a package, but I'll add a few notes on *what* in my opinion should 416 | go in a package vignette, as I've written a number of these now and 417 | they have evolved over time to cover what users need to know. 418 | 419 | # What to put in a package vignette 420 | 421 | Here are some bulleted thoughts on writing a good package vignette: 422 | 423 | * Use sections (and sub-sections) to demarcate meaningful steps in the 424 | process, e.g. "Preprocessing", "Quality control", "Main analysis", 425 | "Plotting results", "Diagnostics", etc. You can specify `toc: true` 426 | (and `toc_float: true`) in the **output** section of the preamble in 427 | order to add a Table of Contents (and make it floating, that is, 428 | unfold/fold and follow as the user scrolls down). See the Rmarkdown 429 | cheat sheet for instructions on adding options to the preamble. 430 | * Most of the code chunks should be evaluated, it's ok to add a few 431 | `eval=FALSE` code chunks for demonstration, but vignettes where none 432 | of the code chunks are evaluated are not as helpful in my opinion, 433 | as the output that a user gets when running the code may differ from 434 | what happens in the non-evaluated code chunks, which may even 435 | generate errors if they are out-of-date! 
436 | * A Frequently Asked Questions (FAQ) section at the end can be useful 437 | to put together answers to common questions you expect (or compile) 438 | from users. 439 | * I like to put Acknowledgments and References sections, the first to 440 | list people who helped in writing the software, and the second to 441 | list relevant literature. See the preamble above for how to add a 442 | BibTex file (and see Rmarkdown cheat sheet for how to add references 443 | to an Rmarkdown file). 444 | 445 | # Vignette datasets 446 | 447 | This is one of the trickiest parts to writing a good package 448 | vignette. You want the vignette to demonstrate a typical analysis, 449 | however a typical analysis will use files from an arbitrary location 450 | on a user's machine or cluster. However, the package vignette is 451 | built whenever the package is built, and so it cannot point to a 452 | missing dataset, or else this would generate an error. Relying on a 453 | URL for downloading the dataset is therefore a bad idea. Therefore 454 | packages will typically put the example dataset (which may itself be 455 | smaller than an normal-sized dataset) into the `inst/extdata` or 456 | `data` directory of the package itself or another associated data 457 | package (the former for non-R data objects, the latter for `.rda` 458 | files). If the example dataset is more than 1 Mb, I'd suggest to put 459 | it into a separate data package, as the software packages should not 460 | be inflated by unnecessary data objects. Then in the vignette, you 461 | could use the following code to find the dataset: 462 | 463 | ```{r eval=FALSE} 464 | dir <- system.file("extdata", package="fooData") 465 | file <- file.path(dir, "example.xyz") 466 | ``` 467 | 468 | ...for an arbitrary file, or for an `.rda` file named `example.rda`: 469 | 470 | ```{r eval=FALSE} 471 | library(fooData) 472 | data(example) 473 | ``` 474 | 475 | Obviously, these lines of code are not what a user would do when they 476 | are running your software on their files. Their data will not reside 477 | in an R package, but instead it will be at some location on their 478 | machine or cluster. And so before these types of code chunks, I will 479 | typically write a sentence that these code chunks are specifically for 480 | the vignette, and do not represent typical workflow code. 481 | -------------------------------------------------------------------------------- /rpkg/efficient.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Readable and efficient R code" 3 | author: "Michael Love, modified by Naim Rashid" 4 | date: 10/19/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Why write readable and efficient code? 12 | 13 | In this document we will discuss some strategies for writing readable 14 | and efficient code in R (the strategies may sometimes extend to 15 | other high-level programming languages like python, Julia, or Matlab). 16 | 17 | First, why do we care about these *secondary* properties of our 18 | code. Perhaps the most important thing about an implementation of a 19 | statistical method is that it *works*. You want it to be accurate, in 20 | that you have faithfully translated the mathematical formulation of a 21 | statistical method into code and that it runs without crashing on 22 | typical input data. This is true, and yet these other properties are 23 | also very important. 
24 | 25 | The main reason to care about readability and efficiency is because 26 | *you want others to use your method, or be able to understand your 27 | analysis*. There is a lot of chance involved in your method becoming 28 | popular, but you give yourself a much better chance if it can be 29 | easily understood by reading the code, and if it has been written in a 30 | clever way, avoiding unnecessary inefficiencies. And similarly, 31 | someone is more likely to trust your analysis if it doesn't look 32 | like [spaghetti code](https://en.wikipedia.org/wiki/Spaghetti_code). 33 | 34 | Trust that someone will probably look at your code (whether for a 35 | method or a published analysis) and decide whether it makes sense. 36 | 37 | # Readable code 38 | 39 | **Readable code** for a high-level programming language will look 40 | similar across whatever language you use. 41 | 42 | * *Visual space:* Code within a file is broken into meaningful 43 | vertical chunks, and code within a project is broken into meaningful 44 | files (this varies a bit from person to person). Use spaces between 45 | operators, e.g. `x <- 1` rather than `x<-1`. 46 | 47 | * *Non-repeating:* Functions are used to define operations that will 48 | be repeated more than once. There should almost never be any code 49 | that looks as if it were *copy-pasted* within a project. Variations 50 | among similar code chunks can be turned into arguments of a 51 | function. 52 | 53 | * *Inline documentation:* User-facing functions should have arguments 54 | that are documented above the function, along with a description of 55 | what the function returns. (We will discuss strategies for doing 56 | this in R using *roxygen2* and the *devtools* package.) 57 | 58 | * *Comments:* Use lots of comments to describe the choices that are 59 | made in the code. It's difficult to actually provide *too many* 60 | comments. This helps others, and it will certainly help yourself in 61 | the future as you return to your code to make sense of what you were 62 | trying to do. 63 | 64 | * *Meaningful names:* Function and variable naming is actually quite 65 | difficult. Simple and descriptive is good. If you have a function 66 | that estimates weights, e.g. `estimateWeights`. For the main 67 | variables/objects (such as the ones the user provides as an 68 | argument), short variable names are acceptable, especially when 69 | these align with standard notation, e.g. `y`, `x`, or abbreviations, 70 | `wts`. For intermediate variables/objects it is best to be more 71 | descriptive, e.g. `robustEstVar` for a robust estimate of 72 | variance. Some R developers use "camel-case" for functions and 73 | variables (`estimateWeights`), while others use underscores 74 | (`estimate_weights`) for functions or periods for variables names 75 | `robust.est.var`. This is not so important, but try to be consistent 76 | within a project. 77 | 78 | It is also important to maintain a consistent coding style, as this consistency throughout your code helps with readability. I would recommend reading Hadley Wickham's [R style guide](http://adv-r.had.co.nz/Style.html) as a good place to start (there are others that exist). This guide goes through naming conventions, syntax, indentation, and other items, some of which were already covered in the list above. 79 | 80 | # Efficient code 81 | 82 | **Efficient code** is a bit more language specific, and here we will 83 | focus on efficiency in the R language. 
The most important factor 84 | below however is true also for the other high-level programming 85 | languages. 86 | 87 | * *Use vectorized functions:* The most important thing to recognize 88 | about high-level programming languages is that they are built on top 89 | of fast routines written in Fortran, C, or C++ mostly. Iteration in 90 | the high-level language [will always be slower](https://www.noamross.net/archives/2014-04-16-vectorization-in-r-why/) than the *vectorized* 91 | function which iterates in the lower-level language. Use of row- or 92 | column-based operations over matrices, or matrix multiplication, 93 | that avoids iterating over rows or columns is one of the keys to 94 | efficient programming in R. 95 | * *Allocate first:* Another pitfall for writing R code is any 96 | operation which grows the memory space required for an object at 97 | each iteration. You should almost never have a loop where the inside 98 | of the loop has `out <- c(out, new.thing)`. Concatenation is ok, but 99 | remember that it has a time cost, so you don't want to be doing it 100 | often. Loops in R are not as bad as you may have heard, as long 101 | as the space has been pre-allocated. Loops in R will likely be 102 | slower than using a vectorized function, or a loop in C or C++, but 103 | they don't need to be avoided at all cost. 104 | * *Avoid copying large objects:* This goes along with the above point, 105 | but copying large objects takes up a lot of time in R. To the extent 106 | that you can avoid [making copies](https://adv-r.hadley.nz/names-values.html) of a data object, you will avoid unnecessary slowdowns. We will discuss this in more detail later. 107 | * *Go to C or C++:* You can typically gain a lot of speed by moving a 108 | repetitive operation from R to C or C++. We will see how to do this 109 | easily and in a way that the code is still readable in later lecture 110 | notes. 111 | * *Memoization:* If you happen to be writing a function that will take 112 | input which has repeated elements, and it is very important for this 113 | function to be very fast, memoization can be a useful 114 | technique. Memoization entails storing the values of expensive 115 | function calls so that they don't have to be repeated. There is a 116 | small overhead in saving and looking up the precomputed values, but 117 | if the degree of repeated input is high, the savings of memoization 118 | can be large. The *memoise* package helps with this in R. 119 | * *Parallelization:* One can think of the standard for loop as a serial operation: the $(i+1)$th iteration is always ran after the $(i)$th iteration has completed. On machines with more than one CPU available, say with $P$ CPUs, parallelization of iterations may allow for the execution of up to $P$ iterations of the for loop at the same time. This is particularly helpful when there are a large number of loop iterations that are non-recursive, meaning the next iteration of the loop does not depend on the prior one (simulations for example). One thing to note is that each parallel instance invoked in R requires additional memory, and therefore in memory-intensive operations this may quickly exhaust the available memory on your machine if you are not careful. 120 | 121 | Two short notes on avoiding making copies, from Section 2.5 in [Advanced R](https://adv-r.hadley.nz/names-values.html): 122 | 123 | > For loops have a reputation for being slow in R, but often that slowness is caused by every iteration of the loop creating a copy. 
Consider the following code. It subtracts the median from each column of a large data frame: 124 | 125 | and 126 | 127 | > While it’s not hard to determine when a copy is made, it is hard to prevent it. If you find yourself resorting to exotic tricks to avoid copies, it may be time to rewrite your function in C++ 128 | 129 | # Learning more about efficient code 130 | 131 | This course has many topics to cover, and so we can only cover some of 132 | the basics for each module, and within each module on each topic. Of 133 | course, efficient code is a topic that could expand to fill a 134 | semester. We will show some of the most important topics for a 135 | Biostatistics student in this lecture note, but I highly recommend 136 | this online textbook by Colin Gillespie and Robin Lovelace: 137 | 138 | * [Efficient R Programming](https://csgillespie.github.io/efficientR/) 139 | 140 | We don't have time to sufficiently cover everything in this note, but it's worth reading on your own. Some of the topics in this book, for example usage of *Rcpp* and *data.table* will be covered in later lecture notes. 141 | 142 | 143 | # Benchmarking R code 144 | 145 | We will make use of *microbenchmark* package to assess efficiency of 146 | methods here and again in the course. It is not part of the core set 147 | of R packages, so you will need to install it with `install.packages`. 148 | 149 | ## Pre-allocation 150 | 151 | You can compare two implementations by passing them to the 152 | `microbenchmark` function: 153 | 154 | ```{r} 155 | library(microbenchmark) 156 | slow.sqrt <- function(x) { 157 | ans <- numeric(0) 158 | for (i in seq_along(x)) { 159 | ans <- c(ans, sqrt(x[i])) 160 | } 161 | ans 162 | } 163 | microbenchmark(sqrt(1:1000), slow.sqrt(1:1000)) 164 | ``` 165 | 166 | This benchmark indicates that the vectorized version of the square 167 | root is many orders of magnitude faster than a naive 168 | implementation. Let's compare `slow.sqrt` with a version where we 169 | preallocate the vector that stores the eventual output of the function: 170 | 171 | ```{r} 172 | pre.sqrt <- function(x) { 173 | ans <- numeric(length(x)) 174 | for (i in seq_along(x)) { 175 | ans[i] <- sqrt(x[i]) 176 | } 177 | ans 178 | } 179 | microbenchmark(pre.sqrt(1:1000), slow.sqrt(1:1000)) 180 | ``` 181 | 182 | So simply pre-allocating saves us about an order of magnitude in 183 | speed. 184 | 185 | ## Initial attempts at C++ 186 | 187 | We can also assess the speed of R's vectorized `sqrt` comparing to our 188 | own implementations in C++. Here we use the *Rcpp* package to define 189 | some inline C++ code which is then compiled and accessible as an R 190 | function. We will show more details on incorporating C++ code in 191 | future lecture notes. 192 | 193 | ```{r} 194 | library(Rcpp) 195 | cppFunction( 196 | "NumericVector rcpp_sqrt1(NumericVector x) { 197 | int n = x.size(); 198 | Rcpp::NumericVector y(x); 199 | for (int i=0; i < n; i++) { 200 | y[i] = sqrt(x[i]); 201 | } 202 | return y; 203 | }") 204 | cppFunction( 205 | "NumericVector rcpp_sqrt2(NumericVector x) { 206 | return sqrt(x); 207 | }") 208 | microbenchmark(sqrt(1:1000), 209 | rcpp_sqrt1(1:1000), 210 | rcpp_sqrt2(1:1000)) 211 | ``` 212 | 213 | Our two C++ implementations are comparable, and now only about 3 times 214 | slower than R's `sqrt`. 215 | 216 | ## Memoization 217 | 218 | We also show an example of how memoization can help in certain 219 | circumstances when there is repeated input. 
220 | 221 | ```{r} 222 | library(memoise) 223 | x <- sample(10, 1000, TRUE) 224 | slow.fn <- function(x) { 225 | Sys.sleep(.001) 226 | x + pi 227 | } 228 | mem.fn <- memoise(slow.fn) 229 | system.time(sapply(x, slow.fn)) 230 | system.time(sapply(x, mem.fn)) 231 | forget(mem.fn) 232 | ``` 233 | 234 | We defined an arbitrary function `slow.fn` which waits a millisecond 235 | before returning the input plus `pi`. We then run `slow.fn` on a 236 | vector of length 1000, although the values of the vector are limited 237 | to the numbers 1 to 10. When we instead call the memoized version of 238 | the function, we only incur the millisecond cost of `slow.fn` 10 times 239 | (for each time we need to calculate the value of `slow.fn` on the 240 | numbers 1 to 10). Any repeated calls to `slow.fn` on a number that 241 | `mem.fn` has already seen instead incur the cost of looking up the 242 | value of `slow.fn` in an in-memory cache. 243 | 244 | ## Matrices when possible 245 | 246 | Finally, one can be using vectorized functions in R and still not 247 | writing the most efficient code possible, if one is not making use of 248 | functions which can operate on matrices, but instead working "one row 249 | at a time". Here we give a simple example of $A X = B$ where all three 250 | are matrices. If we treat this problem as a series of $A x = b$ where 251 | $x$ and $b$ are vectors, then we will incur a cost in speed. 252 | 253 | ```{r} 254 | n <- 50 255 | m <- 1000 256 | a <- matrix(rnorm(n*n),nrow=n) 257 | x <- matrix(rnorm(n*m),nrow=n) 258 | b <- a %*% x 259 | xx <- solve(a, b[,1]) 260 | all.equal(x[,1], xx) 261 | ``` 262 | 263 | Implementing this as a function: 264 | 265 | ```{r} 266 | slow.solve <- function(a, b) { 267 | xx <- matrix(nrow=nrow(b), ncol=ncol(b)) 268 | for (i in seq_len(ncol(b))) { 269 | xx[,i] <- solve(a, b[,i]) 270 | } 271 | xx 272 | } 273 | x1 <- solve(a, b) 274 | x2 <- slow.solve(a, b) 275 | all.equal(x1, x2) 276 | microbenchmark(solve(a,b), slow.solve(a,b)) 277 | ``` 278 | 279 | ## Parallelization 280 | 281 | The *parallel* package in R is commonly used for parallelizing functions. Let's see how much we can speed up the `slow.solve` function by parallelizing it. First we have to load the *parallel* library and detect the maximum number of cores available on your machine. 282 | 283 | ```{r} 284 | library("parallel") 285 | no_of_cores = detectCores() 286 | ``` 287 | 288 | Then, we can utilize the parallelized versions of sapply, apply, and lapply from the parallel package (parSapply, parApply, and parLapply). We start by requesting the number of cores we would like to use (here we use two). 289 | 290 | ```{r} 291 | ## creates a set of copies of R running in parallel 292 | cl = makeCluster(2) 293 | 294 | ## now we create a function representing the inner portion of the slow.solve for loop 295 | # bi is assumed to be the i'th column of b being passed to slow.solve.i 296 | slow.solve.i = function(bi, a){ 297 | xxi <- solve(a, bi) 298 | return(xxi) 299 | } 300 | 301 | ## now run benchmark comparing slow.solve, the apply version, and the parApply version 302 | microbenchmark( 303 | slow.solve(a,b), 304 | apply(X = b, MARGIN = 2, FUN = slow.solve.i, a = a), 305 | parApply(cl = cl, X = b, FUN = slow.solve.i, MARGIN = 2, a = a) 306 | ) 307 | 308 | # Close the cores after use of parApply to prevent memory leaks 309 | stopCluster(cl) 310 | ``` 311 | 312 | We see that the parallelized version is twice as fast as `slow.solve` and the `apply` version.
Still, it is not as fast as using a vectorized function. For a smaller number of iterations, parallelization may not be as helpful given the overhead required to coordinate tasks across cores. If you use this pattern within a function, an error in that function may cause it to terminate early without the cluster being stopped. Adding `on.exit(stopCluster(cl))` after `makeCluster()` above guards against this, since whatever call is placed within `on.exit()` will be executed regardless of how the function ends. 313 | 314 | On Linux and Mac, one can alternatively use the mclapply() and mcmapply() functions, where the main difference is that one does not need to explicitly start and stop the cluster objects as in the above example. This simplifies their use, especially when nested within other functions. 315 | 316 | One important note applies when using parallelization on high-performance computing clusters. Such clusters consist of multiple compute nodes connected to a login node. Typically jobs that are submitted to the cluster are handled from the login nodes and distributed to the compute nodes. A single job may also request multiple CPUs to run operations within the job in parallel. However, for the parallel package and most other parallelized routines in R, parallelization cannot go beyond a single node, and is therefore limited to the number of CPUs on that node. For parallelization across a large number of CPUs, one needs to utilize Message Passing Interface (MPI) enabled versions of R, with accompanying MPI-enabled R packages (*Rmpi*) and cluster software (*openMPI*). 317 | 318 | Additional detail can be found in Section 7.5 of the Efficient R Programming book. An up-to-date listing of other R packages that enable parallel computing can be [found here](https://cran.r-project.org/web/views/HighPerformanceComputing.html). Examples include *Rmpi*, which is an R wrapper for MPI, as well as *foreach*, which is an R package that allows for the execution of parallelized loops in R. 319 | 320 | ## Code Profiling 321 | 322 | [Code profiling](https://csgillespie.github.io/efficientR/performance.html#performance-profvis) is a really useful way to figure out which lines of code are 323 | causing the greatest bottlenecks in your package or script. I highly recommend using code profiling if you end up making a package where speed and memory usage are critical factors. A popular package used for this purpose is the *profvis* package via the profvis() function. One way to use this function is to apply it directly to several lines of R code, which can be passed to the first argument of the function. We demonstrate this with an example from earlier: 324 | 325 | 326 | ```{r, eval = F} 327 | library("profvis") 328 | profvis({ 329 | n <- 500 330 | m <- 1000 331 | a <- matrix(rnorm(n*n),nrow=n) 332 | x <- matrix(rnorm(n*m),nrow=n) 333 | b <- a %*% x 334 | xx <- solve(a, b[,1]) 335 | }) 336 | ``` 337 | 338 | After running the above example, we can see a new interactive HTML panel open where the top pane breaks up the code chunk by line and lists the memory usage and timing of each (in milliseconds). 339 | 340 | ![](profvis.png) 341 | 342 | We can see that the matrix multiplication step is the most time consuming here, and would be a place to focus our efforts if we would like to speed up the overall run time of this chunk. The bottom pane is called a "flame graph" and lists the call stack on the vertical axis and time (in milliseconds) on the horizontal axis. We can see here that again the matrix multiplication step was the most time consuming.
Since we are using simple functions here the call stack is not that deep, but you can find more examples [here](https://rstudio.github.io/profvis/) on the package website. 343 | 344 | You can also run profiling on individual functions, such as the ones we defined earlier: 345 | 346 | ```{r, eval = F} 347 | profvis(slow.solve(a, b)) 348 | ``` 349 | 350 | Interpreting memory use in the top pane is a little more tricky, see the following quote from the *profvis* package page: 351 | 352 | > The code panel also shows memory allocation and deallocation. Interpreting this information can be a little tricky, because it does not necessarily reflect memory allocated and deallcated at that line of code. The sampling profiler records information about memory allocations that happen between the previous sample and the current one. This means that the allocation/deallocation values on that line may have actually occurred in a previous line of code. 353 | 354 | In general, code profiling allows one to focus your speed and memory optimization efforts on those parts that are *actually* resource intensive in your code. 355 | 356 | # Example: Monte-Carlo integration 357 | 358 | Suppose we wish to estimate the integral 359 | \[ 360 | \int_0^1 x^2 dx 361 | \] 362 | using Monte-Carlo Integration. We will learn more about this in Module 2, but for now, we focus on the programming aspect. 363 | 364 | _Monte Carlo Integration Psuedocode_ 365 | 366 | 1. Initialise: `hits = 0` 367 | 1. __for i in 1:N__ 368 | 1. $~~~$ Generate two random numbers, $U_1, U_2$, between 0 and 1 369 | 1. $~~~$ If $U_2 < U_1^2$, then `hits = hits + 1` 370 | 1. __end for__ 371 | 1. Area estimate = `hits/N` 372 | 373 | Implementing this Monte-Carlo algorithm in R would typically lead to something like: 374 | 375 | ```{r tidy=FALSE} 376 | monte_carlo = function(N) { 377 | hits = 0 378 | for (i in seq_len(N)) { 379 | u1 = runif(1) 380 | u2 = runif(1) 381 | if (u1 ^ 2 > u2) 382 | hits = hits + 1 383 | } 384 | return(hits / N) 385 | } 386 | ``` 387 | 388 | In R this takes a few seconds 389 | 390 | ```{r cache=TRUE} 391 | N = 500000 392 | system.time(monte_carlo(N)) 393 | ``` 394 | 395 | Derive a more R-centric, vectorized approach to the above function to make it faster. Can you explain the intuition behind why this algorithm works for estimating the value of an integral? 396 | 397 | 398 | -------------------------------------------------------------------------------- /rpkg/profvis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biodatascience/statcomp_src/558195d8bb181b582263678914107f12872a4443/rpkg/profvis.png -------------------------------------------------------------------------------- /rpkg/rcpp.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Calling C++ with Rcpp" 3 | author: "Michael Love" 4 | date: 10/25/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Calling other languages from R 12 | 13 | Calling languages like C and C++ from R can be a very powerful 14 | framework, and you'll see that many successful R packages will use C 15 | or C++ (or Fortran) to run the most computationally intensive part of 16 | the method. R then provides a useful interface, and a way to make nice 17 | plots and summaries of the results, or to link to other datasets. 
R is 18 | also useful for reading in a variety of data, as we will see in a 19 | series of notes about working with large data. 20 | 21 | R being the glue tying together fast low-level routines is actually 22 | how most of R works. Note for example the `.f` and `.c` files in the 23 | [R source directory](https://svn.r-project.org/R/trunk/src/library/stats/src/). 24 | This R-as-glue framework goes back to the design proposal for the 25 | original language S, as conceived by one of its creators John 26 | Chambers in the 1970s: 27 | 28 | 29 | 30 | [Slide from John Chambers' useR 2006 talk](https://www.r-project.org/conferences/useR-2006/Slides/Chambers.pdf) 31 | 32 | # Introduction to compilation errors 33 | 34 | In this lecture note, we are going to focus on adding C++ code to an R 35 | package. We already showed a small example in the efficient code 36 | lecture note about using the `cppFunction` in *Rcpp* to write some 37 | inline C++ code. This is not the approach that you should take however 38 | if you want to include C++ code in an R package. Instead you will put 39 | your C++ code into one or more files with a `.cpp` ending, which go 40 | into the `src` directory of your R package. 41 | 42 | We will not teach C++ itself in this class, but just show some simple 43 | examples and let you continue on your own in your research. 44 | There are numerous resources online for learning C++, 45 | but you can also stick closely to the operations defined in the *Rcpp* 46 | and *RcppArmadillo* guides and implement many statistical routines 47 | with experimentation and use of Google and Internet forums. 48 | It is perfectly normal that you will encounter compilations errors if 49 | you are new to writing C++ code, and as you learn how to work with new 50 | packages and new C++ libraries. As you encounter errors, look for 51 | examples of well written R packages using Rcpp, read the 52 | [Rcpp vignettes](https://cran.r-project.org/web/packages/Rcpp/index.html), 53 | and use Google to find relevant threads. In 54 | fact, the compilation errors will sometimes point you straightaway to 55 | the problems in your code. Other times it will take some more 56 | investigation, but this is all totally normal. For students more 57 | familiar with writing R code, typically the errors involve working 58 | with a 59 | [static typed language](https://en.wikipedia.org/wiki/Type_system#Type_checking). 60 | 61 | Why C++? You can also combine R with C or with Fortran. We choose C++ 62 | because there is an R package Rcpp which *greatly* simplifies the 63 | interface between R and a fast language. It is even possible to 64 | perform vectorized operations in C++ as described in the 65 | [Rcpp-sugar vignette](https://cran.r-project.org/web/packages/Rcpp/vignettes/Rcpp-sugar.pdf). 66 | In addition there are sister packages like *RcppArmadillo* 67 | providing access to the fast and elegant linear algebra library 68 | [Armadillo](http://arma.sourceforge.net/), which leverage the Rcpp interface. 69 | By elegant, I mean that the C++ code is still quite "high-level" in 70 | that it is easily readable as linear algebra. I find this very 71 | compelling as statistical software is both a useful product to 72 | scientists who may want to use the statistical method, and *the source 73 | code should be readable as a research product itself*, so that other 74 | statisticians can be inspired by it to create related methods. 
75 | 76 | As background, here are some of the first few lines from 77 | [the introductory vignette to Rcpp](https://cran.r-project.org/web/packages/Rcpp/vignettes/Rcpp-introduction.pdf): 78 | 79 | > Since the beginning, and as we argue below, “by design”, the R 80 | > system has always provided an application programming interface 81 | > (API) suitable for extending R with code written in C or Fortran. 82 | > Being implemented chiefly in R and C (with a generous sprinkling of 83 | > Fortran for well-established numerical subroutines), R has always 84 | > been extensible via a C interface. ... 85 | > And with the introduction of the Rcpp package (Eddelbuettel and 86 | > François, 2011; Eddelbuettel, 2013; Eddelbuettel et al., 2018a), 87 | > and its later refinements, this process of extending R has become 88 | > considerably easier yet also more robust. 89 | 90 | # Hello world example 91 | 92 | We will first focus on a "hello world" example of getting a C++ 93 | function that we can incorporate into an R package and call from 94 | R. The first step is to install the *Rcpp* package. 95 | 96 | We then write a simple example of a C++ function (this one from the 97 | [Rcpp-package vignette](https://cran.r-project.org/web/packages/Rcpp/vignettes/Rcpp-package.pdf)). 98 | 99 | ```{Rcpp eval=FALSE} 100 | #include <Rcpp.h> 101 | using namespace Rcpp; 102 | 103 | // [[Rcpp::export]] 104 | List rcpp_hello_world() { 105 | CharacterVector x = CharacterVector::create( "foo", "bar" ) ; 106 | NumericVector y = NumericVector::create( 0.0, 1.0 ) ; 107 | List z = List::create( x, y ) ; 108 | return z ; 109 | } 110 | ``` 111 | 112 | The top two lines include the Rcpp header file, and declare that we 113 | will use the Rcpp namespace. If we did not use the Rcpp namespace we 114 | would have to put `Rcpp::` in front of all Rcpp functions, 115 | e.g. `Rcpp::NumericVector`. 116 | The line beginning with `//` indicates that we want to export this 117 | function to be used in R (more on this later). The body of the 118 | function follows, where we indicate that `rcpp_hello_world` returns a 119 | *List* and takes no arguments. Inside the function are some simple 120 | commands, creating a pre-specified character vector and numeric 121 | vector, then making a *List* from these two and returning this 122 | *List*. 123 | 124 | We can add this to a `.cpp` file in the `src` directory, for example, 125 | `src/foo.cpp`. One last step is that we need to add the following to 126 | our `DESCRIPTION` file (don't add a new Imports line if you already 127 | have one, but just add Rcpp to the comma separated list). The 128 | LinkingTo line will allow us to include the Rcpp header file. 129 | 130 | ``` 131 | Imports: Rcpp (>= 0.11.0) 132 | LinkingTo: Rcpp 133 | ``` 134 | 135 | We can try out our function by running the `load_all()` 136 | function from *devtools*. This will trigger a call to the 137 | `compileAttributes` function in *Rcpp*, but since we've already 138 | introduced `load_all`, we won't worry about `compileAttributes`. The 139 | other way to try out the new C++ function in the R package without 140 | running `load_all()` would be to run `compileAttributes()`, then build and 141 | install the package.
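To make that workflow concrete, the full cycle might look like the following from an R session in the directory containing the package (a sketch, assuming the package is named *foo* as in this example; the version number of the built file comes from your `DESCRIPTION`):

```{r eval=FALSE}
library(devtools)
# during development: regenerates the Rcpp glue code, compiles src/, and loads the package
load_all("foo")
# or, without load_all(): regenerate the glue code, then build and install
Rcpp::compileAttributes("foo")
build("foo")    # produces e.g. foo_0.0.1.tar.gz
install("foo")
```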
142 | 143 | Running `load_all()` produces the following files (here, using version 144 | 1.0.7 of *Rcpp*): 145 | 146 | * `src/RcppExports.cpp` 147 | * `R/RcppExports.R` 148 | 149 | The first file `src/RcppExports.cpp` has the following auto-generated 150 | contents (as it says at the top, these auto-generated files should not 151 | be edited by hand): 152 | 153 | ```{Rcpp, eval=FALSE} 154 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 155 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 156 | 157 | #include 158 | 159 | using namespace Rcpp; 160 | 161 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 162 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 163 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 164 | #endif 165 | 166 | // rcpp_hello_world 167 | List rcpp_hello_world(); 168 | RcppExport SEXP _foo_rcpp_hello_world() { 169 | BEGIN_RCPP 170 | Rcpp::RObject rcpp_result_gen; 171 | Rcpp::RNGScope rcpp_rngScope_gen; 172 | rcpp_result_gen = Rcpp::wrap(rcpp_hello_world()); 173 | return rcpp_result_gen; 174 | END_RCPP 175 | } 176 | 177 | static const R_CallMethodDef CallEntries[] = { 178 | {"_foo_rcpp_hello_world", (DL_FUNC) &_foo_rcpp_hello_world, 0}, 179 | {NULL, NULL, 0} 180 | }; 181 | 182 | RcppExport void R_init_foo(DllInfo *dll) { 183 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 184 | R_useDynamicSymbols(dll, FALSE); 185 | } 186 | ``` 187 | 188 | We can see this has a wrapper function for our C++ function 189 | `_foo_rcpp_hello_world`, where *foo* is the name of the package, and 190 | the rest is the name of the C++ function being exported. 191 | 192 | The second file `R/RcppExports.R` has the following contents: 193 | 194 | ```{r eval=FALSE} 195 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 196 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 197 | 198 | rcpp_hello_world <- function() { 199 | .Call('_foo_rcpp_hello_world', PACKAGE = 'foo') 200 | } 201 | ``` 202 | 203 | The second file creates a function in R which calls the C++ function 204 | we wrote. The R function is called `rcpp_hello_world`, and you can see 205 | that it is calling the wrapper function in C++, 206 | `_foo_rcpp_hello_world`. 207 | 208 | I typically do not export the R wrapper functions such as 209 | `rcpp_hello_world`, but instead put these functions inside of other R 210 | functions. This makes sense, because the wrapper function defined 211 | above is auto-generated and incredibly minimal. It makes sense to 212 | provide argument checks and messages/warnings/errors before going 213 | straight into the C++ function. Those checks and 214 | messages/warnings/errors would be appropriate to put into the 215 | following type of R function, which lives in a separate R file (not in 216 | `R/RcppExports.R`, which is re-written every time you run `load_all()` 217 | or `compileAttributes()`). 218 | 219 | ```{r eval=FALSE} 220 | #' @useDynLib foo 221 | #' @export 222 | rcpp.hello.world <- function() { 223 | rcpp_hello_world() 224 | } 225 | ``` 226 | 227 | The `@export` tag indicates that this function will be exported and 228 | visible when the *foo* package is loaded. The `@useDynLib foo` line 229 | is necessary to add to the *roxygen* block above only one function of 230 | your package (not needed twice). The `foo` here refers to the package 231 | name. As we added *roxygen* code blocks, we need to then run 232 | `document()`, and then `load_all()` again. 
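Returning to the point above about argument checks, a wrapper for a hypothetical exported C++ routine `rcpp_col_means`, which expects a numeric matrix, might look like the following (a sketch; both function names here are made up for illustration):

```{r eval=FALSE}
#' @export
col.means <- function(x) {
  # validate the input in R before handing it off to the C++ routine
  if (!is.matrix(x) || !is.numeric(x)) {
    stop("'x' must be a numeric matrix")
  }
  if (any(is.na(x))) {
    warning("'x' contains missing values")
  }
  rcpp_col_means(x)
}
```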
233 | 234 | Finally we can run our example function: 235 | 236 | ```{r echo=FALSE} 237 | load_all("foo") 238 | ``` 239 | 240 | ```{r} 241 | rcpp.hello.world() 242 | ``` 243 | 244 | # Another Rcpp example 245 | 246 | While the example above took no input and returned a list, we can also 247 | show an example where we take a numeric vector as input and return a 248 | numeric vector: 249 | 250 | ``` 251 | // [[Rcpp::export]] 252 | NumericVector add_one_sqrt(NumericVector x) { 253 | NumericVector y(x); 254 | y = sqrt(x + 1.0); 255 | return y; 256 | } 257 | ``` 258 | 259 | We then add an R wrapper function: 260 | 261 | ```{r eval=FALSE} 262 | #' @export 263 | add.one.sqrt <- function(x) { 264 | add_one_sqrt(x) 265 | } 266 | ``` 267 | 268 | We then repeat the steps above to load our package, which are to run 269 | `document()` so that the `add.one.sqrt` function is exported and the 270 | `src/RcppExports.cpp` and `R/RcppExports.R` files are updated 271 | (`document()` triggers these compilation steps as well as 272 | `load_all()`). Then we can load the package with `load_all()`. 273 | 274 | ```{r} 275 | add.one.sqrt(1:5) 276 | ``` 277 | 278 | For more details on all of the C++ functions that can be used in 279 | *Rcpp* one should check the 280 | [vignettes](https://cran.r-project.org/web/packages/Rcpp/index.html). 281 | There is typically a lot of practice and debugging when taking a bit 282 | of working code in R and translating to C++, so be persistent. 283 | 284 | # RcppArmadillo example 285 | 286 | Finally, as we mentioned in the introduction to this lecture note, 287 | there are other libraries that make *Rcpp* a compelling interface, 288 | including the [Armadillo](http://arma.sourceforge.net/) linear algebra 289 | library for C++. In the *Armadillo* documentation, a table presents 290 | [common matrix operations](http://arma.sourceforge.net/docs.html#syntax) in 291 | Matlab and the corresponding code in *Armadillo*. 292 | 293 | *Armadillo* can be run within the *Rcpp* framework using an additional 294 | R package *RcppArmadillo*. The objects are represented differently in 295 | comparison to *Rcpp*. Below the objects typed with `arma::` prefix are 296 | *Armadillo* classes of objects. *RcppArmadillo* has a different set of 297 | [vignettes](https://cran.r-project.org/web/packages/RcppArmadillo/index.html) and 298 | you should read the introduction vignette to become acquainted with 299 | the interface. This example is derived from the 300 | [fastLm.cpp](http://dirk.eddelbuettel.com/code/rcpp.armadillo.html) example 301 | from *RcppArmadillo* and uses the `const arma::mat&` typing in order 302 | to avoid making an extra copy of the matrices `X` and `Y`. 
303 | 304 | ``` 305 | #include 306 | using namespace Rcpp; 307 | 308 | // [[Rcpp::export]] 309 | arma::mat matrix_mult(const arma::mat& X, const arma::mat& Y) { 310 | int m = X.n_rows; 311 | int n = Y.n_cols; 312 | arma::mat Z(m,n); 313 | Z = X * Y; 314 | return Z; 315 | } 316 | ``` 317 | 318 | We need to update our `DESCRIPTION` file to include *RcppArmadillo* in 319 | both Imports and LinkingTo fields: 320 | 321 | ``` 322 | Imports: Rcpp (>= 0.11.0), RcppArmadillo 323 | LinkingTo: Rcpp, RcppArmadillo 324 | ``` 325 | 326 | And we write and mark for export our wrapper function: 327 | 328 | ```{r eval=FALSE} 329 | #' @export 330 | matrix.mult <- function(X,Y) { 331 | matrix_mult(X,Y) 332 | } 333 | ``` 334 | 335 | Then running `document()` and `load_all()` we can try out our new 336 | function: 337 | 338 | ```{r} 339 | m1 <- matrix(1:12,ncol=4) 340 | m2 <- matrix(13:24,nrow=4) 341 | matrix.mult(m1, m2) 342 | ``` 343 | 344 | -------------------------------------------------------------------------------- /rpkg/rpkg_HW1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 1 - readable and efficient R code" 3 | author: "your name here" 4 | date: "`r format(Sys.time(), '%m/%d/%Y')`" 5 | output: html_document 6 | --- 7 | 8 | # Question 1 - "function-alize" this code 9 | 10 | Read over the code below and perform the following: 11 | 12 | * Wrap it into a function `foobar0` which has arguments `x` and `z` 13 | and which returns the vector `x` at the end of the following code. 14 | * Rewrite this into a function `foobar` which is easier to read, by 15 | reducing repetitive code. E.g. `foobar` might call a function to 16 | check the input, and another function to perform the three lines of 17 | computation. 18 | * Check that the two versions produce the same output using the 19 | function `all.equal`. 20 | 21 | ```{r} 22 | set.seed(1) 23 | x <- rnorm(100) 24 | z <- rnorm(100) 25 | if (sum(x >= .001) < 1) { 26 | stop("step 1 requires 1 observation(s) with value >= .001") 27 | } 28 | fit <- lm(x ~ z) 29 | r <- fit$residuals 30 | x <- sin(r) + .01 31 | if (sum(x >= .002) < 2) { 32 | stop("step 2 requires 2 observation(s) with value >= .002") 33 | } 34 | fit <- lm(x ~ z) 35 | r <- fit$residuals 36 | x <- 2 * sin(r) + .02 37 | if (sum(x >= .003) < 3) { 38 | stop("step 3 requires 3 observation(s) with value >= .003") 39 | } 40 | fit <- lm(x ~ z) 41 | r <- fit$residuals 42 | x <- 3 * sin(r) + .03 43 | if (sum(x >= .004) < 4) { 44 | stop("step 4 requires 4 observation(s) with value >= .004") 45 | } 46 | fit <- lm(x ~ z) 47 | r <- fit$residuals 48 | x <- 4 * sin(r) + .04 49 | x 50 | ``` 51 | 52 | # Question 2 - vectorize this code and benchmark 53 | 54 | * Take the following function `f0` and rewrite it as a function `f`, 55 | which is faster and easier to read, by removing the loop of `i` from 56 | 1 to `m`. 57 | * Benchmark `f` and `f0` using `microbenchmark`. How much faster is `f`? 
58 | 59 | ```{r} 60 | n <- 30 61 | p <- 50 62 | p2 <- 25 63 | m <- 1000 64 | set.seed(1) 65 | x <- matrix(rnorm(n*p),nrow=n,ncol=p) 66 | b <- matrix(rnorm(m*p),nrow=m,ncol=p) 67 | a <- matrix(rnorm(m*p2),nrow=m,ncol=p2) 68 | f0 <- function(x,b,a) { 69 | out <- numeric(0) 70 | for (i in seq_len(m)) { 71 | bb <- b[i,] 72 | aa <- a[i,] 73 | out <- c(out, sum(x %*% bb) + sum(aa)) 74 | } 75 | out 76 | } 77 | ``` 78 | 79 | # Question 3 - build a faster t-test 80 | 81 | * Rewrite the following function `getT0` which computes `m` 82 | two-sample t-tests (equal variance) between two groups as a function 83 | `getT`, which is faster by using vectorized operations over the `m` 84 | sets of observations. (There are functions in R packages, such as 85 | `genefilter::rowttests` which will quickly perform this operation, 86 | but I want you to write your own function using simple R functions 87 | like `rowSums`, etc.) 88 | * Benchmark `getT` and `getT0`. How much faster is `getT`? 89 | 90 | ```{r} 91 | m <- 400 92 | n <- 50 93 | little.n <- n/2 94 | set.seed(1) 95 | x <- matrix(rnorm(m*n),nrow=m,ncol=n) 96 | f <- gl(2,little.n) 97 | getT0 <- function(x, f) { 98 | ts <- sapply(seq_len(m), function(i) t.test(x[i,] ~ f, var.equal=TRUE)$statistic) 99 | unname(ts) 100 | } 101 | ``` 102 | -------------------------------------------------------------------------------- /rpkg/rpkg_HW2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 2 - building an R package" 3 | author: "your name here" 4 | date: "`r format(Sys.time(), '%m/%d/%Y')`" 5 | output: html_document 6 | --- 7 | 8 | # Question 1 - build and document an R package 9 | 10 | Take the `getT` function that you wrote for last weeks homework and 11 | put it into a R package called `bios735`. You should add this to 12 | your homework repository, so it looks like the directory structure 13 | drawn below. You will re-use `bios735` across multiple homeworks. 14 | 15 | ``` 16 | statcomp-yourgithub 17 | |-- rpkg_HW1.html 18 | |-- rpkg_HW1.Rmd 19 | |-- rpkg_HW2.html 20 | |-- rpkg_HW2.Rmd 21 | |-- bios735 22 | |-- DESCRIPTION 23 | ... 24 | ``` 25 | 26 | Add the function `getT` to `bios735` and document its two arguments 27 | `x` and `f`, provide information about the output, add a description 28 | and details. Export the `getT` function. So in the end, you should be 29 | adding `.Rd` files to the `man` directory through the use of 30 | `devtools::document`. You should also update the `DESCRIPTION` file 31 | with your details. You can put whatever you like as the `Title` of 32 | your package and so on. Finally, build the package. Add the `.tar.gz` 33 | built package to your homework repository as well. 34 | 35 | # Question 2 - add tests to your package 36 | 37 | Use the *testthat* package to add tests to your package. The details 38 | of the tests are up to you, but you should have both some tests of the 39 | input to `getT`, for example, that `x` and `f` have consistent 40 | dimension. Also, the way we wrote `getT` assumes that there is equal 41 | sample size between the two levels of `f`: don't bother making the 42 | function more general, but instead make sure that `f` indeed is a 43 | factor with two levels and that both levels have `little.n` number of 44 | samples. You should also have some tests that the function gives the 45 | expected output on a toy dataset. 
You can for example, simulate a few 46 | rows of data, and then check that `getT` gives the same answer as 47 | `t.test` with `var.equal=TRUE`. Remember to add the tests directory to 48 | your GitHub repository so that the graders can see it. You can try out 49 | your tests with `test_package` or `test_file`. 50 | 51 | Finally, run `check(manual=TRUE)` from within your package. Make sure 52 | that it passes without error, including `checking tests ...` and put 53 | the output below: 54 | 55 | ``` 56 | #PUT THE OUTPUT OF CHECK HERE: 57 | > check(manual=TRUE) 58 | ... 59 | ``` 60 | 61 | # Question 3 - short debugging example 62 | 63 | The last question is a short one. You should (temporarily) add a bug 64 | to `getT`, which can be a either a bug that produces an error or a 65 | silent bug which creates some kind of erroneous output. Then use one 66 | of the debugging methods presented in the lecture to get to this point 67 | in the code when calling the function, and examine the variables at 68 | that point in the code, e.g. `ls()` and using `head`. Take a 69 | screenshot of the debugging environment and add this screenshot to 70 | your GitHub repository. 71 | 72 | -------------------------------------------------------------------------------- /rpkg/rpkg_HW3.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 3 - C++ code from R" 3 | author: "your name here" 4 | date: "`r format(Sys.time(), '%m/%d/%Y')`" 5 | output: html_document 6 | --- 7 | 8 | # Setup and hints 9 | 10 | For all of the following functions, you should add these functions to 11 | your R package, by adding code to the `src` directory, adding the 12 | necessary importing and linking to lines to `DESCRIPTION`, and then 13 | writing and exporting a wrapper R function (e.g. adding a `@export` 14 | tag). 15 | 16 | Remember to include the following at the top of your C++ script (you 17 | do not include `Rcpp.h` because `RcppArmadillo.h` includes this). As 18 | the homework uses RcppArmadillo, the following covers all questions: 19 | 20 | ``` 21 | #include 22 | using namespace Rcpp; 23 | ``` 24 | 25 | Remember that Armadillo uses 0-based indexing. This is true for square 26 | bracket indexing `z[0]` or `Z[0,0]` as well as using the row and 27 | column getters: `z.row(0)` and `z.col(0)`. 28 | 29 | # Question 1 - simple Rcpp function 30 | 31 | Write a simple Rcpp function that implements the following R code: 32 | 33 | `ifelse(x < 0.0, 1.0, exp(x))` 34 | 35 | Compare it's output in R. Your R wrapper function should be named 36 | `one.or.exp()`. 37 | 38 | ```{r} 39 | library(devtools) 40 | document("bios735") 41 | load_all("bios735") 42 | x <- -10:10 43 | one.or.exp(x) 44 | all.equal(ifelse(x < 0.0,1.0,exp(x)), one.or.exp(x)) 45 | ``` 46 | 47 | # Question 2 - random walk in C++ and vectorized R 48 | 49 | Write a random walk function in C++, call the wrapper function for 50 | your C++ function, `randomWalk2`. Your C++ function should follow the 51 | same style as the function below, by iterating over 1-niter, despite 52 | the fact that the random walk can be vectorized in R as a cumulative 53 | sum. You will later compare to the vectorized random walk in R. 54 | 55 | Hint: you should use the C++ function `Rf_rbinom(integer size, numeric 56 | probability)` function to flip the coin in the random walk. 57 | 58 | Compare the output of `randomWalk2` to ensure that it produces the 59 | same output as `randomWalk1`, with the same seed set in R. 
How much 60 | faster is your random walk? 61 | 62 | ```{r} 63 | niter <- 1e4 64 | lambda <- .01 65 | randomWalk1 <- function(niter,lambda) { 66 | x <- numeric(niter) 67 | y <- numeric(niter) 68 | for (i in seq_len(niter)[-1]) { 69 | x[i] <- x[i-1] + lambda * (2.0 * rbinom(1,1,0.5) - 1.0) 70 | } 71 | for (i in seq_len(niter)[-1]) { 72 | y[i] <- y[i-1] + lambda * (2.0 * rbinom(1,1,0.5) - 1.0) 73 | } 74 | list(x=x,y=y) 75 | } 76 | myplot <- function(dat) { 77 | niter <- length(dat$x) 78 | plot(0,type="n",xlim=c(-2,2),ylim=c(-2,2),xlab="x",ylab="y") 79 | cols <- colorRampPalette(c("blue","red","orange"))(100) 80 | with(dat, segments(x[-niter],y[-niter],x[-1],y[-1],col=rep(cols,each=niter/100))) 81 | } 82 | set.seed(5) 83 | dat1 <- randomWalk1(niter,lambda) 84 | str(dat1) 85 | myplot(dat1) 86 | ``` 87 | 88 | Your random walk: 89 | 90 | ```{r} 91 | document("bios375") 92 | load_all("bios375") 93 | set.seed(5) 94 | dat2 <- randomWalk2(niter,lambda) 95 | str(dat2) 96 | myplot(dat2) 97 | all.equal(dat2$x, dat1$x) 98 | all.equal(dat2$y, dat1$y) 99 | library(microbenchmark) 100 | microbenchmark(randomWalk1(niter,lambda),randomWalk2(niter,lambda),times=10) 101 | ``` 102 | 103 | Now write a vectorized version of the random walk in R and call this 104 | function `randomWalkVectorized`. Compare to ensure it gives the same 105 | result at `randomWalk1`, and compare its speed to `randomWalk1` and to 106 | `randomWalk2`: 107 | 108 | ```{r} 109 | randomWalkVectorized <- function(niter,lambda) { 110 | # your code here... 111 | } 112 | set.seed(5) 113 | datVec <- randomWalkVectorized(niter,lambda) 114 | str(datVec) 115 | myplot(datVec) 116 | all.equal(datVec$x, dat1$x) 117 | all.equal(datVec$y, dat1$y) 118 | library(microbenchmark) 119 | microbenchmark(randomWalk1(niter,lambda),randomWalkVectorized(niter,lambda),times=10) 120 | microbenchmark(randomWalk2(niter,lambda),randomWalkVectorized(niter,lambda),times=10) 121 | ``` 122 | 123 | # Question 3 - simple RcppArmadillo function 124 | 125 | Write a simple RcppArmadillo function that solves for `x` in the 126 | matrix multiplication formula $Ax = b$. Call your R wrapper function 127 | `armadilloSolve`. You can skip the part about pointing to the matrix 128 | in R with `const arma::mat& A` for this and the following homework 129 | question. That is, the top line of your function can look like: 130 | 131 | ``` 132 | arma::mat armadillo_solve(arma::mat A, arma::vec b) { 133 | ``` 134 | 135 | Check that your function gives the correct answer here: 136 | 137 | ```{r} 138 | document("bios735") 139 | load_all("bios735") 140 | A <- matrix(runif(12),ncol=3) 141 | x <- matrix(runif(3),ncol=1) 142 | b <- A %*% x 143 | xx <- armadilloSolve(A,b) 144 | all.equal(x, xx) 145 | ``` 146 | 147 | # Question 4 - column-wise ridge regression 148 | 149 | Implement a C++ function that performs ridge regression one-by-one on 150 | columns of a matrix `Y`, using a constant design matrix `X`, and a 151 | variable ridge parameter `lambda`. That is, use the first element of 152 | `lambda` for the first column of `Y` and so on. Call your R wrapper 153 | function `colRidge2`. Again, you can skip the part about pointing from 154 | C++ to the matrix in R and just have the top of your C++ function look 155 | like, e.g. 156 | 157 | ``` 158 | arma::mat col_ridge_2(arma::mat Y, arma::mat X, arma::vec lambda) { 159 | ``` 160 | 161 | Compare the output of `colRidge2` to `colRidge1`. It is not expected to 162 | be numerically identical, but it should be "close" which you can see 163 | from the plots. 
How much faster is the C++ version? 164 | 165 | Hint: for your C++ function use the simple ridge formula: 166 | 167 | $\hat{\beta} = (X^t X + \lambda I)^{-1} X^t y$ 168 | 169 | Hint: see the `eye()` function in Armadillo to build the identity matrix. 170 | 171 | ```{r} 172 | document("bios735") 173 | load_all("bios735") 174 | set.seed(1) 175 | n <- 100 176 | Y <- matrix(rnorm(n*20),nrow=20) 177 | X <- scale(matrix(rnorm(20*2),ncol=2)) 178 | lambda <- runif(n,.1,2) 179 | library(MASS) 180 | colRidge1 <- function(Y, X, lambda) { 181 | df <- as.data.frame(X) 182 | n <- ncol(Y) 183 | beta <- matrix(nrow=2,ncol=n) 184 | stopifnot(length(lambda) == n) 185 | for (j in seq_len(n)) { 186 | beta[,j] <- coef(lm.ridge(Y[,j] ~ 0 + V1 + V2, data=df, lambda=lambda[j])) 187 | } 188 | beta 189 | } 190 | beta1 <- colRidge1(Y, X, lambda) 191 | beta1[,1:5] 192 | beta2 <- colRidge2(Y, X, lambda) 193 | beta2[,1:5] 194 | plot(beta1[1,], beta2[1,]) 195 | abline(0,1) 196 | plot(beta1[2,], beta2[2,]) 197 | abline(0,1) 198 | all.equal(beta1[1,], beta2[1,]) 199 | all.equal(beta1[2,], beta2[2,]) 200 | microbenchmark(colRidge1(Y, X, lambda), colRidge2(Y, X, lambda), times=10) 201 | ``` 202 | -------------------------------------------------------------------------------- /rpkg/s_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biodatascience/statcomp_src/558195d8bb181b582263678914107f12872a4443/rpkg/s_diagram.png -------------------------------------------------------------------------------- /rpkg/test.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Writing package tests" 3 | author: "Michael Love" 4 | date: 10/25/2018 5 | output: 6 | html_document: 7 | toc: true 8 | toc_float: true 9 | --- 10 | 11 | # Why write tests? 12 | 13 | Package tests are a simple way to make sure that the statistical 14 | software you have written does what you expect, both when you run it 15 | on typical and atypical input. I also tend to use package tests when 16 | implementing new features in my software packages, as a way to check 17 | to see that the new functionality works as I expect it. 18 | 19 | There are multiple frameworks for writing package tests, but we will 20 | focus on the framework that I find the most straightforward, which is 21 | implemented in the *testthat* package. 22 | 23 | A reference for writing tests with *testthat* can be found at the [R 24 | Packages](http://r-pkgs.had.co.nz/tests.html) book by Hadley Wickham. 25 | 26 | # Set up testthat for a package 27 | 28 | To begin writing tests, say for a part of your software that you call 29 | "name", you can run the *usethis* function `use_test("name")`. This 30 | will create a directory called `tests/testthat` in the root of your R 31 | package directory, add *testthat* to your `Suggests:` line in the 32 | `DESCRIPTION` file, create a file `tests/testthat.R` that will run 33 | all the tests in `tests/testthat` when you run R's package check, and 34 | create a file `tests/testthat/test-name.R`. You may have multiple 35 | groups of tests that you want to separate into different files, so you 36 | can choose "name" however you like, e.g. `test-data-input.R`, 37 | `test-normalization.R`, etc. However, you can also put all your tests 38 | into a single file for the package, e.g. `test-foo.R`. 39 | 40 | The `testthat.R` file is very simple: 41 | 42 | ``` 43 | # This file is part of the standard setup for testthat. 
44 | # It is recommended that you do not modify it. 45 | # 46 | # Where should you do additional test configuration? 47 | # Learn more about the roles of various files in: 48 | # * https://r-pkgs.org/tests.html 49 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 50 | 51 | library(testthat) 52 | library(foo) 53 | 54 | test_check("foo") 55 | ``` 56 | 57 | This file stays the same way, and we will write new `.R` files that go 58 | into `tests/testthat` which will implement the package tests. 59 | 60 | Suppose we run `use_test("add")` for our *foo* package, and we want to 61 | write a test for our `add` function (make sure the *usethis* pakage is loaded). We can do this by opening up the 62 | file `tests/testthat/test-add.R`, and adding some tests. The default 63 | file has some dummy code to show you the style: 64 | 65 | ``` 66 | test_that("multiplication works", { 67 | expect_equal(2 * 2, 4) 68 | }) 69 | ``` 70 | 71 | But we can rewrite this for our purposes: 72 | 73 | ``` 74 | test_that("add works on two vectors", { 75 | 76 | expect_equal(add(1:5,6:10), c(7,9,11,13,15)) 77 | 78 | }) 79 | 80 | test_that("simple errors for bad input", { 81 | 82 | expect_error(add()) 83 | expect_error(add(1:5)) 84 | expect_error(add(1:5,6:10,"yes")) 85 | 86 | }) 87 | ``` 88 | 89 | There are many possible tests that one can write, with the workhorses 90 | probably being `expect_equal` and `expect_true`. We can also specify a 91 | numerical tolerance (absolute or relative) for equality, as shown in 92 | the Examples in `?expect_equal`. In order to see a list of all the 93 | `expect_` functions available in *testthat*, one can run the following 94 | command in R: 95 | 96 | ```{r eval=FALSE} 97 | help(package="testthat", help_type="html") 98 | ``` 99 | 100 | # Messages, warnings, and errors 101 | 102 | We can also check that specific messages, warnings, or errors are 103 | output for given input to our function. These three levels of output 104 | `message` the user relevant information, provide a `warning` to the 105 | user about potential problems, or `stop` the function from providing 106 | any output. 107 | 108 | If we wanted the `add` function to warn the user about negative values 109 | as output (just a trivial example), we could write: 110 | 111 | ```{r} 112 | add2 <- function(x,y,negative=FALSE) { 113 | z <- x + y 114 | if (negative) { 115 | z <- -1 * z 116 | } 117 | if (any(z < 0)) { 118 | warning("some output values are negative") 119 | } 120 | z 121 | } 122 | ``` 123 | 124 | We could then test this by saying we expect a specific warning. Note 125 | that the entire warning doesn't need to be written out, only a regular 126 | expression that would produce a match. 127 | 128 | ```{r} 129 | library(testthat) 130 | expect_warning(add2(1:5, -11:-15), "are negative") 131 | ``` 132 | 133 | If we wanted to test for a message or error, we would use 134 | `expect_message` or `expect_error` with the `message` or `stop` 135 | function respectively. 
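For example, a variant of the toy function above (call it `add3`; this variant is only for illustration) that emits a message and stops on inputs of unequal length could be tested as follows:

```{r}
add3 <- function(x,y) {
  if (length(x) != length(y)) {
    stop("'x' and 'y' must have the same length")
  }
  message("adding vectors of length ", length(x))
  x + y
}
expect_message(add3(1:5, 6:10), "adding vectors")
expect_error(add3(1:5, 1:3), "same length")
```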
136 | 137 | # Testing files or packages 138 | 139 | We can check all the tests for individual files with the following 140 | call to `test_file`, from within the package root: 141 | 142 | ```{r eval=FALSE} 143 | library(devtools) 144 | load_all() 145 | test_file("tests/testthat/test-add.R") 146 | ``` 147 | 148 | Or we can check all of the tests for a given package with the 149 | following call to `test_package`: 150 | 151 | ```{r eval=FALSE} 152 | test_package("foo") 153 | ``` 154 | 155 | # Session info 156 | 157 | ```{r} 158 | library(devtools) 159 | session_info() 160 | ``` 161 | --------------------------------------------------------------------------------