├── .gitignore ├── README.md ├── homework ├── HW1 │ ├── hw1-problems.Rmd │ ├── hw1-problems.html │ ├── hw1-solution.Rmd │ └── hw1-solution.html ├── HW2 │ ├── HW2-solution.Rmd │ └── HW2.Rmd ├── HW3 │ ├── HW3-solution.Rmd │ └── HW3.Rmd ├── HW4 │ ├── HW4-problems.Rmd │ └── HW4-solution.Rmd ├── HW5 │ ├── HW5-solution.Rmd │ └── HW5.Rmd └── HW6 │ ├── HW6-solution.Rmd │ └── HW6.Rmd └── lectures ├── R ├── .Rhistory ├── console-screen-shot.png ├── data-import.Rmd ├── fixheight.R ├── intro-to-R.Rmd └── screenshot.png ├── databases ├── pics │ ├── RM.png │ ├── all.png │ ├── one_table.png │ ├── subset.png │ └── two_tables.png └── relational-databases.Rmd ├── dataviz ├── data-visualization-ggplot2.Rmd ├── intro-to-eda.Rmd ├── pics │ ├── .DS_Store │ ├── DLP_slide.jpg │ ├── DLP_slide.pdf │ ├── Moneyball_Poster.jpg │ ├── ad.png │ ├── classwebpage.png │ ├── depodesta.jpg │ ├── meonbike.jpg │ ├── nythist.gif │ ├── nytimesvotingpattern.jpg │ ├── nytimesvotingpattern.png │ └── tweetsproductplacement.gif ├── shiny │ ├── demos │ │ ├── 06-basic-app.R │ │ ├── 09-layers.R │ │ └── www │ │ │ └── shiny.png │ ├── exercises │ │ ├── 01-template.R │ │ ├── 02-slider.R │ │ ├── 03-plotOutput.R │ │ ├── 04-renderPlot.R │ │ ├── 05-reactivity.R │ │ ├── 07-vocab │ │ ├── 08-render.R │ │ ├── 09-reactive.R │ │ ├── 10-eventReactive.R │ │ ├── 11-reactiveValues.R │ │ ├── 17-layout.R │ │ ├── 18-panels.R │ │ ├── 19-navbarPage.R │ │ └── 20-tags │ │ │ ├── app.R │ │ │ └── www │ │ │ └── shiny.png │ └── pdfs │ │ ├── 01-Intro.pdf │ │ └── 02-Reactivity-and-UI.pdf └── shiny_section │ ├── map │ ├── app.R │ ├── data │ │ └── counties.rds │ └── helpers.R │ ├── maps.r │ ├── smoother │ └── app.R │ └── smoothing.r ├── git-and-github ├── images │ ├── git_add.png │ ├── git_clone.png │ ├── git_commit.png │ ├── git_fetch.png │ ├── git_layout.png │ ├── git_merge.png │ ├── git_push.png │ ├── git_status.png │ ├── gitclean.png │ ├── gitclone.png │ ├── gitcommit.png │ ├── github-https-clone.png │ ├── github-ssh-clone.png │ ├── github.png │ ├── github_ssh.png │ ├── gitpush.png │ ├── gitstaged.png │ ├── gituntracked.png │ ├── mac-git-security.png │ ├── sshkeygen.png │ ├── wgi-defaultlines.png │ ├── wgi-git-bash.png │ ├── wgi-scarymessage.png │ └── wgi-usemintty.png ├── setting-up-git.Rmd ├── submitting-HW-using-git.Rmd └── version-control.Rmd ├── inference ├── aggregators.Rmd ├── inference.Rmd ├── inference.pdf ├── models-no-solutions.Rmd ├── models.pdf ├── pics │ ├── DLP_slide.jpg │ ├── DLP_slide.pdf │ ├── Moneyball_Poster.jpg │ ├── depodesta.jpg │ ├── jar-of-beads.jpg │ ├── meonbike.jpg │ ├── nate-silver-1.png │ ├── nate-silver-2.png │ ├── nythist.gif │ ├── nytimesvotingpattern.jpg │ ├── tweetsproductplacement.gif │ └── wranging_data_with_tidy.Rmd ├── predict_elections-with-solutions.Rmd ├── predict_elections.Rmd └── probability.Rmd ├── ml ├── RafaClass_Ensembles_Rose.pdf ├── cross-validation.pdf ├── cv.Rmd ├── decision-trees.Rmd ├── dimension-reduction.Rmd ├── intro-ml.Rmd ├── lda.Rmd ├── matrices.Rmd ├── regularization.Rmd └── smoothing.Rmd ├── models ├── bayes.Rpres ├── bayes.gif ├── confounding.Rmd └── models.Rmd ├── regression ├── baseball.Rmd ├── regression-broom.Rmd ├── regression-in-practice.Rmd └── regression.Rmd └── wrangling ├── data-wrangling-with-dplyr.Rmd ├── data-wrangling-with-tidyr.Rmd ├── data-wrangling.Rmd ├── httr_twitter_and_text.Rmd ├── pics ├── dplyr_binding.png ├── dplyr_filtering_joins.png ├── dplyr_mutating_joins.png ├── dplyr_set_operations.png ├── dplyr_two_tables.png ├── stocks-by-company.png ├── stocks-by-time.png └── 
stocks-tidy.png └── rvest-scraping.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *.Rhistory 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archived Spring 2016 BIO 260 Introduction to Data Science course 2 | 3 | Spring 2016 course page here: [http://datasciencelabs.github.io/2016](http://datasciencelabs.github.io/2016) 4 | -------------------------------------------------------------------------------- /homework/HW1/hw1-problems.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 1: Insights on Poverty" 3 | date: "February 4, 2016" 4 | output: html_document 5 | --- 6 | 7 | **This homework is due Sunday February 14, 2016 at 11:59 PM. When complete, submit your code in the R Markdown file and the knitted HTML file on Canvas.** 8 | 9 | 10 | # Background 11 | 12 | This HW is based on Hans Rosling's talks [New Insights on Poverty](https://www.ted.com/talks/hans_rosling_reveals_new_insights_on_poverty?language=en) and [The Best Stats You've Ever Seen](https://www.ted.com/talks/hans_rosling_shows_the_best_stats_you_ve_ever_seen). 13 | 14 | The assignment uses data to answer specific questions about global health and economics. The data contradicts commonly held preconceived notions. For example, Hans Rosling starts his talk by asking (paraphrased): "for each of the five pairs of countries below, which country do you think had the highest child mortality in 2015?" 15 | 16 | 1. Sri Lanka or Turkey 17 | 2. Poland or South Korea 18 | 3. Malaysia or Russia 19 | 4. Pakistan or Vietnam 20 | 5. Thailand or South Africa 21 | 22 | Most people get them wrong. Why is this? In part it is due to our preconceived notion that the world is divided into two groups: the 23 | _Western world_ versus the _third world_, characterized by "long life, small family" and "short life, large family" respectively. In this homework we will use data visualization to gain insights on this topic. 24 | 25 | 26 | # Problem 1 27 | 28 | The first step in our analysis is to download and organize the data. The necessary data to answer these questions is available on the [gapminder](http://www.gapminder.org/data/) website. 29 | 30 | ## Problem 1.1 31 | 32 | We will use the following datasets: 33 | 34 | 1. [Childhood mortality](http://spreadsheets.google.com/pub?key=0ArfEDsV3bBwCcGhBd2NOQVZ1eWowNVpSNjl1c3lRSWc&output=csv) 35 | 2. [Life expectancy](http://spreadsheets.google.com/pub?key=phAwcNAVuyj2tPLxKvvnNPA&output=csv) 36 | 3. [Fertility](http://spreadsheets.google.com/pub?key=phAwcNAVuyj0TAlJeCEzcGQ&output=csv) 37 | 4. [Population](http://spreadsheets.google.com/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&output=csv) 38 | 5. [Total GDP](http://spreadsheets.google.com/pub?key=pyj6tScZqmEfI4sLVvEQtHw&output=csv) 39 | 40 | Create five `tbl_df` table objects, one for each of the tables provided in the above files. Hints: Use the `read_csv` function. Because these are only temporary objects, give them short names. 41 | ```{r,include=FALSE} 42 | # Put your code here. 43 | ``` 44 | 45 | 46 | ## Problem 1.2 47 | 48 | Write a function called `my_func` that takes a table as an argument and returns the name of its first column. For each of the five tables, what is the name of the column containing the country names? Print out the tables or look at them with `View` to determine the column.
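Purely as an illustration of the base function involved (the helper name and the use of the built-in `mtcars` table are hypothetical stand-ins, not part of the assignment):

```{r}
# a table's column names are returned by names(); the first one is names(tab)[1]
first_column_name <- function(tab) names(tab)[1]
first_column_name(mtcars) # "mpg"
```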
49 | 50 | ```{r} 51 | # Your code goes here. 52 | ``` 53 | 54 | ## Problem 1.3 55 | 56 | In the previous problem we noted that gapminder is inconsistent in naming its country column. Fix this by assigning a common name to this column in the various tables. 57 | 58 | ```{r} 59 | # Your code goes here. 60 | ``` 61 | 62 | ## Problem 1.4 63 | 64 | Notice that in these tables, years are represented by columns. We want to create a tidy dataset in which each row is a unit or observation and our 5 values of interest, including the year for that unit, are in the columns. The unit here is a country/year pair and each unit gets one row: 65 | 66 | ```{r} 67 | # Your code goes here. 68 | ``` 69 | 70 | We call this the _long_ format. Use the `gather` function from the `tidyr` package to create a new table for childhood mortality using the long format. Call the new columns `year` and `child_mortality`. 71 | 72 | ```{r} 73 | # Your code goes here. 74 | ``` 75 | 76 | Now redefine the remaining tables in this way. 77 | 78 | ```{r} 79 | # Your code goes here. 80 | ``` 81 | 82 | 83 | ## Problem 1.5 84 | 85 | Now we want to join all these files together. Make one consolidated table containing all the columns. 86 | 87 | ```{r} 88 | # Your code goes here. 89 | ``` 90 | 91 | ## Problem 1.6 92 | 93 | Add a column to the consolidated table containing the continent for each country. Hints: We have created a file that maps countries to continents [here](https://github.com/datasciencelabs/data/blob/master/homework_data/continent-info.tsv). Learn to use the `left_join` function. 94 | 95 | ```{r} 96 | # Your code goes here. 97 | ``` 98 | 99 | # Problem 2 100 | 101 | Report the child mortality rate in 2015 for these 5 pairs: 102 | 103 | 1. Sri Lanka or Turkey 104 | 2. Poland or South Korea 105 | 3. Malaysia or Russia 106 | 4. Pakistan or Vietnam 107 | 5. Thailand or South Africa 108 | 109 | ```{r} 110 | # Your code goes here. 111 | ``` 112 | 113 | # Problem 3 114 | 115 | To examine if in fact there was a long-life-in-a-small-family and short-life-in-a-large-family dichotomy, we will visualize the average number of children per family (fertility) and the life expectancy for each country. 116 | 117 | ## Problem 3.1 118 | 119 | Use `ggplot2` to create a plot of life expectancy versus fertility for 1962 for Africa, Asia, Europe, and the Americas. Use color to denote continent and point size to denote population size: 120 | 121 | ```{r} 122 | # Your code goes here. 123 | ``` 124 | 125 | Do you see a dichotomy? Explain. 126 | 127 | ## Problem 3.2 128 | 129 | Now we will annotate the plot to show different types of countries. 130 | 131 | Learn about the OECD and OPEC. Add a couple of columns to your consolidated table containing logical vectors that tell whether a country is in the OECD and in OPEC, respectively. It is OK to base membership on 2015. 132 | 133 | ```{r} 134 | # Your code goes here. 135 | ``` 136 | 137 | ## Problem 3.3 138 | 139 | Make the same plot as in Problem 3.1, but this time use color to annotate the OECD countries and OPEC countries. For countries that are not part of these two organizations, annotate whether they are from Africa, Asia, or the Americas. 140 | 141 | ```{r} 142 | # Your code goes here. 143 | ``` 144 | 145 | How would you describe the dichotomy? 146 | 147 | 148 | ## Problem 3.4 149 | 150 | Explore how this figure changes across time. Show us 4 versions of the figure that demonstrate this change. 151 | 152 | ```{r} 153 | # Your code goes here.
154 | ``` 155 | 156 | Would you say that the same dichotomy exists today? Explain: 157 | 158 | ## Problem 3.5 (Optional) 159 | 160 | Make an animation with the `gganimate` package. 161 | 162 | ```{r, eval=FALSE} 163 | # Your code goes here. 164 | ``` 165 | 166 | 167 | # Problem 4 168 | Having time as a third dimension made it somewhat difficult to see specific country trends. Let's now focus on specific countries. 169 | 170 | ## Problem 4.1 171 | Let's compare France and its former colony Tunisia. Make a plot of fertility versus year with color denoting the country. Do the same for life expectancy. How does Tunisia's improvement over the past 60 years compare to France's? Hint: use `geom_line`. 172 | 173 | ```{r} 174 | # Put your code here. 175 | ``` 176 | 177 | ## Problem 4.2 178 | 179 | Do the same, but this time compare Vietnam to the OECD countries. 180 | 181 | ```{r} 182 | # Put your code here. 183 | ``` 184 | 185 | 186 | # Problem 5 187 | 188 | We are now going to examine GDP per capita per day. 189 | 190 | ## Problem 5.1 191 | 192 | Create a smooth density estimate of the distribution of GDP per capita per day across countries in 1970. Include OECD, OPEC, Asia, Africa, and the Americas in the computation. When doing this we want to weigh countries with larger populations more. We can do this using the `weight` argument in `geom_density`. 193 | 194 | ```{r,warning=FALSE} 195 | # Your code goes here. 196 | ``` 197 | 198 | ## Problem 5.2 199 | 200 | Now do the same but show each of the five groups separately. 201 | 202 | ```{r,warning=FALSE} 203 | # Your code goes here. 204 | ``` 205 | 206 | 207 | ## Problem 5.3 208 | 209 | Visualize these densities for several years. Show a couple of them. Summarize how the distribution has changed through the years. 210 | 211 | ```{r,warning=FALSE} 212 | # Put your code here. 213 | ``` 214 | -------------------------------------------------------------------------------- /homework/HW2/HW2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 2: The Big Short" 3 | output: html_document 4 | --- 5 | 6 | **This homework is due Friday February 26, 2016 at 5:00 PM. When complete, submit your code in the R Markdown file and the knitted HTML via GitHub.** 7 | 8 | # Background 9 | 10 | This homework is motivated by circumstances surrounding the [financial crisis of 2007-2008](https://en.wikipedia.org/wiki/Financial_crisis_of_2007%E2%80%9308). We titled the homework _The Big Short_, after the book on the same topic that was recently made into a movie. 11 | 12 | Part of what caused the financial crisis was that the risk of certain [securities](https://en.wikipedia.org/wiki/Security_(finance)) sold by financial institutions was underestimated. Specifically, 13 | the risk of mortgage-backed securities (MBS) and collateralized debt obligations (CDO), the price of which depends on homeowners making their monthly payments, was grossly underestimated. A combination of factors resulted in many more defaults than were expected. This resulted in a crash of the prices of these securities. As a consequence, banks lost so much money that they needed bailouts to avoid default. 14 | 15 | Here we present a **very** simplified version of what happened with some of these securities. Hopefully it will help you understand how a wrong assumption about the statistical behavior of events can lead to substantial differences between what the model predicts and what actually happens.
Specifically, we will see how using an independence assumption can result in misleading conclusions. Before we start with the specific application we ask you about a simple casino game. 16 | 17 | # Problem 1 18 | 19 | In the game of [roulette](https://en.wikipedia.org/wiki/Roulette) 20 | you can bet on several things including black or red. On this bet, if you win, you double your earnings. How does the casino make money on this then? If you look at the [possibilities](http://www.math.uah.edu/stat/games/Roulette.png) 21 | you realize that the chances of red and black are both slightly less than 1/2. There are two green spots, so the probability of landing on black (or red) is actually 18/38, or 9/19. 22 | 23 | 24 | ## Problem 1A 25 | 26 | Let's make a quick sampling model for this simple version of roulette. You are going to bet a dollar each time you play and always bet on black. Make a box model for this process using the `sample` function. Write a function `get_outcome` that takes as an argument the number of times you play, $N$, and returns your earnings $S_N$. 27 | 28 | ```{r} 29 | ##Your code here 30 | ``` 31 | 32 | ## Problem 1B 33 | 34 | Use Monte Carlo simulation to study the distribution of total earnings $S_N$ for $N=10,25,100,1000$. That is, study the distribution of earnings for different numbers of plays. What are the distributions of these random variables? How do the expected values and standard errors change with $N$? Then do the same thing for the average winnings $S_N/N$. What result that you learned in class predicts this? 35 | 36 | ```{r} 37 | ##Your code here 38 | ``` 39 | 40 | Your answer here. 41 | 42 | 43 | ## Problem 1C 44 | 45 | What is the expected value of our sampling model? What is the standard deviation of our sampling model? 46 | 47 | Your answer here. 48 | 49 | ## Problem 1D 50 | 51 | Use the CLT to approximate the probability that the casino loses money when you play 25 times. Then use a Monte Carlo simulation to confirm. 52 | 53 | ```{r} 54 | ##Your code here 55 | ``` 56 | 57 | 58 | 59 | ## Problem 1E 60 | 61 | In general, what is the probability that the casino loses money as a function of $N$? Make a plot for values ranging from 25 to 1,000. Why does the casino give you free drinks if you keep playing? 62 | 63 | ```{r} 64 | ##Your code here 65 | ``` 66 | 67 | Your answer here. 68 | 69 | # Problem 2 70 | 71 | You run a bank that has a history of identifying potential homeowners that can be trusted to make payments. In fact, historically, in a given year, only 2% of your customers default. You want to use stochastic models to get an idea of what interest rates you should charge to guarantee a profit this upcoming year. 72 | 73 | ## Problem 2A 74 | 75 | Your bank gives out 1,000 loans this year. Create a sampling model and use the function `sample` to simulate the number of foreclosures in a year, given that 2% of customers default. Also suppose your bank loses $120,000 on each foreclosure. Run the simulation for one year and report your loss. 76 | 77 | ```{r} 78 | ##your code here 79 | ``` 80 | 81 | ## Problem 2B 82 | 83 | Note that the loss you will incur is a random variable. Use Monte Carlo simulation to estimate the distribution of this random variable. Use summaries and visualization to describe your potential losses to your board of trustees. 84 | 85 | ```{r} 86 | ##your code here 87 | ``` 88 | 89 | ## Problem 2C 90 | 91 | The 1,000 loans you gave out were for $180,000 each.
The way your bank can give out loans and not lose money is by charging an interest rate. If you charge an interest rate of, say, 2%, you would earn $3,600 for each loan that doesn't foreclose. At what percentage should you set the interest rate so that your expected profit totals $100,000? Hint: Create a sampling model with expected value 100 so that when multiplied by the 1,000 loans you get an expectation of $100,000. Corroborate your answer with a Monte Carlo simulation. 92 | 93 | Your solution here. 94 | ```{r} 95 | ###your code here 96 | ``` 97 | 98 | 99 | ## Problem 2D 100 | 101 | In Problem 2C, you were able to set a very low interest rate. Your customers will be very happy and you are expected to earn $100,000 in profits. However, that is just an expectation. Your profit is a random variable. If instead of a profit your bank loses money, your bank defaults. Under the conditions of Problem 2C, what is the probability that your profit is less than 0? 102 | 103 | ```{r} 104 | ##your code here 105 | ``` 106 | 107 | ## Problem 2E 108 | 109 | Note that the probability of losing money is quite high. To what value would you have to raise the interest rate in order to make the probability of losing money (and your bank, and your job) as low as 0.001? What is the expected profit with this interest rate? Corroborate your answer with a Monte Carlo simulation. 110 | 111 | Hint: Use the following shortcut. If a fraction $p$ of a box are $a$s and $(1-p)$ are $b$s, then the SD of the list is $\mid a-b \mid \sqrt{p(1-p)}$. 112 | 113 | Your solution here. 114 | 115 | ```{r} 116 | ###your code here 117 | ``` 118 | 119 | ## Problem 2F 120 | 121 | Note that the Monte Carlo simulation gave a slightly higher probability than 0.001. What is a possible reason for this? 122 | Hint: See if the disparity is smaller for larger values of $p$. Also check for probabilities larger than 0.001. Recall we made an assumption when we calculated the interest rate. 123 | 124 | 125 | ```{r} 126 | ##your code here 127 | ``` 128 | 129 | 130 | Your answer here. 131 | 132 | # Problem 3 133 | 134 | We were able to set an interest rate of about 2% that guaranteed a very low probability of having a loss. Furthermore, the expected profit was over $1 million. Now other financial companies noticed the success of our business. They also noted that if we increase the number of loans we give, our profits increase. However, the pool of reliable borrowers was limited. So these other companies decided to give loans to less reliable borrowers, but at a higher rate. 135 | 136 | ## Problem 3A 137 | 138 | The pool of borrowers they found had a much higher default rate, estimated to be $p=0.05$. What interest rate would give these companies the same expected profit as your bank (answer to 2E)? 139 | 140 | ```{r} 141 | ##your code here 142 | ``` 143 | 144 | ## Problem 3B 145 | 146 | At the interest rate calculated in 3A, what is the probability of negative profits? Use both the normal approximation and then confirm with a Monte Carlo simulation. 147 | 148 | ```{r} 149 | ##your code here 150 | ``` 151 | 152 | ## Problem 3C 153 | 154 | Note that the probability is much higher now. This is because the standard deviation grew. The companies giving out the loans did not want to raise interest rates much more since it would drive away clients. Instead they used a statistical approach. They increased $N$. How large does $N$ need to be for this probability to be 0.001? Use the central limit approximation and then confirm with a Monte Carlo simulation.
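As a reminder of the general CLT machinery this problem points to (the per-loan values below are made-up placeholders, not the answer):

```{r}
# If each loan has expected profit mu and SD sigma, the CLT approximates
# P(S_N < 0) by pnorm(-sqrt(N) * mu / sigma), which shrinks as N grows.
mu <- 500      # hypothetical per-loan expected profit
sigma <- 20000 # hypothetical per-loan standard deviation
N <- seq(1000, 50000, by = 1000)
p_loss <- pnorm(-sqrt(N) * mu / sigma)
N[which(p_loss <= 0.001)[1]] # smallest N in this grid meeting the target
```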
155 | 156 | Your answer here. 157 | ```{r} 158 | ###your code here 159 | ``` 160 | 161 | So by doubling the number of loans we were able to reduce our risk! Now, for this to work, all the assumptions in our model need to be approximately correct, including the assumption that the probabilities of default were **independent**. This turned out to be false, and it was the main reason for the underestimation of risk. 162 | 163 | 164 | ## Problem 3D 165 | 166 | Define the following matrix of outcomes for two borrowers using our previous box model: 167 | 168 | ```{r} 169 | loan <- 180000 170 | loss_per_foreclosure <- 120000 171 | p2 <- 0.05 172 | interest_rate2 <- 0.05 173 | B <- 10^5 174 | outcomes1 <- replicate(B,{ 175 | sample( c(-loss_per_foreclosure, interest_rate2*loan ), 2, replace=TRUE, prob=c(p2, 1-p2)) 176 | }) 177 | ``` 178 | We can confirm independence by computing the probability of the second borrower defaulting conditioned on the first defaulting: 179 | 180 | ```{r} 181 | sum( outcomes1[1,] < 0 & outcomes1[2,]<0)/sum(outcomes1[1,]<0) 182 | ``` 183 | 184 | This quantity is about the same as the probability of default, $0.05$. 185 | 186 | Now we create a new model. Before generating each set of defaults, we assume that a random event occurred that makes all default probabilities go up or down by 4 percentage points. We could see how this would happen if, for example, demand for houses decreases and all house prices drop. 187 | 188 | ```{r} 189 | B <- 10^5 190 | outcomes2 <- replicate(B,{ 191 | add <- sample( c(-0.04,0.04), 1) 192 | sample( c(-loss_per_foreclosure, interest_rate2*loan ), 2, replace=TRUE, prob=c(p2+add, 1-(p2+add))) 193 | }) 194 | ``` 195 | 196 | Note that the outcomes are no longer independent, as demonstrated by this result not being equal to 0.05: 197 | 198 | ```{r} 199 | sum( outcomes2[1,] < 0 & outcomes2[2,]<0)/sum(outcomes2[1,]<0) 200 | ``` 201 | 202 | 203 | Generate a simulation with correlated outcomes such as those above. This time use the interest rate calculated in 3A. What are the expected earnings under this model compared to the previous one? What is the probability of losing $1 million compared to the previous model? What is the probability of losing $10 million compared to the previous model? 204 | 205 | 206 | 207 | ```{r} 208 | ###your code here 209 | ``` 210 | 211 | 212 | # Problem 4 213 | 214 | Read [this wikipedia page](https://en.wikipedia.org/wiki/Financial_crisis_of_2007%E2%80%9308) about the financial crisis. Write a paragraph describing how what you learned in this homework can help explain the conditions that led to the crisis. 215 | -------------------------------------------------------------------------------- /homework/HW3/HW3.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Homework 3: Is Donald Trump going to win the republican nomination?" 3 | output: html_document 4 | --- 5 | 6 | **This homework is due Tuesday March 8, 2016 at 8PM EST. When complete, submit your code in an R Markdown file and the knitted HTML via GitHub.** 7 | 8 | # Motivation 9 | 10 | In 2012, Nate Silver and other data scientists [predicted the outcome of each state correctly](http://mashable.com/2012/11/07/nate-silver-wins/#2WkAUaXCVaqw). 11 | They did this by aggregating data from many polls to create more precise 12 | estimates than a single poll can provide.
13 | 14 | In this homework, we will try to predict the results of the democratic 15 | and republican primaries by studying the performance of polls in 16 | elections that already occurred and then aggregating results. 17 | 18 | 19 | # Problem 1 20 | 21 | The first step in our analysis will be to wrangle the data in a way 22 | that will simplify the analysis. Ultimately, we want a table of results 23 | with each poll represented by a row and including results for each 24 | candidate as well as information about the poll such as name and date. 25 | 26 | ## Problem 1A 27 | 28 | Install and load the `pollstR` package. This package provides functions 29 | to access data in the Huffington Post's database. Read the help file 30 | for the `pollstr_polls()` function and write a function that reads 31 | **all** the polls related to the republican primaries. Name the object 32 | `race2016`. Hint: Visit 33 | [this webpage](http://elections.huffingtonpost.com/pollster/api) 34 | to select the right `topic` and make sure to change the `max_pages` argument. 35 | 36 | 37 | ```{r, echo=FALSE, cache=TRUE, warning=FALSE, message=FALSE} 38 | ##Your code here 39 | 40 | ``` 41 | 42 | ## Problem 1B 43 | 44 | Examine and familiarize yourself with the `race2016` object. Note 45 | that the `questions` component has a table with election results. 46 | Look at the `topic` component of the `questions` component. Create a new 47 | table with only the results from the `2016-president-gop-primary` 48 | and only state (or territory) polls, no national polls. Hint: create 49 | a new object called `results` with the table of results and 50 | use `dplyr`. How many rows are we left with? 51 | 52 | ```{r} 53 | ##Your code here 54 | 55 | ``` 56 | 57 | 58 | ## Problem 1C 59 | 60 | In Problem 1B, we created a table called `results` with over 4000 rows. 61 | Does this mean that we have data for 4000 polls? How many polls 62 | did we actually have? 63 | Hint: look at the `id` column and use the `group_by` command. 64 | 65 | ```{r} 66 | ##Your code here 67 | 68 | ``` 69 | 70 | 71 | ## Problem 1D 72 | 73 | Look at the first row of your `results` table. 74 | What date was this poll conducted? 75 | Hint: Use the `polls` component of the `race2016` object to find the date. 76 | 77 | ```{r} 78 | ##Your code here 79 | 80 | ``` 81 | 82 | ## Problem 1E 83 | 84 | Now examine the candidates in the "choices" column included in the `results` table. 85 | Hint: use the `table()` function. Note that there are several choices that are 86 | not going to be informative. For example, we have candidates that have 87 | dropped out. We also have entries such as `No one`, `No One` and 88 | `No Preference`. Filter the `results` table to include only Rubio and Trump. 89 | 90 | ```{r} 91 | ##Your code here 92 | 93 | ``` 94 | 95 | ## Problem 1F 96 | 97 | In our `results` table, we have one row for each candidate in each poll. 98 | Transform the `results` table to have one row for each poll and a column 99 | for each of Rubio and Trump. Next, create a column called `diff` with the 100 | difference between Trump and Rubio. Hint: Remove the `first_name` and 101 | `last_name` columns then use the `tidyr` function `spread()`. 102 | 103 | 104 | ```{r} 105 | ##Your code here 106 | 107 | ``` 108 | 109 | ## Problem 1G 110 | 111 | For each poll in the `results` table, we want to know the start date and the 112 | end date of the poll along with the pollster name and the type of poll it was. 113 | Hint: This information is in the `polls` component of `race2016`.
114 | You can select the relevant columns then use the `id` column to join the 115 | tables. One of the `join` functions in `dplyr` will do the trick. 116 | 117 | ```{r} 118 | ##Your code here 119 | 120 | ``` 121 | 122 | 123 | ## Problem 1H 124 | 125 | Study the types of values in the `pollster` column. Notice that you 126 | have many different values but that certain names commonly appear 127 | in these values. For example, consider the name "NBC" in the `pollster` 128 | column. NBC here is the survey house. Use a join function again to add the survey 129 | house to the `results` table. Rename the column `house`. 130 | Hint: `race2016$survey_house` has the information you need. 131 | 132 | ```{r} 133 | ##Your code here 134 | 135 | ``` 136 | 137 | 138 | # Problem 2 139 | 140 | We now have a table with all the information we need. We will now use 141 | the results from Iowa, New Hampshire, Nevada and South Carolina 142 | to determine how to create a prediction for upcoming primaries. 143 | 144 | ## Problem 2A 145 | 146 | Use an internet search to determine the results for the Iowa, 147 | New Hampshire, Nevada and South Carolina primaries for the top two 148 | candidates. Create a table called `actual` with this information. 149 | Also, create a column with the actual election difference. 150 | Use a join function to add this information to our `results` table. 151 | 152 | 153 | ```{r} 154 | ##Your code here 155 | 156 | ``` 157 | 158 | ## Problem 2B 159 | 160 | Create boxplots of the poll results for Trump in Iowa stratified by 161 | the pollster survey house for polls having more than 4 total results. 162 | Add a horizontal line with the actual results. 163 | Hint: Use the `group_by`, `mutate`, `filter` and `ungroup` functions in 164 | `dplyr` for the filtering step. 165 | 166 | ```{r} 167 | ##Your code here 168 | 169 | ``` 170 | 171 | ## Problem 2C 172 | 173 | Using the poll results for Trump in Iowa, 174 | compute the standard deviation of the results from each pollster house 175 | for polls having more than 4 total results. 176 | Then, study the typical standard deviation sizes seen in 177 | these polls. Create a new table with two columns: the observed 178 | standard deviation and the standard deviation that theory predicts. 179 | For the prediction you have several observations. Pick the smallest 180 | one. Which is larger, the observed or the theoretical? 181 | 182 | ```{r} 183 | ##Your code here 184 | 185 | ``` 186 | 187 | ## Problem 2D 188 | 189 | Now using the data from Problem 2C, plot the individual values 190 | against the time the poll was taken (use the `end_date`). 191 | Repeat this for each of the four states. Use color to denote pollster house. 192 | Using this plot, explain why the theory does not match the observed results. 193 | 194 | ```{r} 195 | ##Your code here 196 | 197 | ``` 198 | 199 | ## Problem 2E 200 | 201 | Consider the Trump - Rubio difference. For each poll in IA, NH, SC and NV, 202 | compute the error between the prediction and actual election results. 203 | Use exploratory data analysis to get an idea of how time and pollster 204 | impact accuracy. 205 | 206 | ```{r} 207 | ##Your code here 208 | 209 | ``` 210 | 211 | 212 | ## Problem 2F 213 | 214 | For polls from IA, NH, and SC, aggregate all polls from within 1 week of the 215 | election (use the `start_date` to determine the cutoff) to provide a 216 | 95% confidence interval for the difference between Trump and Rubio.
217 | Compare the following two approaches: 218 | (1) the method that assumes that all variance comes from sampling error 219 | and (2) the approach that estimates variance empirically. 220 | 221 | ```{r} 222 | ##Your code here 223 | 224 | ``` 225 | 226 | 227 | # Problem 3 228 | 229 | Before seeing any polls my _prior belief_ is that Rubio will beat 230 | Trump in Florida. If I were to quantify this belief I would say that 231 | the distribution of the `Trump` - `Rubio` difference is normal with mean 232 | $\mu=-20$ percent and standard deviation $\tau=10$. 233 | Let's call the difference $\theta$. Then 234 | 235 | $$ 236 | \theta \sim N( \mu, \tau) 237 | $$ 238 | 239 | ## Problem 3A 240 | 241 | Under my prior belief, what is the chance that Trump would beat Rubio in Florida? 242 | 243 | ```{r} 244 | ##Your code here 245 | 246 | ``` 247 | 248 | ## Problem 3B 249 | 250 | Consider the latest 25 Florida polls. Assume the poll results for the 251 | difference are normally distributed with mean $\theta$ and standard 252 | deviation $\sigma$. Provide an estimate for $\theta$ and an estimate 253 | of the standard deviation $\sigma$. 254 | 255 | ```{r} 256 | ##Your code here 257 | 258 | ``` 259 | 260 | $$ \hat{\theta} \sim N( \theta, \sigma/ \sqrt{25})$$ 261 | 262 | Now use the Central Limit Theorem to construct a confidence interval. 263 | 264 | ```{r} 265 | ##Your code here 266 | 267 | ``` 268 | 269 | ## Problem 3C 270 | 271 | Combine these two results to provide the mean and standard deviation of 272 | a posterior distribution for $\theta$. 273 | 274 | ```{r} 275 | ##Your code here 276 | 277 | ``` 278 | 279 | ## Problem 3D 280 | 281 | Use the result from Problem 3C to provide your estimate of the probability of 282 | Trump beating Rubio in Florida. 283 | 284 | ```{r} 285 | ##Your code here 286 | 287 | ``` 288 | 289 | 290 | # Problem 4 291 | 292 | Use the poll data as well as the results from Super Tuesday (March 1st) and other election results that happen before the deadline to make predictions for each remaining primary. Then use these results to estimate the probability of Trump winning the republican nomination. Justify your answer with figures, statistical arguments, and Monte Carlo simulations. 293 | 294 | It will help to learn about how delegates are assigned. Here is [the manual](http://www.scribd.com/doc/294928557/2016-Presidential-Nominating-Process-Book-version-2-0-Dec-2015-pdf). 295 | 296 | 297 | 298 | -------------------------------------------------------------------------------- /homework/HW5/HW5.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Allies and Enemies" 3 | output: html_document 4 | --- 5 | 6 | **This homework is due Sunday April 10, 2016 at 11:59PM EST. When complete, submit your code in an R Markdown file and the knitted HTML via GitHub.** 7 | 8 | # Introduction 9 | 10 | The United Nations (UN) is an intergovernmental organization 11 | founded in 1945 to promote international cooperation. It now 12 | represents 193 member states. The General Assembly is the largest 13 | body, with a seat for every member of the UN. It discusses 14 | topics of international importance such as maintaining peace 15 | and security, providing humanitarian aid, and protecting human rights. 16 | 17 | We will be analyzing a dataset containing the full history of 18 | General Assembly votes by each country to determine which 19 | countries vote similarly and which do not. We will also 20 | explore how this changes through time.
21 | 22 | 23 | # Problem 1 24 | 25 | We'll start by loading the United Nations voting data into R 26 | and performing some data wrangling. We use data from this paper: 27 | 28 | > Voeten, Erik; Strezhnev, Anton; Bailey, Michael, 2009, "United Nations General Assembly Voting Data", http://hdl.handle.net/1902.1/12379, Harvard Dataverse, V11 29 | 30 | In this problem, we will combine information from three sources 31 | to create the datasets that we will use to study voting behavior. 32 | 33 | ### Problem 1A 34 | 35 | We have learned how to import text files into R. Here we are 36 | going to load a data object that is saved to a file. To get 37 | an idea of how this works try the following: 38 | 39 | ```{r} 40 | temp_filename <- tempfile() ## create a temporary file name 41 | temp_object <- 1:5 ## create an R object 42 | save(temp_object, file=temp_filename) ## save the R object to the file 43 | rm(temp_object) ## remove the object 44 | load(temp_filename) ## load the object from the file 45 | temp_object ## note that it's back 46 | ``` 47 | 48 | We usually use the suffix `.RData` or `.rda` for these objects. 49 | 50 | The data for this project is stored as an `.RData` file. Go to 51 | [this web page](https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/12379). 52 | To get the `.RData` file, click on the Download button for 53 | `rawvotingdata13.tab` and choose the `RData format`. 54 | 55 | To load the data set into R, use the `load()` function. 56 | Define the name of the object as `x` (but do NOT print it 57 | out as it has over 1 million rows). 58 | 59 | ```{r} 60 | ## put your code here 61 | 62 | ``` 63 | 64 | 65 | ### Problem 1B 66 | 67 | The first problem to overcome is that if you try to print 68 | this object, it will crash your R session -- it's just that 69 | big! (`r nrow(x)` rows). So first wrap it in `tbl_df(x)`, 70 | and call it `votes`. After doing this you can erase `x` with `rm(x)`. 71 | 72 | ```{r} 73 | ## put your code here 74 | 75 | ``` 76 | 77 | 78 | ### Problem 1C 79 | 80 | We note that the data is already arranged according to the 81 | rules of tidy data. There is one row for each observation 82 | and one column for each variable. 83 | 84 | Download the `Codebook.pdf` file from [this page](https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/12379). 85 | How would you interpret the `vote` column? How many of each 86 | kind of vote are in this dataset? 87 | 88 | ```{r} 89 | ## put your code here 90 | 91 | ``` 92 | 93 | 94 | Of the five types of votes, which three would provide information 95 | about a country's position on an issue? Which two would not? 96 | 97 | **Your answer here**: 98 | 99 | 100 | Filter out of our dataset the types of votes that do not provide 101 | information about a country's position on an issue. 102 | 103 | ```{r} 104 | ## put your code here 105 | 106 | ``` 107 | 108 | 109 | ### Problem 1D 110 | 111 | According to the codebook, which column represents countries? 112 | What type of unique code is used to represent each country? 113 | 114 | **Your answer here**: 115 | 116 | Create a new `country` column that contains country names 117 | based on this column. Hint: check out the [countrycode](https://cran.r-project.org/web/packages/countrycode/countrycode.pdf) 118 | package. 119 | 120 | 121 | ```{r} 122 | ## put your code here 123 | 124 | ``` 125 | 126 | 127 | ### Problem 1E 128 | 129 | Before continuing, let's wrangle the country names a bit.
We are 130 | renaming countries with long names and renaming Congo to 131 | distinguish it from the Democratic Republic of Congo. We make use 132 | of the powerful remapping function `revalue()` from the `plyr` 133 | package. You should **not** load `plyr` though, as it will 134 | create confusion with `dplyr` functions. 135 | 136 | ```{r} 137 | library(tidyr) 138 | mapping <- c("United States"="USA", 139 | "United Kingdom"="UK", 140 | "Korea, Republic of"="South Korea", 141 | "Lao People's Democratic Republic"="Laos", 142 | "Yemen People's Republic"="South Yemen", 143 | "Saint Vincent and the Grenadines"="Saint Vincent", 144 | "Congo"="Congo Republic") 145 | votes <- votes %>% mutate(country = plyr::revalue(country, mapping)) %>% 146 | separate(country, into = c("country", "extra"), sep=",", fill="right") 147 | ``` 148 | 149 | Right now we have information about how every country voted 150 | on every resolution. But we do not have any information about 151 | the resolutions themselves (e.g., their title or topic, 152 | or the date they were voted on). 153 | Next, we will bring this data in as well. 154 | 155 | This data is provided as `descriptions.csv`. 156 | Read it in using the `readr` package and wrangle it as shown below: 157 | 158 | ```{r} 159 | library(readr) 160 | 161 | url <- "https://raw.githubusercontent.com/datasciencelabs/data/master/un-resolutions-descriptions.csv" 162 | descriptions <- read_csv(url, col_types = list(date = col_date("%m/%d/%y"))) 163 | 164 | ## from the warning and looking at the csv we see 165 | ## line 1483 has an extra " 166 | ## it's supposed to be a 0 167 | descriptions[1483,"ec"] <- 0 168 | 169 | library(lubridate) 170 | y <- year(descriptions$date) 171 | year(descriptions$date) <- ifelse(y > 2030, y - 100, y) 172 | ``` 173 | 174 | Count the number of votes that were taken in each year. 175 | Create a line graph of the number of votes per year. 176 | 177 | ```{r} 178 | ## put your code here 179 | 180 | ``` 181 | 182 | 183 | What year would we want to filter out from the dataset 184 | because there was only one vote? 185 | 186 | **Your answer here**: 187 | 188 | Filter it out now. 189 | 190 | ```{r} 191 | ## put your code here 192 | 193 | ``` 194 | 195 | 196 | ### Problem 1F 197 | 198 | Read the `Codebook.pdf` about this dataset. Who classified certain votes 199 | as "important"? 200 | 201 | **Your answer here**: 202 | 203 | What percent of votes in history were categorized as "important"? 204 | 205 | ```{r} 206 | ## put your code here 207 | 208 | ``` 209 | 210 | 211 | The most interesting analyses can be done by combining the descriptions 212 | and country-voting data. 213 | 214 | Join the `descriptions` and country-voting data (`votes`) to create a new 215 | data set. Remove the `yes`, `no`, and `abstain` columns from the 216 | `descriptions` dataset. These are per-vote summaries that we do not 217 | need anymore (and could be misleading). The final dataset should be called 218 | `votes`, which you will continue to use throughout the homework. 219 | 220 | ```{r} 221 | ## put your code here 222 | 223 | ``` 224 | 225 | 226 | 227 | # Problem 2 228 | 229 | ### Problem 2A 230 | 231 | Canada and the US have been allies since the UN was created.
232 | We can create a matrix of all votes for these two countries using 233 | the `spread()` function in the `tidyr` package like this: 234 | 235 | ```{r} 236 | library(tidyr) 237 | y <- votes %>% 238 | filter(country %in% c("USA", "Canada")) %>% 239 | mutate(year = year(date)) %>% 240 | select(rcid, year, importantvote, country, vote) %>% 241 | spread(country, vote) 242 | ``` 243 | 244 | We can see how often they have voted together in important votes 245 | and non-important votes: 246 | 247 | ```{r} 248 | y %>% 249 | group_by(importantvote) %>% 250 | summarize(mean(USA==Canada, na.rm=TRUE)) 251 | ``` 252 | 253 | Compute the percentage of votes in which the US and Canada 254 | voted the same way. Calculate this percentage for each year and call it 255 | `agreement`. Fit a linear model using `lm()` to predict `agreement` 256 | with `year`. 257 | 258 | ```{r} 259 | ## put your code here 260 | 261 | ``` 262 | 263 | What is the trend predicted by the linear model? 264 | Is it statistically significant? 265 | 266 | **Your answer here**: 267 | 268 | 269 | 270 | ### Problem 2B 271 | 272 | In the previous problem we found a negative trend in the agreement 273 | between the USA and Canada throughout the years. Interpreting this 274 | linear model would imply that disagreement between these two countries 275 | was worse during the Clinton administration (1992-2000) than the 276 | Reagan administration (1980-1988). 277 | 278 | Now, instead of blindly interpreting the regression results, 279 | plot the data and use a smoother to estimate a trend. Based on this 280 | analysis, how do the Reagan and Clinton administrations compare? 281 | 282 | **Hint**: Make sure to pick a window size or span that creates 283 | a trend that goes through the data. 284 | 285 | ```{r} 286 | ## put your code here 287 | 288 | ``` 289 | 290 | 291 | ### Problem 2C 292 | 293 | Make the plot above for the agreement through time between the US 294 | and the following countries: Israel, UK, Mexico, Cuba, and China. 295 | Make two plots: one for important votes and one for non-important votes. 296 | 297 | ```{r} 298 | ## put your code here 299 | 300 | ``` 301 | 302 | Describe the observed patterns. 303 | 304 | **Your answer here**: 305 | 306 | 307 | 308 | # Problem 3 309 | 310 | In this problem, we will focus only on important votes. 311 | To get a better idea of who votes together we can compute a 312 | distance between each pair of countries. We will focus on countries that 313 | voted in at least 95% of the 368 important votes: 314 | 315 | ```{r} 316 | countries <- votes %>% 317 | filter(importantvote==1) %>% 318 | group_by(country) %>% 319 | summarize(p=n()/368) %>% 320 | filter(p>=0.95) %>% 321 | .$country 322 | ``` 323 | 324 | We can create a matrix with all the votes using the `spread()` function: 325 | 326 | ```{r} 327 | tmp <- votes %>% 328 | filter(country %in% countries & year(date) >= 1980 & importantvote == 1) %>% 329 | select(rcid, country, vote) %>% 330 | spread(country, vote) 331 | 332 | X <- as.matrix(tmp[,-1]) 333 | rownames(X) <- tmp$rcid 334 | ``` 335 | 336 | ### Problem 3A 337 | 338 | Create a distance matrix between each pair of countries. Call this matrix `d`. 339 | 340 | **Hint**: Use the `dist()` function, but note that `X` has 341 | countries in the columns and `dist()` computes distances between rows. 342 | Look at the `dist` help file for more information. 343 | You can use the default `method = "euclidean"` in the `dist()` function. 344 | You can switch rows to columns using the `t()` (transpose) function.
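For instance, on a small toy matrix (purely illustrative):

```{r}
m <- matrix(1:6, nrow = 2) # 2 rows, 3 columns
dist(t(m))                 # distances between the 3 original columns
```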
345 | Finally, once you create the distance matrix `d` you can 346 | visualize it using `heatmap()` or `hclust()`. 347 | 348 | ```{r} 349 | ## put your code here 350 | 351 | ``` 352 | 353 | 354 | What country is closest to the US? Which is furthest? 355 | 356 | **Your answer here**: 357 | 358 | 359 | ### Problem 3B 360 | 361 | Given how close some countries are to the US in voting, and how far others are, 362 | we should be able to predict how the US will vote based on the others. 363 | Let's try to implement a machine learning algorithm to do this. 364 | 365 | Use the `votes` data set to create a new dataset with seven columns. 366 | One column will represent the USA vote as the outcome (call it `y`) and 367 | the last six columns will be the votes from the six countries examined 368 | above in Problem 2 (including Canada), which will be used as predictors 369 | in our machine learning algorithm. Only consider the important votes. 370 | In the USA vote column (`y`), remove the `Abstain` votes and 371 | only consider the `Yes` and `No` votes from the USA. Transform the USA vote 372 | column (`y`) to contain only 0s and 1s, where 0 = `No` vote and 1 = `Yes` vote. 373 | 374 | ```{r} 375 | ## put your code here 376 | 377 | ``` 378 | 379 | Use the `caret` R package to split the data into a training set with 380 | 80% of the data and a test set with the remaining 20%. 381 | Then use `glm()` to build a model. What is the accuracy? 382 | 383 | ```{r} 384 | ## put your code here 385 | 386 | ``` 387 | 388 | 389 | 390 | ### Problem 3C 391 | 392 | We see that we obtain a very high accuracy, but note that this is a 393 | random variable due to the random split of our data. 394 | Try 10 new random splits and report on how much our accuracy changes. 395 | 396 | ```{r} 397 | ## put your code here 398 | 399 | ``` 400 | 401 | 402 | ### Problem 3D 403 | 404 | Compare your `glm()` model to a `knn()` model. Use the `train()` function 405 | to run 10 cross-validations, leaving out 20% of the data. 406 | Plot your results. 407 | 408 | ```{r} 409 | ## put your code here 410 | 411 | ``` 412 | 413 | How many nearest neighbors should we use?
414 | 415 | **Your answer here**: 416 | -------------------------------------------------------------------------------- /lectures/R/.Rhistory: -------------------------------------------------------------------------------- 1 | ?file.path 2 | filename <- basename(url) 3 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 4 | filename <- basename(url) 5 | library(downloader) 6 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 7 | filename <- basename(url) 8 | download(url,filename) 9 | dat <- read.csv(filename) 10 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 11 | filename <- basename(url) 12 | download(url,filename) 13 | ?vector 14 | x <- seq(1,5) 15 | names(x) <- letters[1:5] 16 | x 17 | height <- c(60, 59, 55, "5'5", 70) 18 | height 19 | height <- c(60, 59, 55, "5'5", 70) 20 | height[3] 21 | as.numeric(height[3]) 22 | as.numeric(height) 23 | x <- dat$height 24 | dat 25 | x <- dat$height 26 | x <- c(1,2,3,4,5) 27 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 28 | dat <- read.csv(url) 29 | str(dat) 30 | names(x) <- letters[1:5] 31 | x 32 | library(dplyr) 33 | View(dat) 34 | dat$Timestamp[2] 35 | dat$Timestamp 36 | names(dat) <- c("time","gender","height") 37 | dat <- mutate(dat, numeric_height=as.numeric(height), 38 | original=height) 39 | dat 40 | as.numeric(height) 41 | filter(dat, is.na(numeric_height)) %>% select(height) 42 | filter(dat, is.na(numeric_height)) 43 | filter(dat, is.na(numeric_height)) %>% select(height) %>% print(n=21) 44 | x <- 1:5 45 | x <- seq(1,5) 46 | select(dat, contains("height")) 47 | height <- c(60, 59, 55, "5'5", 70) 48 | height 49 | library(readr) 50 | dat <- read_csv("https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv") 51 | ?is.na 52 | height <- c(60, 59, 55, "5'5", 70) 53 | height[3] 54 | as.numeric(height[3]) 55 | dat <- mutate(dat, numeric_height=as.numeric(height), 56 | original=height) 57 | filter(dat, is.na(numeric_height)) 58 | dat <- mutate(dat, numeric_height=as.numeric(height), 59 | original=height) 60 | library(readr) 61 | dat <- read_csv("https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv") 62 | dat <- mutate(dat, numeric_height=as.numeric(height), 63 | original=height) 64 | dat 65 | dat <- mutate(dat, numeric_height=as.numeric(height), 66 | original=height) 67 | names(dat) 68 | names(dat) <- c("time","gender","height") 69 | dat <- mutate(dat, numeric_height=as.numeric(height), 70 | original=height) 71 | filter(dat, is.na(numeric_height)) 72 | filter(dat, is.na(numeric_height)) %>% select(height) 73 | x <- dat$height 74 | x 75 | x[109:119] 76 | x[109:117] 77 | x[109:116] 78 | gsub("ft", "'", x) 79 | x <- dat$height 80 | x <- dat$height[109:116] 81 | gsub("ft", "'", x) 82 | x 83 | gsub("ft", "'", x) 84 | x <- gsub("inches","",x) 85 | x 86 | dat <- mutate(dat, height= gsub("ft","'",height) ) 87 | dat <- mutate(dat, height= gsub("\"|inches|\ ","",height) ) 88 | filter(dat, is.na(numeric_height)) %>% select(height) %>% print(n=21) 89 | x=filter(dat, is.na(numeric_height)) %>% select(height) %>% print(n=21) 90 | x$height 91 | gsub("''","",x$height) 92 | dat <- mutate(dat, height= gsub("\"|inches|\ |''","",height) ) 93 | filter(dat, is.na(numeric_height)) %>% select(height) %>% print(n=21) 94 | filter(dat, is.na(numeric_height)) %>% select(height,original) %>% print(n=21) 95 | ?function 96 | 
?function 97 | help("function") 98 | class(function) 99 | class("function") 100 | class(function) 101 | avg <- function(x){ 102 | sum(x) / length(x) 103 | } 104 | avg( 1:5 ) 105 | variance <- function(x){ 106 | mu <- mean(x) 107 | return( mean ( (x - mu)^2 ) ) 108 | } 109 | variance(1:4) 110 | variance(1:5) 111 | variance(1:5) 112 | dat <- mutate(dat, height=fixheight(height)) %>% select(-numeric_height) 113 | fixheight <- function(x){ 114 | y <- strsplit(x, "'") 115 | ret <- sapply(y, function(z){ 116 | ifelse( length(z)>1, as.numeric(z[1])*12 + as.numeric(z[2]) , 117 | as.numeric(z[1])) 118 | }) 119 | return(ret) 120 | } 121 | dat <- mutate(dat, height=fixheight(height)) %>% select(-numeric_height) 122 | dat 123 | filter(dat, is.na(numeric_height)) %>% select(height) %>% print(n=21) 124 | filter(dat, is.na(height)) %>% select(height) %>% print(n=21) 125 | filter(dat, is.na(height)) %>% select(height) 126 | ls() 127 | library(dplyr) 128 | library(readr) 129 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 130 | dat <- read_csv(url) 131 | names(dat) <- c("time","gender","height") 132 | dat <- mutate(dat, numeric_height=as.numeric(height), 133 | original=height) 134 | dat <- mutate(dat, height= gsub("ft","'",height) ) 135 | dat <- mutate(dat, height= gsub("\"|inches|\ ","",height) ) 136 | fixheight <- function(x){ 137 | y <- strsplit(x, "'") 138 | ret <- sapply(y, function(z){ 139 | ifelse( length(z)>1, as.numeric(z[1])*12 + as.numeric(z[2]) , 140 | as.numeric(z[1])) 141 | }) 142 | return(ret) 143 | } 144 | dat <- mutate(dat, height=fixheight(height)) %>% select(-numeric_height) 145 | library(dplyr) 146 | library(readr) 147 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 148 | dat <- read_csv(url) 149 | names(dat) <- c("time","gender","height") 150 | dat <- mutate(dat, numeric_height=as.numeric(height), 151 | original=height) %>% 152 | mutate(height= gsub("ft","'",height) ) %>% 153 | mutate(height= gsub("\"|inches|\ ","",height) ) %>% 154 | mutate(height=fixheight(height)) %>% select(-numeric_height) 155 | dat 156 | library(dplyr) 157 | library(readr) 158 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 159 | dat <- read_csv(url) 160 | names(dat) <- c("time","gender","height") 161 | filter(dat, gender=="Male") %>% nrow 162 | filter(dat, gender=="Female") %>% nrow 163 | filter(dat, gender=="Male") %>% select(height) %>% mutate(height=as.numeric(height)) %>% filter(is.na(height)) %>% nrow 164 | dat$origi 165 | dat$ori 166 | names(dat) 167 | url <- "https://raw.githubusercontent.com/datasciencelabs/2016_data/master/bio260_heights.csv" 168 | dat <- read_csv(url) 169 | names(dat) <- c("time","gender","height") 170 | dat <- mutate(dat, numeric_height=as.numeric(height), 171 | original=height) %>% 172 | mutate(height= gsub("ft","'",height) ) %>% 173 | mutate(height= gsub("\"|inches|\ ","",height) ) %>% 174 | mutate(height=fixheight(height)) %>% select(-numeric_height) 175 | dat 176 | dat$original 177 | dat[113,] 178 | dat$height[113] 179 | mean( x > convet("6","6")) 180 | data(father.son,package="UsingR") 181 | x <- father.son$sheight 182 | mean( x > convet("6","6")) 183 | mean( x > convert("6","6")) 184 | convert <- function(f,i) as.numeric(f)*12+as.numeric(i) 185 | data(father.son,package="UsingR") 186 | x <- father.son$sheight 187 | mean( x > convert("6","6")) 188 | qqnorm(x) 189 | qqline(x) 190 | 
-------------------------------------------------------------------------------- /lectures/R/console-screen-shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/R/console-screen-shot.png -------------------------------------------------------------------------------- /lectures/R/data-import.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Import, munging, and exploration" 3 | output: html_document 4 | --- 5 | 6 | 7 | ## Data import 8 | 9 | Usually the first step in data analysis is getting the data in a form that permits analysis, for example, importing the data into the R environment. 10 | 11 | We took a poll of our students to obtain (self-reported) height and gender. Our task is to describe this list of heights to someone (or something) that has no idea what human adults look like. 12 | 13 | The are several function in the base package available for reading data. A Google search of "how do I import data into R?" gets us to 14 | 15 | ```{r,eval=FALSE} 16 | ?read.table 17 | ``` 18 | 19 | From the help file we see that we need that we need 20 | 21 | > the name of the file which the data are to be read from. Each row of the table appears as one line of the file. If it does not contain an absolute path, the file name is relative to the current working directory, getwd(). Tilde-expansion is performed where supported. This can be a compressed file (see file). 22 | 23 | #### Paths and the Working Directory 24 | 25 | When you are working in R it is useful to know your _working directory_. This is the directory or folder in which R will save or look for files by default. You can see your working directory by typing: 26 | 27 | ```{r, eval=FALSE} 28 | getwd() 29 | ``` 30 | 31 | You can also change your working directory using the function `setwd`. Or you can change it through RStudio by clicking on "Session". 32 | 33 | The functions that read and write files (there are several in R) assume you mean to look for files or write files in the working directory. Our recommended approach for beginners will have you reading and writing to the working directory. However, you can also type the [full path](http://www.computerhope.com/jargon/a/absopath.htm), which will work independently of the working directory. 34 | 35 | #### Projects in RStudio 36 | 37 | We find that the simplest way to organize yourself is to start a Project in RStudio (Click on "File" and "New Project"). When creating the project, you will select a folder to be associated with it. You can then download all your data into this folder. Your working directory will be this folder. 38 | 39 | 40 | #### Option 1: Download file with your browser to your working directory 41 | 42 | 43 | You can navigate to the file `bio260-heights.csv` in data directory of our data repository: 44 | [GitHub](https://github.com/datasciencelabs/data). If you navigate to the file, you need to click on *Raw* on the 45 | upper right hand corner of the data and then use your browser's "Save 46 | As" function to ensure that the downloaded file is in a CSV 47 | format. Some browsers add an extra suffix to your file name by 48 | default. You do not want this. You want your file to be named 49 | `bio260-heights.csv`. 
50 | 51 | ![GitHub page screenshot](screenshot.png) 52 | 53 | Once you have this file in your working directory, then you can simply read it in like this: 54 | 55 | ```{r,eval=FALSE} 56 | dat <- read.csv("bio260-heights.csv") 57 | ``` 58 | 59 | If you did not receive an error message, then you probably read in the file successfully. 60 | 61 | #### Option 2: Read from within R 62 | 63 | We store many of the datasets used here on [GitHub](https://github.com/datasciencelabs/data). You can read these files directly into your R session in the following way: 64 | 65 | ```{r,message=FALSE} 66 | filename <- "https://raw.githubusercontent.com/datasciencelabs/data/master/bio260-heights.csv" 67 | dat <- read.csv(filename) 68 | ``` 69 | 70 | #### Option 3: Download from within R 71 | 72 | Although option 2 is very convenient, you may want to have the data file on your hard disk. For example, this gives you complete control of the reproducibility of your analysis. In option 1 we downloaded the file using a browser, but you can also do this from within R using the `downloader` package. 73 | 74 | ```{r} 75 | library(downloader) 76 | url <- "https://raw.githubusercontent.com/datasciencelabs/data/master/bio260-heights.csv" 77 | filename <- basename(url) 78 | download(url,filename) 79 | dat <- read.csv(filename) 80 | ``` 81 | 82 | -------------------------------------------------------------------------------- /lectures/R/fixheight.R: -------------------------------------------------------------------------------- 1 | # Convert self-reported height strings to inches: entries like "5'6" are 2 | # split on the apostrophe and combined as feet*12 + inches; entries with 3 | # no apostrophe are assumed to already be numeric inches. 4 | fixheight <- function(x){ 5 | y <- strsplit(x, "'") 6 | ret <- sapply(y, function(z){ 7 | ifelse( length(z)>1, as.numeric(z[1])*12 + as.numeric(z[2]) , 8 | as.numeric(z[1])) 9 | }) 10 | return(ret) 11 | } 12 | -------------------------------------------------------------------------------- /lectures/R/intro-to-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Introduction to R" 3 | output: html_document 4 | --- 5 | 6 | 7 | # R for Data Science 8 | 9 | ## Introduction 10 | 11 | We will be using the 12 | [R programming language](https://cran.r-project.org/) for all our 13 | analysis. You will learn R and statistics simultaneously. We will begin by installing R and RStudio, then cover some basics. 14 | 15 | Although R is technically a programming language, it was developed specifically for analyzing data. We will therefore teach R in the context of analyzing data rather than as one would teach a programming language. As a result, we will be learning some data analysis concepts along the way. For those with limited programming experience we highly recommend you complete one of the following R programming courses: 16 | 17 | 18 | * DataCamp's [R course](https://www.datacamp.com/courses/free-introduction-to-r) 19 | * edX's [Introduction to R Programming](https://www.edx.org/course/introduction-r-programming-microsoft-dat204x-0) 20 | 21 | Below we include a broader list of resources for learning R. 22 | 23 | ## Installing R 24 | 25 | The first step is to install R. You can download and install R from 26 | the [Comprehensive R Archive Network](https://cran.r-project.org/) 27 | (CRAN).
It is relatively straightforward, but if you need further help 28 | you can try the following resources: 29 | 30 | * [Installing R on Windows](https://github.com/genomicsclass/windows#installing-r) 31 | * [Installing R on Mac](http://youtu.be/Icawuhf0Yqo) 32 | * [Installing R on Ubuntu](http://cran.r-project.org/bin/linux/ubuntu/README) 33 | 34 | ## Installing RStudio 35 | 36 | The next step is to install RStudio, a program for viewing and running R scripts. Technically you can run all the code shown here without installing RStudio, but we highly recommend this integrated 37 | development environment (IDE). Instructions are 38 | [here](http://www.rstudio.com/products/rstudio/download/) and for 39 | Windows we have special 40 | [instructions](https://github.com/genomicsclass/windows). 41 | 42 | ## The Console 43 | 44 | Now that you have downloaded and installed R you are ready to start working with data. Whichever approach you use to interact with R (we recommend RStudio), you should identify the console. 45 | 46 | ![the console](console-screen-shot.png) 47 | 48 | When you type a line of code into the console and hit enter, the command gets _executed_. For example, try typing 49 | 50 | ```{r} 51 | 2+3 52 | ``` 53 | 54 | We can also assign values to variables. Try the following: 55 | 56 | ```{r} 57 | x <- 2 58 | y <- 3 59 | x + y 60 | ``` 61 | 62 | ## The R ecosystem 63 | 64 | When you download R from CRAN you get what we call _base_ R. This includes several _functions_ that are considered fundamental for data analysis. It also includes several example datasets. These datasets are particularly useful as examples when we are learning to use the available functions. You can see all the available datasets by executing the function `data` like this: 65 | 66 | ```{r,eval=FALSE} 67 | data() 68 | ``` 69 | 70 | Because in R functions are objects, we need the two parentheses to let R know that we want the function to be executed as opposed to showing us the code for the function. Type the following and note the difference: 71 | 72 | ```{r,eval=FALSE} 73 | data 74 | ``` 75 | 76 | 77 | To see an example of functions at work, we will use the `co2` dataset to illustrate the function `plot`, one of the base functions. We can plot Mauna Loa Atmospheric CO2 Concentration over time like this: 78 | 79 | ```{r} 80 | plot(co2) 81 | ``` 82 | 83 | R's base functionality is bare bones. Data science applications are broad, the statistical toolbox is extensive, and most users need only a small fraction of all the available functionality. Therefore, a better approach is to make specific functionality available _on demand_. R does this using _packages_, also called _libraries_. 84 | 85 | Some packages are considered popular enough that they are included with the base download. 86 | For example, the software implementing survival analysis methods is in the `survival` package. To bring that functionality to your current session we type 87 | 88 | ```{r,eval=FALSE} 89 | library(survival) 90 | ``` 91 | 92 | However, CRAN has over 4,000 packages that are not included in the base installation. You can install these using the `install.packages` function. 93 | 94 | ## Installing Packages 95 | 96 | The first R command we will run is `install.packages`. R only includes a basic set of 97 | functions. It can do much more than this, but not everybody needs 98 | everything so we instead make some functions available via 99 | packages. Many of these functions are stored in CRAN where 100 | packages are vetted: they are checked for common errors and they must have a dedicated maintainer. There are other repositories, some with more vetting, such as [Bioconductor](http://www.bioconductor.org), and some with no vetting, such as GitHub. You can easily install CRAN packages from within R if you know the name of the package. As an example, we are going to install the package `dplyr` which we use in our first data analysis examples: 101 | 102 | ```{r,eval=FALSE} 103 | install.packages("dplyr") 104 | ``` 105 | 106 | We can then load the package into our R session using the `library` function: 107 | 108 | ```{r, warning=FALSE} 109 | library(dplyr) 110 | ``` 111 | 112 | From now on you will see that we sometimes load packages without 113 | installing them. This is because once you install the package, it 114 | remains in place and only needs to be loaded with `library`. If you 115 | try to load a package and get an error, it probably means you need to 116 | install it first. 117 | 118 |
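For example, here is a minimal sketch of an install-if-missing pattern. This is just one common idiom, not part of the lecture code; substitute whichever package you need for `dplyr`:

```{r, eval=FALSE}
# Install dplyr only if it is not already available, then load it
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
```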
119 | 120 | ## Getting help 121 | 122 | A key feature you need to know about R is that you can get help for a function using `help` or `?`, like this: 123 | ```{r,eval=FALSE} 124 | ?install.packages 125 | help("install.packages") 126 | ``` 127 | 128 | These pages are quite detailed and also include examples at the end. 129 | 130 | ## Comments 131 | The hash character represents comments, so text following these 132 | characters is not interpreted: 133 | 134 | ```{r} 135 | ##This is just a comment 136 | ``` 137 | 138 | 139 | 140 | ## Importing Data into R 141 | 142 | The first step when preparing to analyze data is to read the data into R. We therefore teach this skill first. There are several ways to do this and we will discuss three of them. But you only need to learn one to follow along. 143 | 144 | Small datasets such as the one used as an 145 | example here are typically stored as Excel files. Although there are R packages designed to read Excel (xls) format, you generally want 146 | to avoid this and save files as comma delimited (Comma-Separated 147 | Value/CSV) or tab delimited (Tab-Separated Value/TSV/TXT) files. 148 | These plain-text formats are often easier for sharing, as commercial software is not required for viewing or 149 | working with the data. 150 | 151 | We will start with a simple example dataset containing the [heights of students in our class](https://raw.githubusercontent.com/datasciencelabs/data/master/bio260-heights.csv). 152 | 153 | The first step is to find the file containing your data and know its *path*. 154 | 155 | 156 | #### Paths and the Working Directory 157 | 158 | When you are working in R it is useful to know your _working directory_. This is the directory or folder in which R will save or look for files by default. You can see your working directory by typing: 159 | 160 | ```{r, eval=FALSE} 161 | getwd() 162 | ``` 163 | 164 | You can also change your working directory using the function `setwd`. Or you can change it through RStudio by clicking on "Session". 165 | 166 | The functions that read and write files (there are several in R) assume you mean to look for files or write files in the working directory. Our recommended approach for beginners will have you reading and writing to the working directory. However, you can also type the [full path](http://www.computerhope.com/jargon/a/absopath.htm), which will work independently of the working directory.
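As a concrete sketch (the folder name here is hypothetical; substitute a path that exists on your machine):

```{r, eval=FALSE}
getwd()                           # show the current working directory
setwd("~/bio260-project")         # hypothetical project folder
dat <- read.csv("bio260-heights.csv")                   # relative to the working directory
dat <- read.csv("~/bio260-project/bio260-heights.csv")  # a full path works from anywhere
```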
167 | 168 | #### Projects in RStudio 169 | 170 | We find that the simplest way to organize yourself is to start a Project in RStudio (Click on "File" and "New Project"). When creating the project, you will select a folder to be associated with it. You can then download all your data into this folder. Your working directory will be this folder. 171 | 172 | ## Resources 173 | 174 | Apart from the two courses listed above, there are many resources to help you 175 | familiarize yourself with the basics of programming and R syntax. One such tutorial is the [swirl](http://swirlstats.com/) tutorial, which teaches you R programming interactively, at your own pace and in the R console. Once you have R installed, you can install `swirl` and run it the following way: 176 | 177 | ```{r, eval=FALSE} 178 | install.packages("swirl") 179 | library(swirl) 180 | swirl() 181 | ``` 182 | 183 | Another option is the [try R](http://tryr.codeschool.com/) interactive class from Code School. 184 | 185 | There are also many open and free resources and reference 186 | guides for R. Two examples are: 187 | 188 | * [Quick-R](http://www.statmethods.net/): a quick online reference for data input, basic statistics and plots 189 | * [R reference card (PDF)](https://cran.r-project.org/doc/contrib/Short-refcard.pdf) by Tom Short 190 | 191 | #### More advanced R Resources (from Roger Peng) 192 | 193 | Available from CRAN (http://cran.r-project.org) 194 | 195 | - An Introduction to R 196 | 197 | - Writing R Extensions 198 | 199 | - R Data Import/Export 200 | 201 | - R Installation and Administration (mostly for building R from 202 | sources) 203 | 204 | - R Internals (not for the faint of heart) 205 | 206 | 207 | #### Some Useful Books on S/R 208 | 209 | Standard texts 210 | 211 | - Chambers (2008). *Software for Data Analysis*, Springer. (your 212 | textbook) 213 | 214 | - Chambers (1998). *Programming with Data*, Springer. 215 | 216 | - Venables & Ripley (2002). *Modern Applied Statistics with S*, 217 | Springer. 218 | 219 | - Venables & Ripley (2000). *S Programming*, Springer. 220 | 221 | - Pinheiro & Bates (2000). *Mixed-Effects Models in S and S-PLUS*, 222 | Springer. 223 | 224 | - Murrell (2005). *R Graphics*, Chapman & Hall/CRC Press. 225 | 226 | Other resources 227 | 228 | - Springer has a series of books called *Use R!*.
229 | 230 | - A longer list of books is at 231 | http://www.r-project.org/doc/bib/R-books.html 232 | -------------------------------------------------------------------------------- /lectures/R/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/R/screenshot.png -------------------------------------------------------------------------------- /lectures/databases/pics/RM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/databases/pics/RM.png -------------------------------------------------------------------------------- /lectures/databases/pics/all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/databases/pics/all.png -------------------------------------------------------------------------------- /lectures/databases/pics/one_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/databases/pics/one_table.png -------------------------------------------------------------------------------- /lectures/databases/pics/subset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/databases/pics/subset.png -------------------------------------------------------------------------------- /lectures/databases/pics/two_tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/databases/pics/two_tables.png -------------------------------------------------------------------------------- /lectures/dataviz/pics/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/.DS_Store -------------------------------------------------------------------------------- /lectures/dataviz/pics/DLP_slide.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/DLP_slide.jpg -------------------------------------------------------------------------------- /lectures/dataviz/pics/DLP_slide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/DLP_slide.pdf -------------------------------------------------------------------------------- /lectures/dataviz/pics/Moneyball_Poster.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/Moneyball_Poster.jpg -------------------------------------------------------------------------------- /lectures/dataviz/pics/ad.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/ad.png -------------------------------------------------------------------------------- /lectures/dataviz/pics/classwebpage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/classwebpage.png -------------------------------------------------------------------------------- /lectures/dataviz/pics/depodesta.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/depodesta.jpg -------------------------------------------------------------------------------- /lectures/dataviz/pics/meonbike.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/meonbike.jpg -------------------------------------------------------------------------------- /lectures/dataviz/pics/nythist.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/nythist.gif -------------------------------------------------------------------------------- /lectures/dataviz/pics/nytimesvotingpattern.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/nytimesvotingpattern.jpg -------------------------------------------------------------------------------- /lectures/dataviz/pics/nytimesvotingpattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/nytimesvotingpattern.png -------------------------------------------------------------------------------- /lectures/dataviz/pics/tweetsproductplacement.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/pics/tweetsproductplacement.gif -------------------------------------------------------------------------------- /lectures/dataviz/shiny/demos/06-basic-app.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Slide Me", 1, 100, 50), 5 | actionButton("go", "Update"), 6 | plotOutput("hist"), 7 | verbatimTextOutput("sum") 8 | ) 9 | 10 | server <- function(input, output) { 11 | data <- eventReactive(input$go, {rnorm(input$num)}) 12 | output$hist <- renderPlot({ 13 | hist(data()) 14 | }) 15 | output$sum <- renderPrint({ 16 | summary(data()) 17 | }) 18 | } 19 | 20 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/demos/09-layers.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage(title = "Random generator", 4 | 
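# Layout sketch (added note): three fluidRows follow, one per distribution; each
# pairs a narrow "Sample" actionButton (width 2) with a wide plotOutput (width 10).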
fluidRow( 5 | column(width = 2, 6 | actionButton("renorm", "Sample")), 7 | column(width = 10, 8 | plotOutput("norm"))), 9 | fluidRow( 10 | column(width = 2, 11 | actionButton("reunif", "Sample")), 12 | column(width = 10, 13 | plotOutput("unif"))), 14 | fluidRow( 15 | column(width = 2, 16 | actionButton("rechisq", "Sample")), 17 | column(width = 10, 18 | plotOutput("chisq"))) 19 | ) 20 | 21 | server <- function(input, output) { 22 | 23 | normdata <- eventReactive(input$renorm, {rnorm(100)}) 24 | unifdata <- eventReactive(input$reunif, {runif(100)}) 25 | chisqdata <- eventReactive(input$rechisq, {rchisq(100, df = 2)}) 26 | 27 | output$norm <- renderPlot({hist(normdata(), breaks = 30, col = "grey", bor = "white")}) 28 | output$unif <- renderPlot({hist(unifdata(), breaks = 30, col = "grey", bor = "white")}) 29 | output$chisq <- renderPlot({hist(chisqdata(), breaks = 30, col = "grey", bor = "white")}) 30 | } 31 | 32 | shinyApp(server = server, ui = ui) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/demos/www/shiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/shiny/demos/www/shiny.png -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/01-template.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | ui <- fluidPage() 3 | 4 | server <- function(input, output) {} 5 | 6 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/02-slider.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Choose a number:", 1, 100, 50) 5 | ) 6 | 7 | server <- function(input, output) {} 8 | 9 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/03-plotOutput.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Choose a number:", 1, 100, 50), 5 | plotOutput("hist") 6 | ) 7 | 8 | server <- function(input, output) {} 9 | 10 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/04-renderPlot.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Choose a number:", 1, 100, 50), 5 | plotOutput("hist") 6 | ) 7 | 8 | server <- function(input, output) { 9 | output$hist <- renderPlot({ 10 | hist(rnorm(input$num)) 11 | }) 12 | } 13 | 14 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/05-reactivity.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Choose a number:", 1, 100, 50), 5 | plotOutput("hist") 6 | ) 7 | 8 | server <- function(input, output) { 9 | output$hist <- renderPlot({ 10 | hist(rnorm(input$num)) 11 | }) 12 | } 13 | 14 | shinyApp(ui = ui, server = server) 
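# Reactivity note: moving the slider changes input$num, which causes renderPlot
# to re-execute and redraw the histogram with a fresh sample of that size.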
-------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/07-vocab: -------------------------------------------------------------------------------- 1 | 1. The value of a normal R value will not change once you set it, unless you reset it or arrange for a program to reset it. The value of a reactive value will change when a user manipulates your app. 2 | 3 | 2. A reactive function can call reactive values and reactive objects. A normal function cannot call reactive values or reactive objects (unless the normal function is called during the execution of a reactive function). 4 | 5 | 3. In R, objects and values are essentially the same: the value of a normal object will not change unless you reset it or arrange for a program to reset it. The value of a reactive object will change when the reactive values that it depends on change. Also, reactive "objects" have a more active role in the context of a Shiny app than normal objects have in R. Objects can trigger actions long after they are made. -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/08-render.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Slide Me", 1, 100, 50), 5 | plotOutput("hist"), 6 | verbatimTextOutput("sum") 7 | ) 8 | server <- function(input, output) { 9 | output$hist <- renderPlot({ 10 | hist(rnorm(input$num)) 11 | }) 12 | output$sum <- renderPrint({ 13 | summary(rnorm(input$num)) 14 | }) 15 | } 16 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/09-reactive.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Slide Me", 1, 100, 50), 5 | actionButton("go", "Update"), 6 | plotOutput("hist"), 7 | verbatimTextOutput("sum") 8 | ) 9 | 10 | server <- function(input, output) { 11 | data <- eventReactive(input$go,{ rnorm(input$num) }) 12 | output$hist <- renderPlot({ 13 | hist(data()) 14 | }) 15 | output$sum <- renderPrint({ 16 | summary(data()) 17 | }) 18 | } 19 | 20 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/10-eventReactive.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Slide Me", 1, 100, 50), 5 | actionButton("go", "Update"), 6 | plotOutput("hist"), 7 | verbatimTextOutput("sum") 8 | ) 9 | 10 | server <- function(input, output) { 11 | data <- eventReactive(input$go, {rnorm(input$num)}) 12 | output$hist <- renderPlot({ 13 | hist(data()) 14 | }) 15 | output$sum <- renderPrint({ 16 | summary(data()) 17 | }) 18 | } 19 | 20 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/11-reactiveValues.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sliderInput("num", "Slide Me", 1, 100, 50), 5 | actionButton("norm", "Normal Data"), 6 | actionButton("unif", "Uniform Data"), 7 | plotOutput("hist"), 8 | verbatimTextOutput("sum") 9 | ) 10 | 11 | server <- function(input, output) { 12 | rv <- reactiveValues(data = rnorm(50)) 13 | 14 | 
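# rv$data starts as 50 normal draws; each observeEvent below overwrites it in
# response to a button click, and outputs that read rv$data re-render automatically.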
observeEvent(input$norm, {rv$data <- rnorm(input$num)}) 15 | observeEvent(input$unif, {rv$data <- runif(input$num)}) 16 | 17 | output$hist <- renderPlot({hist(rv$data)}) 18 | output$sum <- renderPrint({summary(rv$data)}) 19 | } 20 | 21 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/17-layout.R: -------------------------------------------------------------------------------- 1 | ui <- fluidPage( 2 | fluidRow( 3 | column(3, 4 | sliderInput("num", "Choose a number", 1, 100, 50) 5 | ), 6 | column(9, 7 | plotOutput("hist") 8 | ) 9 | ), 10 | fluidRow( 11 | column(5, offset = 5, 12 | verbatimTextOutput("sum") 13 | ) 14 | ) 15 | ) 16 | 17 | server <- function(input, output) { 18 | data <- reactive({rnorm(input$num)}) 19 | output$hist <- renderPlot({ 20 | hist(data()) 21 | }) 22 | output$sum <- renderPrint({ 23 | summary(data()) 24 | }) 25 | } 26 | 27 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/18-panels.R: -------------------------------------------------------------------------------- 1 | ui <- navbarPage("My App", 2 | tabPanel("Slider", 3 | sliderInput(inputId = "num", 4 | label = "Choose a number", 5 | value = 25, min = 1, max = 100) 6 | ), 7 | tabPanel("Plot", plotOutput("hist")), 8 | tabPanel("Summary", verbatimTextOutput("sum")) 9 | ) 10 | 11 | server <- function(input, output) { 12 | data <- reactive({rnorm(input$num)}) 13 | output$hist <- renderPlot({ 14 | hist(data()) 15 | }) 16 | output$sum <- renderPrint({ 17 | summary(data()) 18 | }) 19 | } 20 | 21 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/19-navbarPage.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- navbarPage(title = "Random generator", 4 | tabPanel(title = "Normal data", 5 | fluidRow( 6 | column(width = 2, 7 | actionButton("renorm", "Sample")), 8 | column(width = 10, 9 | plotOutput("norm")))), 10 | tabPanel(title = "Uniform data", 11 | fluidRow( 12 | column(width = 2, 13 | actionButton("reunif", "Sample")), 14 | column(width = 10, 15 | plotOutput("unif")))), 16 | tabPanel(title = "Chi Squared data", 17 | fluidRow( 18 | column(width = 2, 19 | actionButton("rechisq", "Sample")), 20 | column(width = 10, 21 | plotOutput("chisq"))))) 22 | 23 | server <- function(input, output) { 24 | 25 | normdata <- eventReactive(input$renorm, {rnorm(100)}) 26 | unifdata <- eventReactive(input$reunif, {runif(100)}) 27 | chisqdata <- eventReactive(input$rechisq, {rchisq(100, df = 2)}) 28 | 29 | output$norm <- renderPlot({hist(normdata(), breaks = 30, col = "grey", bor = "white")}) 30 | output$unif <- renderPlot({hist(unifdata(), breaks = 30, col = "grey", bor = "white")}) 31 | output$chisq <- renderPlot({hist(chisqdata(), breaks = 30, col = "grey", bor = "white")}) 32 | } 33 | 34 | shinyApp(server = server, ui = ui) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/20-tags/app.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | sidebarLayout( 5 | sidebarPanel( 6 | sliderInput("num", "Choose a number", 1, 100, 50), 7 | tags$img(height = 120, src = "shiny.png"), 8 | tags$br(), 9 | tags$em("Powered by "), 10 | tags$a(href = 
"shiny.rstudio.com", "shiny") 11 | ), 12 | mainPanel( 13 | plotOutput("hist"), 14 | verbatimTextOutput("sum") 15 | ) 16 | ) 17 | ) 18 | 19 | server <- function(input, output) { 20 | output$hist <- renderPlot({ 21 | hist(rnorm(input$num)) 22 | }) 23 | 24 | output$sum <- renderPrint({ 25 | summary(rnorm(input$num)) 26 | }) 27 | } 28 | 29 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /lectures/dataviz/shiny/exercises/20-tags/www/shiny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/shiny/exercises/20-tags/www/shiny.png -------------------------------------------------------------------------------- /lectures/dataviz/shiny/pdfs/01-Intro.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/shiny/pdfs/01-Intro.pdf -------------------------------------------------------------------------------- /lectures/dataviz/shiny/pdfs/02-Reactivity-and-UI.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/shiny/pdfs/02-Reactivity-and-UI.pdf -------------------------------------------------------------------------------- /lectures/dataviz/shiny_section/map/app.R: -------------------------------------------------------------------------------- 1 | 2 | library(shiny) 3 | 4 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 5 | # Load data 6 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 7 | library(maps) 8 | library(mapproj) 9 | counties <- readRDS("data/counties.rds") 10 | source("helpers.R") 11 | 12 | 13 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 14 | # ui 15 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 16 | ui <- fluidPage( 17 | titlePanel("censusVis"), 18 | 19 | sidebarLayout( 20 | sidebarPanel( 21 | helpText("Create demographic maps with 22 | information from the 2010 US Census."), 23 | 24 | selectInput("var", 25 | label = "Choose a variable to display", 26 | choices = c("Percent White", "Percent Black", 27 | "Percent Hispanic", "Percent Asian"), 28 | selected = "Percent White"), 29 | 30 | sliderInput("range", 31 | label = "Range of interest:", 32 | min = 0, max = 100, value = c(0, 100)) 33 | ), 34 | 35 | mainPanel(plotOutput("map")) 36 | ) 37 | ) 38 | 39 | 40 | 41 | server <- function(input, output) { 42 | output$map <- renderPlot({ 43 | data <- switch(input$var, 44 | "Percent White" = counties$white, 45 | "Percent Black" = counties$black, 46 | "Percent Hispanic" = counties$hispanic, 47 | "Percent Asian" = counties$asian) 48 | 49 | color <- switch(input$var, 50 | "Percent White" = "darkgreen", 51 | "Percent Black" = "black", 52 | "Percent Hispanic" = "darkorange", 53 | "Percent Asian" = "darkviolet") 54 | 55 | legend <- switch(input$var, 56 | "Percent White" = "% White", 57 | "Percent Black" = "% Black", 58 | "Percent Hispanic" = "% Hispanic", 59 | "Percent Asian" = "% Asian") 60 | 61 | 62 | percent_map(var = data, 63 | color = color, 64 | legend.title = legend, 65 | max = input$range[2], 66 | min = input$range[1]) 67 | 68 | }) 69 | } 70 | 71 | 72 | 73 | # Run the application 74 | 
shinyApp(ui = ui, server = server) 75 | 76 | -------------------------------------------------------------------------------- /lectures/dataviz/shiny_section/map/data/counties.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/dataviz/shiny_section/map/data/counties.rds -------------------------------------------------------------------------------- /lectures/dataviz/shiny_section/map/helpers.R: -------------------------------------------------------------------------------- 1 | # Note: percent map is designed to work with the counties data set 2 | # It may not work correctly with other data sets if their row order does 3 | # not exactly match the order in which the maps package plots counties 4 | percent_map <- function(var, color, legend.title, min = 0, max = 100) { 5 | 6 | # generate vector of fill colors for map 7 | shades <- colorRampPalette(c("white", color))(100) 8 | 9 | # constrain gradient to percents that occur between min and max 10 | var <- pmax(var, min) 11 | var <- pmin(var, max) 12 | percents <- as.integer(cut(var, 100, 13 | include.lowest = TRUE, ordered = TRUE)) 14 | fills <- shades[percents] 15 | 16 | # plot choropleth map 17 | map("county", fill = TRUE, col = fills, 18 | resolution = 0, lty = 0, projection = "polyconic", 19 | myborder = 0, mar = c(0,0,0,0)) 20 | 21 | # overlay state borders 22 | map("state", col = "white", fill = FALSE, add = TRUE, 23 | lty = 1, lwd = 1, projection = "polyconic", 24 | myborder = 0, mar = c(0,0,0,0)) 25 | 26 | # add a legend 27 | inc <- (max - min) / 4 28 | legend.text <- c(paste0(min, " % or less"), 29 | paste0(min + inc, " %"), 30 | paste0(min + 2 * inc, " %"), 31 | paste0(min + 3 * inc, " %"), 32 | paste0(max, " % or more")) 33 | 34 | legend("bottomleft", 35 | legend = legend.text, 36 | fill = shades[c(1, 25, 50, 75, 100)], 37 | title = legend.title) 38 | } -------------------------------------------------------------------------------- /lectures/dataviz/shiny_section/maps.r: -------------------------------------------------------------------------------- 1 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 2 | # File Name : maps.r 3 | # Programmer Name : Luis Campos 4 | # lfcampos87@gmail.com 5 | # 6 | # Purpose : This file contains all the code needed to build a 7 | # maps Shiny app 8 | # 9 | # Date : 4/22/2016 10 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 11 | 12 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 13 | # This was taken straight out of Lesson 5 of the Shiny Tutorial 14 | # http://shiny.rstudio.com/tutorial/lesson5/ 15 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 16 | 17 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 18 | # You'll need to 19 | # 1. install maps and mapproj: install.packages(c("maps", "mapproj")) 20 | # 2. Download some pre-cleaned Census Data 21 | # http://shiny.rstudio.com/tutorial/lesson5/census-app/data/counties.rds 22 | # Save this to a /data subdirectory 23 | # 3. 
Download a plotting function they wrote for us so we can 24 | # concentrate on the Shiny app 25 | # http://shiny.rstudio.com/tutorial/lesson5/census-app/helpers.R 26 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 27 | 28 | library(maps) 29 | library(mapproj) 30 | source("helpers.R") 31 | counties <- readRDS("data/counties.rds") 32 | 33 | 34 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 35 | # percent_map is already written for us: Let's play with it 36 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 37 | percent_map(counties$white, "darkgreen", "% white") 38 | 39 | percent_map(counties$black, "black", "% Black") 40 | 41 | percent_map(counties$hispanic, "darkorange", "% Hispanic") 42 | 43 | percent_map(counties$asian, "darkviolet", "% Asian") 44 | 45 | 46 | 47 | 48 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 49 | # Notice that the % Asian map looks a bit pale; we can adjust the min-max 50 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 51 | hist(counties$asian) 52 | percent_map(counties$asian, "darkviolet", "% Asian", 0, 10) 53 | 54 | 55 | 56 | 57 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 58 | # So, we essentially have 2 options, 59 | # 1. a drop-down to select race 60 | # 2. a slider to select the min-max cutoffs 61 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 62 | 63 | -------------------------------------------------------------------------------- /lectures/dataviz/shiny_section/smoother/app.R: -------------------------------------------------------------------------------- 1 | # 2 | # This is a Shiny web application. You can run the application by clicking 3 | # the 'Run App' button above.
4 | # 5 | # Find out more about building applications with Shiny here: 6 | # 7 | # http://shiny.rstudio.com/ 8 | # 9 | 10 | library(dplyr) 11 | library(ggplot2) 12 | library(broom) 13 | library(stringr) 14 | library(lubridate) 15 | library(tidyr) 16 | library(XML) 17 | theurl <- paste0("http://www.pollster.com/08USPresGEMvO-2.html") 18 | polls_2008 <- readHTMLTable(theurl,stringsAsFactors=FALSE)[[1]] %>% 19 | tbl_df() %>% 20 | separate(col=Dates, into=c("start_date","end_date"), sep="-",fill="right") %>% 21 | mutate(end_date = ifelse(is.na(end_date), start_date, end_date)) %>% 22 | separate(start_date, c("smonth", "sday", "syear"), sep = "/", convert = TRUE, fill = "right")%>% 23 | mutate(end_date = ifelse(str_count(end_date, "/") == 1, paste(smonth, end_date, sep = "/"), end_date)) %>% 24 | mutate(end_date = mdy(end_date)) %>% mutate(syear = ifelse(is.na(syear), year(end_date), syear + 2000)) %>% 25 | unite(start_date, smonth, sday, syear) %>% 26 | mutate(start_date = mdy(start_date)) %>% 27 | separate(`N/Pop`, into=c("N","population_type"), sep="\ ", convert=TRUE, fill="left") %>% 28 | mutate(Obama = as.numeric(Obama)/100, 29 | McCain=as.numeric(McCain)/100, 30 | diff = Obama - McCain, 31 | day=as.numeric(start_date - mdy("11/04/2008"))) 32 | 33 | dat <- filter(polls_2008, start_date>="2008-06-01") %>% 34 | group_by(X=day) %>% 35 | summarize(Y=mean(diff)) 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | library(shiny) 45 | 46 | # Define the UI: a smoothing-method selector and bandwidth slider beside the plot 47 | ui <- fluidPage( 48 | # Application title 49 | titlePanel("Kernel Regression Smoother"), 50 | 51 | # Sidebar with a smoothing-method selector and a bandwidth slider 52 | sidebarLayout( 53 | sidebarPanel( 54 | selectInput("select", label = h3("Smoothing Method"), 55 | choices = list("Box Kernel" = 1, "Normal Kernel" = 2, "Loess Regression" = 3), 56 | selected = 1), 57 | sliderInput("bw", 58 | "Bandwidth:", 59 | min = 5, 60 | max = 50, 61 | step =2, 62 | value = 25) 63 | 64 | ), 65 | 66 | # Show the smoothed poll results 67 | mainPanel( 68 | plotOutput("plot") 69 | ) 70 | ) 71 | ) 72 | 73 | # Define server logic to fit the selected smoother and plot it 74 | server <- function(input, output) { 75 | output$plot <- renderPlot({ 76 | # Box Kernel 77 | if(input$select == 1){ 78 | mod <- ksmooth(dat$X, dat$Y, kernel="box", bandwidth = input$bw) 79 | fit <- data.frame(X=dat$X, .fitted=mod$y) 80 | } 81 | # Normal Kernel 82 | if(input$select == 2){ 83 | mod <- ksmooth(dat$X, dat$Y, kernel="normal", bandwidth = input$bw) 84 | fit <- data.frame(X=dat$X, .fitted=mod$y) 85 | } 86 | if(input$select == 3){ 87 | mod <- loess(Y~X, degree=1, span = input$bw/50, data=dat) 88 | fit <- augment(mod) 89 | } 90 | 91 | ggplot(dat, aes(X, Y)) + ylab("(% Obama - % McCain)") + 92 | xlab("Days to Election") + geom_point(cex=5) + 93 | geom_line(aes(x=X, y=.fitted), data=fit, color="red") + 94 | ggtitle('Aggregated Poll Results: 2008 U.S.
Presidential') 95 | }) 96 | } 97 | 98 | # Run the application 99 | shinyApp(ui = ui, server = server) 100 | 101 | -------------------------------------------------------------------------------- /lectures/dataviz/shiny_section/smoothing.r: -------------------------------------------------------------------------------- 1 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 2 | # File Name : smoothing.r 3 | # Programmer Name : Luis Campos 4 | # lfcampos87@gmail.com 5 | # 6 | # Purpose : This file contains all the code needed to build a 7 | # smoother Shiny app 8 | # 9 | # Date : 4/21/2016 10 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 11 | 12 | 13 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 14 | # This file contains all the pieces of the puzzle to create the Kernel 15 | # smoother shiny app: 16 | # - see finished product at: https://lfcampos.shinyapps.io/Kernel_Smoother/ 17 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 18 | 19 | 20 | 21 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 22 | # Libraries: not all may be needed, maybe filter these 23 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 24 | library(dplyr); library(ggplot2); library(broom) 25 | library(stringr); library(lubridate); library(tidyr) 26 | library(XML) 27 | 28 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 29 | # Download the data and clean up 30 | # Lots of cleanup here, feel free to change whatever you want. 31 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 32 | theurl <- paste0("http://www.pollster.com/08USPresGEMvO-2.html") 33 | polls_2008 <- readHTMLTable(theurl,stringsAsFactors=FALSE)[[1]] %>% 34 | tbl_df() %>% 35 | separate(col=Dates, into=c("start_date","end_date"), sep="-",fill="right") %>% 36 | mutate(end_date = ifelse(is.na(end_date), start_date, end_date)) %>% 37 | separate(start_date, c("smonth", "sday", "syear"), sep = "/", convert = TRUE, fill = "right")%>% 38 | mutate(end_date = ifelse(str_count(end_date, "/") == 1, paste(smonth, end_date, sep = "/"), end_date)) %>% 39 | mutate(end_date = mdy(end_date)) %>% mutate(syear = ifelse(is.na(syear), year(end_date), syear + 2000)) %>% 40 | unite(start_date, smonth, sday, syear) %>% 41 | mutate(start_date = mdy(start_date)) %>% 42 | separate(`N/Pop`, into=c("N","population_type"), sep="\ ", convert=TRUE, fill="left") %>% 43 | mutate(Obama = as.numeric(Obama)/100, 44 | McCain=as.numeric(McCain)/100, 45 | diff = Obama - McCain, 46 | day=as.numeric(start_date - mdy("11/04/2008"))) 47 | 48 | dat <- filter(polls_2008, start_date>="2008-06-01") %>% 49 | group_by(X=day) %>% 50 | summarize(Y=mean(diff)) 51 | 52 | 53 | 54 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 55 | # From a drop-down menu you should be able to select one of three options: 56 | # 1. "Box Kernel" = 1 57 | # 2. "Normal Kernel" = 2 58 | # 3. 
"Loess Regression" = 3 59 | # 60 | # You should also be able to select a Bandwith from a slider 61 | # 62 | # bw should be between 1 and 50, as fine precision as you want 63 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 64 | 65 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 66 | # For the Box Kernel smoother 67 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 68 | mod <- ksmooth(dat$X, dat$Y, kernel="box", bandwidth = bw) 69 | fit <- data.frame(X=dat$X, .fitted=mod$y) 70 | 71 | 72 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 73 | # For the Normal Kernel smoother 74 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 75 | mod <- ksmooth(dat$X, dat$Y, kernel="box", bandwidth = bw) 76 | fit <- data.frame(X=dat$X, .fitted=mod$y) 77 | 78 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 79 | # For Loess Regression Smoother 80 | # Note: If you look in the help file, you'll see span should be 81 | # between 0 and 1, so just divide by the max you chose above 82 | # for simplicity 83 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 84 | 85 | mod <- loess(Y~X, degree=2, span = bw/50, data=dat) 86 | fit <- augment(mod) 87 | 88 | 89 | 90 | 91 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 92 | # Make figure 93 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # 94 | ggplot(dat, aes(X, Y)) + ylab("(% Obama - % McCain)") + 95 | xlab("Days to Election") + geom_point(cex=5) + 96 | geom_line(aes(x=X, y=.fitted), data=fit, color="red") + 97 | ggtitle('Aggregated Poll Results: 2008 U.S. Presidential') 98 | 99 | 100 | -------------------------------------------------------------------------------- /lectures/git-and-github/images/git_add.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_add.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/git_clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_clone.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/git_commit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_commit.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/git_fetch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_fetch.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/git_layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_layout.png 
-------------------------------------------------------------------------------- /lectures/git-and-github/images/git_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_merge.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/git_push.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_push.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/git_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/git_status.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/gitclean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/gitclean.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/gitclone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/gitclone.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/gitcommit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/gitcommit.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/github-https-clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/github-https-clone.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/github-ssh-clone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/github-ssh-clone.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/github.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/github_ssh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/github_ssh.png 
-------------------------------------------------------------------------------- /lectures/git-and-github/images/gitpush.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/gitpush.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/gitstaged.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/gitstaged.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/gituntracked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/gituntracked.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/mac-git-security.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/mac-git-security.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/sshkeygen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/sshkeygen.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/wgi-defaultlines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/wgi-defaultlines.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/wgi-git-bash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/wgi-git-bash.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/wgi-scarymessage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/wgi-scarymessage.png -------------------------------------------------------------------------------- /lectures/git-and-github/images/wgi-usemintty.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/git-and-github/images/wgi-usemintty.png -------------------------------------------------------------------------------- /lectures/git-and-github/setting-up-git.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Setting up Git and GitHub 3 | output: html_document 4 | author: Stephanie Hicks, Luis Campos 5 | --- 6 | 7 | Starting 
with homework 2 in BIO 260 and CSCI E-107, we will be using a 8 | set of tools other than Canvas to submit homework assignments: 9 | git and GitHub. 10 | This is a tutorial that will help you 11 | install the tool git on your computer and create a GitHub account. Once 12 | you have set up git on your computer and registered for a GitHub account, 13 | you will be prepared for lecture on Wednesday Feb 10, 2016 where we 14 | will: 15 | 16 | 1. Introduce you to the basics of git and GitHub 17 | 2. Walk you through the homework submission process using git and GitHub 18 | 19 | #### Acknowledgements 20 | This lab is largely taken from the 21 | [first CS109 lab in 2015](https://github.com/cs109/2015lab1) which is 22 | in turn largely taken from 23 | [IACS's AC 297s](https://github.com/rdadolf/ac297r-tools-tutorial) course, 24 | and the world keeps turning. 25 | 26 | 27 | ## Create your GitHub account 28 | 29 | The first week of class we asked each of you to set up your GitHub account 30 | and submit your GitHub username in [this survey that was due 31 | Jan 29, 2016](http://goo.gl/forms/sxqUPZKfUf). 32 | If you have not done this yet, please fill out the survey 33 | as soon as possible. **Otherwise, you will not be able to submit 34 | your homework assignments starting with HW2.** 35 | 36 | To sign up for an account, just go to [github](https://github.com) 37 | and pick a unique username, an email address, and a password. 38 | Once you've done that, your github page will be at 39 | `https://github.com/<your-username>`. 40 | 41 | Github also provides a student 42 | [developer package](https://education.github.com/pack). 43 | This is something that might be nice to have, but it is not 44 | necessary for the course. Github may take some time to approve 45 | your application for the package. Please note that this is 46 | optional and you do not have to have the package 47 | approved to fill out the survey. 48 | 49 | 50 | #### Programming expectations 51 | 52 | All the lecture material and homework for this class will use R and 53 | R Markdown files. Knowledge of R is not a prerequisite for this course, 54 | **provided you are comfortable learning on your own as needed**. 55 | Basically, you should feel comfortable with: 56 | 57 | * How to look up R syntax on Google and StackOverflow. 58 | * Basic programming concepts like functions, loops, arrays, dictionaries, strings, and if statements. 59 | * How to learn new libraries by reading documentation. 60 | * Asking questions on Canvas forums and StackOverflow. 61 | 62 | 63 | ## Setting up your git environment 64 | 65 | ### 1. Installing git 66 | 67 | We will be using the [command line version of git](http://git-scm.com/docs/gittutorial). 68 | 69 | On Linux, install git using your system package manager (yum, apt-get, etc). 70 | 71 | On the Mac, if you ever installed [Xcode](https://developer.apple.com/xcode/), 72 | you should have git installed. Alternatively, you may have installed 73 | it using `homebrew`. Either of these is fine as long as the 74 | git version is greater than 2.0. To determine the version of git 75 | that is installed on your computer, open a terminal window and type: 76 | 77 | > $ `git --version` 78 | 79 | If git is installed, you should see a version number. Check to see if it 80 | is greater than version 2.0. If it is not, please update your version 81 | of git. 82 | 83 | If git is not installed on your Mac or Windows machine, go to http://git-scm.com. 84 | Accept all defaults in the installation process.
85 | On Windows, installing git will also install for you a minimal
86 | unix environment with a "bash" shell and terminal window.
87 | Voila, your Windows computer is transformed into a unixy form.
88 |
89 | #### Windows specific notes
90 |
91 | There will be an installer `.exe` file you need to click. Accept all the defaults.
92 |
93 | Here is a screenshot of one of the defaults. It makes sure you will have the "bash" tool talked about earlier.
94 |
95 | ![use git bash](./images/wgi-git-bash.png)
96 |
97 | Choose the default line-encoding conversion:
98 |
99 | ![default lines](./images/wgi-defaultlines.png)
100 |
101 | Use the terminal emulator they provide; it's better than the one shipped with Windows.
102 |
103 | ![use mintty](./images/wgi-usemintty.png)
104 |
105 | Towards the end, you might see a message like this. It looks scary, but all you need to do is click "Continue".
106 |
107 | ![scary message](./images/wgi-scarymessage.png)
108 |
109 |
110 | At this point you will have git installed. You can bring up "git bash"
111 | either from your start menu, or from the right click menu on any
112 | folder background. When you do so, a terminal window will open.
113 | This terminal is where you will issue further git setup commands,
114 | and git commands in general.
115 |
116 | Get familiar with the terminal. It opens in your home folder, and
117 | maps `\\` paths on Windows to more web/unix like paths with '/'.
118 | Try issuing the commands `ls`, `pwd`, and `cd folder` where folder
119 | is one of the folders you see when you do an `ls`. You can do
120 | a `cd ..` to come back up.
121 |
122 |
123 | #### Mac specific notes
124 |
125 | As mentioned earlier, if you ever installed Xcode or the
126 | "Command Line Developer tools", you may already have git.
127 | Make sure it's version 2.0 or higher (`git --version`).
128 |
129 | Or if you use **Homebrew**, you can install it from there.
130 | The current version on Homebrew is 2.4.3.
131 | If you installed git one of these ways, you don't need to do anything more in this section.
132 |
133 | -----
134 |
135 | First click on the `.mpkg` file that comes when you open the
136 | downloaded `.dmg` file.
137 |
138 | When I tried to install git on my Mac, I got a warning saying my
139 | security preferences wouldn't allow it to be installed. So I opened
140 | my system preferences and went to "Security".
141 |
142 | ![system pref](./images/mac-git-security.png)
143 |
144 | Here you must click "Open Anyway", and the installer will run.
145 |
146 | The installer puts git as `/usr/local/git/bin/git`.
147 | That's not a particularly useful spot. Open up `Terminal.app`.
148 | It's usually in `/Applications/Utilities`. Once the terminal opens up, issue
149 |
150 | > $ `sudo ln -s /usr/local/git/bin/git /usr/local/bin/git`
151 |
152 | Keep the Terminal application handy in your dock. (You could also
153 | download and use iTerm.app, which is a nicer terminal, if you are into
154 | terminal geek-ery). We'll be using the terminal extensively for git.
155 |
156 | Try issuing the commands `ls`, `pwd`, and `cd folder` where
157 | folder is one of the folders you see when you do an `ls`. You
158 | can do a `cd ..` to come back up.
159 |
160 | ### 2. Optional: Creating ssh keys on your machine
161 |
162 | This is an optional step, but it makes things much easier, so
163 | it's highly recommended.
164 |
165 | There are two ways git talks to GitHub: HTTPS, which is a
166 | web-based protocol
167 |
168 | ![github https](./images/github-https-clone.png)
169 |
170 | or over ssh
171 |
172 | ![github ssh](./images/github-ssh-clone.png)
173 |
174 | Which one you use is your choice. I recommend ssh, and the
175 | github urls in this homework and in labs will be ssh urls.
176 | Every time you contact your upstream repository (hosted on github),
177 | you need to prove you're you. You *can* do this with passwords over
178 | HTTPS, but it gets old quickly. By providing an ssh public key to
179 | github, your ssh-agent will handle all of that for you,
180 | and you won't have to put in any passwords.
181 |
182 | At your terminal, issue the command (skip this if you are a
183 | seasoned ssh user and already have keys):
184 |
185 | `ssh-keygen -t rsa`
186 |
187 | It will look like this:
188 | ![github ssh keygen](./images/sshkeygen.png)
189 |
190 | Accept the defaults. When it asks for a passphrase for your keys,
191 | put in none (you can put in one if you know how to set up an ssh-agent).
192 |
193 | This will create two files for you, in your home folder if
194 | you accepted the defaults.
195 |
196 | `id_rsa` is your PRIVATE key. NEVER NEVER NEVER give that to anyone.
197 | `id_rsa.pub` is your public key. You must supply this to github.
198 |
199 | ----
200 |
201 | ### 3. Optional: Uploading ssh keys and Authentication
202 |
203 | To upload an ssh key, log in to github and click on the gear icon
204 | in the top right corner (settings). Once you're there, click on
205 | "SSH keys" on the left. This page will contain all your ssh
206 | keys once you upload any.
207 |
208 | Click on "add ssh key" in the top right. You should see this box:
209 |
210 | ![github ssh](./images/github_ssh.png)
211 |
212 | The title field should be the name of your computer or some other
213 | way to identify this particular ssh key.
214 |
215 | In the key field, you'll need to copy and paste
216 | your *public* key. **Do not paste your private ssh key here.**
217 |
218 | When you hit "Add key", you should see the key name and some
219 | hexadecimal characters show up in the list. You're set.
220 |
221 | Now, whenever you clone a repository using this form:
222 |
223 | `$ git clone git@github.com:rdadolf/ac297r-git-demo.git`,
224 |
225 | you'll be connecting over ssh, and will not be asked for your github password.
226 |
227 | You will need to repeat steps 2 and 3 of the setup for each computer you wish to use with github.
228 |
229 | ### 4. Setting global config for git
230 |
231 | Again, from the terminal, issue the command
232 |
233 | `git config --global user.name "YOUR NAME"`
234 |
235 | This sets up a name for you. Then do
236 |
237 | `git config --global user.email "YOUR EMAIL ADDRESS"`
238 |
239 | Use the **SAME** email address you used in setting up your github account.
240 |
241 | These commands set up your global configuration. On my Mac,
242 | these are stored in the text file `.gitconfig` in my home folder.
243 |
244 |
245 |
--------------------------------------------------------------------------------
/lectures/git-and-github/submitting-HW-using-git.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: Homework with Git and GitHub
3 | output: html_document
4 | ---
5 |
6 | Starting with homework 2 in BIO 260 and CSCI E-107, we will be using
7 | git and GitHub to get your homework assignments, work on your homework,
8 | and submit your homework solutions.
This tutorial will walk you through
9 | that process using git and GitHub.
10 |
11 | #### Acknowledgements
12 | This lab is largely taken from the
13 | [first CS109 lab in 2015](https://github.com/cs109/2015lab1) which is
14 | in turn largely taken from
15 | [IACS's AC 297s](https://github.com/rdadolf/ac297r-tools-tutorial) course,
16 | and the world keeps turning.
17 |
18 |
19 |
20 |
21 | ## Getting and Working on Homework
22 |
23 | ### Cloning your Homework repository
24 | Each of you will be made members of the [`datasciencelabs-students`
25 | organization on GitHub](https://github.com/datasciencelabs-students).
26 | This means that your homework repositories all technically belong to us.
27 | But you will be granted unlimited access throughout the course!
28 |
29 |
![gitclean](./images/gitclean.png)
30 |
31 |
32 | You will notice when you visit the Data Science Labs'
33 | [Github page](https://github.com/datasciencelabs-students) that you can
34 | only see repositories with your GitHub username on them. You will get
35 | one repository for each homework throughout the semester. When a new
36 | homework is released you can go to the corresponding repository to
37 | see what is in store. The workflow will be pretty simple.
38 |
39 | 1. Go to: https://github.com/datasciencelabs-students
40 |
41 | 2. Click on the repository you want to work on. For
42 | example `<your-username>-2016HW2` for Homework 2.
43 |
44 | 3. Copy the link near the top of the page.
45 |
46 | 4. Go to your `Terminal` (on Mac) or `git bash` (on Windows),
47 | change directories into your BIO 260 folder.
48 |
49 | 5. Use `git clone` to clone the repository, for example:
50 |
51 | > `$ git clone https://github.com/datasciencelabs-students/<your-username>-2016HW2.git`
52 |
53 | 6. You should now see a new directory called `<your-username>-2016HW2`.
54 | Move into that directory.
55 |
56 | 7. If you type `git status` it will give you the current status of your
57 | directory. It should look something like this:
58 |
59 |
![gitclean](./images/gitclean.png)
60 |
61 |
62 | ### Working on your homework
63 |
64 | Once you have a local copy of your repository, it's time to get to work!
65 |
66 | After you have written some of your homework in an `Rmd` file, knitted it,
67 | made some pretty plots, and found out some cool stuff about the dataset, it's
68 | time to `add/commit/push`. After some work, if you head back to the `Terminal`
69 | and type `git status`, you will see that something has changed:
70 |
71 |
![gituntracked](./images/gituntracked.png)
72 |
73 | You will notice that there are two `untracked files`; these are here
74 | because I created these two files in the homework repository.
75 | In order to get git to track changes on these files we need to
76 | add them. So we type:
77 |
78 | > `$ git add HW2_Problems.Rmd HW2_Problems.html`
79 |
80 |
![gitstaged](./images/gitstaged.png)
81 |
82 | Now you will notice that the files have turned green and are now
83 | labeled as changes to be committed. Now it's time to commit.
84 | This is equivalent to `save` in most programs. But what is special
85 | about `git` and other version control software is that we can track
86 | and revert changes! We also need to give what's called a `commit message`,
87 | which will help us keep track of the changes we made when we look at
88 | this in the future. Leave detailed messages so that future you will
89 | know what you did. Future you will thank you. Notice the `-am` flag:
90 | the `a` stands for *all*, as in all tracked files, and the `m` stands
91 | for *message*.
92 |
93 | We do that by typing:
94 |
95 | ``
96 | git commit -am "This is my commit message, it is very detailed."
97 | ``
98 |
99 |
![gitcommit](./images/gitcommit.png)
100 |
101 | Cool! Now that we've saved our work in our local repository, we can
102 | push it to GitHub. Note, we can (and should) do this as many times as
103 | we want before the homework deadline. What is great about this is that
104 | it will make getting help from your TA easier, as well as keeping a
105 | copy of your work in the cloud in case your computer crashes, or you
106 | accidentally delete something.
107 |
108 |
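We do this by typing:

> `$ git push`
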
![gitpush](./images/gitpush.png)
109 |
110 | ### Summary
111 | To summarize, it is important to do the following
112 | steps whenever you finish working on your homework, to make full
113 | use of `git` and GitHub as well as generally have the best
114 | experience in this class.
115 |
116 | 1. Work on your homework
117 | 2. Add changes to track with: `git add`
118 | 3. Commit changes to your local repository: `git commit`
119 | 4. Push the changes to your github repo: `git push`
120 |
121 | Generally, keep this picture in mind whenever you go through this
122 | loop: it is important to only add changed files you care about,
123 | and nothing you do not care about. If certain files that you will
124 | never want to add keep popping up in your `git status`, e.g. `.Rhistory`,
125 | etc, add them to your `.gitignore` to simplify your life; this will keep
126 | those files from showing up here. For more info on this see
127 | `version-control.Rmd`.
128 |
129 | ![add](./images/git_add.png)
130 | ![commit](./images/git_commit.png)
131 | ![push](./images/git_push.png)
132 |
133 | # Late Day Policy
134 | From the course web-page:
135 |
136 | > Each student is given six late days for homework at the beginning of the semester. A late day extends the individual homework deadline by 24 hours without penalty. No more than two late days may be used on any one assignment. Assignments handed in more than 48 hours after the original deadline will not be graded. We do not accept any homework under any circumstances more than 48 hours after the original deadline. Late days are intended to give you flexibility: you can use them for any reason, no questions asked. You don't get any bonus points for not using your late days. Also, you can only use late days for the individual homework deadlines; all other deadlines (e.g., project milestones) are hard.
137 |
138 | We made this policy because we understand that you are all busy
139 | and things happen. We hope that this added flexibility gives you
140 | the freedom to enjoy the courses and engage with the material fully.
141 |
142 | ## Some unsolicited advice
143 |
144 | To be fair to all the students we have to enforce this late day policy,
145 | so we have put together a list of things to consider near the deadline.
146 |
147 | Say the homework is due Sunday at 11:59 pm.
148 |
149 | 1. If we do not see any more `commit`s after the deadline we will take
150 | the last `commit` as your final submission.
151 | 2. Check that the final `commit` is showing on your Github repo page.
152 | "I forgot to `push`" is not an acceptable excuse for late work.
153 | 3. It may help to add a message like "This is my final version of the
154 | homework please grade this" but that's up to you.
155 | 4. If there are `commit`s after the deadline **we will take the last `commit`**
156 | up to Tuesday at 11:59 pm as the final version.
157 | 5. We will assess the number of late days you used and keep track.
158 | 6. You **do not** need to tell us that you will take extra days; we will
159 | be able to see the time stamp of your last `commit`.
160 | 7. When you are done with the homework, do not `commit` or `push` any more.
161 | If you `commit` and `push` after the deadline you will be charged a late day.
162 | This is strict.
163 |
164 | # Happy `git`-ing
--------------------------------------------------------------------------------
/lectures/inference/aggregators.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Poll Aggregators"
3 | output: html_document
4 | ---
5 |
6 | ## In-class questions
7 | We're going to try something new today. If you have small technical questions during class, go to this [link](https://docs.google.com/document/d/1rClyG2ZMoTM9tER9zjXCBT-26lrpuIH_oSkCLCI6pII/edit?usp=sharing) and ask away.
8 |
9 |
10 | ## Averaging polls
11 |
12 |
13 | ```{r, echo=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
14 | library(ggplot2)
15 | library(dplyr)
16 | library(tidyr)
17 | library(pollstR)
18 |
19 | theme_set(theme_bw())
20 |
21 | race2012 <- pollstr_polls(topic = '2012-president', after= as.Date("2012-11-3"), max_pages = Inf)
22 |
23 | polls <- race2012$questions %>%
24 | filter(topic=="2012-president" & state=="US") %>%
25 | select(choice, value, margin_of_error, observations, id) %>%
26 | filter(choice %in% c("Obama","Romney") & !is.na(margin_of_error)) %>%
27 | spread(choice, value) %>%
28 | left_join(select(race2012$polls, id, pollster, method), by="id") %>%
29 | filter(method!="Internet") %>%
30 | mutate(diff= Obama - Romney) %>%
31 | select(pollster, diff, Obama, Romney, margin_of_error, observations)
32 |
33 | arrange(polls,diff) %>% rename( n=observations) %>%
34 | mutate(pollster=ifelse(pollster=="PPP (D-Americans United for Change)","PPP",pollster))
35 | ```
36 |
37 |
38 | Given that so many polls pointed to a toss-up, why was Nate Silver so confident?
39 |
40 |
41 | ## Competition Data
42 |
43 | Let's start by looking at our competition data.
44 |
45 | ```{r}
46 | library(readr)
47 | filename <- "https://raw.githubusercontent.com/datasciencelabs/data/master/blue-bead-comp-results.csv"
48 | tab <- read_csv(filename)
49 | names(tab) <- c("timestamp", "name", "estimate","N","ci")
50 | ```
51 |
52 | Looking at the data, are there more blue or red beads?
53 |
54 | BTW, the winners are:
55 |
56 | ```{r}
57 | date <- sapply(strsplit(tab$timestamp," "), function(x)x[1])
58 | date <- as.Date(date,"%m/%d/%Y")
59 | tab %>% mutate(date = date) %>% filter(date<"2016-02-23") %>%
60 | mutate(diff = abs(estimate - 53.4)) %>% arrange(diff)
61 | ```
62 |
63 | Now back to aggregating:
64 |
65 | ```{r}
66 | ## some entries were reported as proportions rather than percentages; rescale them
67 | tab <- mutate(tab, estimate=ifelse(estimate<1, estimate*100, estimate))
68 | boxplot(tab$estimate)
69 | ## remove clearly erroneous entries
70 | tab <- filter(tab, estimate>20)
71 | ```
72 |
73 |
74 | So now it's my turn to enter a competing entry.
75 |
76 | We can assume that all these entries were based on independent data. So, combined, the entries are based on this many draws:
77 |
78 | ```{r}
79 | tab %>% summarise(sum(N))
80 | ```
81 |
82 | We can deduce how many blue and red beads the first person saw:
83 |
84 | ```{r}
85 | slice(tab,1)
86 | ```
87 |
88 | In general we know each person saw
89 |
90 | ```{r}
91 | tab %>% mutate(blue = estimate/100 * N, red = (1-estimate/100) * N) %>% select(blue, red)
92 | ```
93 |
94 | So we can add these all up and create an estimate based on thousands of draws:
95 | We have $k=1,\dots,K$ polls, each with $N_k$ observations and an estimate $\hat{p}_k$. We can deduce then that for poll $k$, $N_k \hat{p}_k$ of the draws are blue.
So the aggregate $\hat{p}$ is
96 |
97 | $$
98 | \hat{p} = \frac{\sum_{k=1}^K \hat{p}_k N_k}{\sum_{k=1}^K N_k}
99 | $$
100 | which can be viewed as a weighted average:
101 |
102 | $$
103 | \hat{p} = \sum_{k=1}^K w_k \hat{p}_k \mbox{ with } w_k = \frac{N_k}{\sum_{k=1}^K N_k}
104 | $$
105 |
106 | So our new estimate is
107 |
108 | ```{r}
109 | N <- sum(tab$N)
110 | p_hat <- sum(tab$estimate*tab$N)/N
111 | ```
112 |
113 | And our confidence interval is pretty tight:
114 | ```{r}
115 | c(-1,1)*qnorm(0.975)*sqrt(p_hat/100*(1-p_hat/100))/sqrt(N)*100
116 | ```
117 |
118 | ## Back to the 2012 Election
119 |
120 | Let's compute this for Obama and Romney (we assume the undecided split at random)
121 |
122 | ```{r}
123 | O <- sum(polls$Obama*polls$observations)/sum(polls$observations)/100
124 | R <- sum(polls$Romney*polls$observations)/sum(polls$observations)/100
125 | ```
126 |
127 | and the difference is
128 | ```{r}
129 | round(O-R,2)
130 | ```
131 |
132 | The margin of error for each one is:
133 |
134 | ```{r}
135 | N <- sum(polls$observations)
136 | round( qnorm(.975)*sqrt(O*(1-O))/sqrt(N), 3)
137 | round( qnorm(.975)*sqrt(R*(1-R))/sqrt(N), 3)
138 | ```
139 |
140 | Now we have two confidence intervals. Can we form a confidence interval for the difference? The difference is approximately $\hat{p} - (1-\hat{p}) = 2\hat{p} - 1$. Because this is a linear function of $\hat{p}$ with slope 2, its standard error is twice as large as that of $\hat{p}$. So our confidence interval is:
141 |
142 | ```{r}
143 | O-R + c(-2,2)*qnorm(.975)*sqrt(O*(1-O))/sqrt(N)
144 | ```
145 |
146 | So a 95% confidence interval still didn't quite call the election for Obama (but it was close). However, as we will see next, there was much more involved in the "90% chance of winning" prediction.
147 |
148 | ## More Polls
149 |
150 | Let's get polls from the two months before the election.
151 |
152 | #### Assessment:
153 | Adapt the code above to make a plot of the difference between Obama and Romney against time.
154 |
155 |
156 |
157 | # Questions?
158 | If you have any questions, please fill this out: http://goo.gl/forms/BWzC2kVNZC 159 | -------------------------------------------------------------------------------- /lectures/inference/inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/inference.pdf -------------------------------------------------------------------------------- /lectures/inference/models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/models.pdf -------------------------------------------------------------------------------- /lectures/inference/pics/DLP_slide.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/DLP_slide.jpg -------------------------------------------------------------------------------- /lectures/inference/pics/DLP_slide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/DLP_slide.pdf -------------------------------------------------------------------------------- /lectures/inference/pics/Moneyball_Poster.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/Moneyball_Poster.jpg -------------------------------------------------------------------------------- /lectures/inference/pics/depodesta.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/depodesta.jpg -------------------------------------------------------------------------------- /lectures/inference/pics/jar-of-beads.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/jar-of-beads.jpg -------------------------------------------------------------------------------- /lectures/inference/pics/meonbike.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/meonbike.jpg -------------------------------------------------------------------------------- /lectures/inference/pics/nate-silver-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/nate-silver-1.png -------------------------------------------------------------------------------- /lectures/inference/pics/nate-silver-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/nate-silver-2.png -------------------------------------------------------------------------------- 
/lectures/inference/pics/nythist.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/nythist.gif
--------------------------------------------------------------------------------
/lectures/inference/pics/nytimesvotingpattern.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/nytimesvotingpattern.jpg
--------------------------------------------------------------------------------
/lectures/inference/pics/tweetsproductplacement.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/inference/pics/tweetsproductplacement.gif
--------------------------------------------------------------------------------
/lectures/inference/probability.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Probability"
3 | output: html_document
4 | ---
5 |
6 | # Probability and Inference
7 |
8 | Before we start today's lecture please complete the following probability assessment:
9 |
10 | [http://goo.gl/forms/St2D6YJxUO](http://goo.gl/forms/St2D6YJxUO).
11 |
12 | You are not being tested for a grade. In fact, we won't record your name. We simply want to adapt the lecture based on your knowledge. If you don't know the answer to a question, simply choose the "I would like a review" option.
13 |
14 | ## Motivation
15 |
16 | In 2012, Nate Silver [declared](http://fivethirtyeight.blogs.nytimes.com/fivethirtyeights-2012-forecast/?_r=0) that Barack Obama had a 91% chance of winning the election.
17 |
18 | ![](pics/nate-silver-1.png) | ![](pics/nate-silver-2.png)
19 |
20 | Several pundits were not happy. Here is what Joe Scarborough had to say:
21 |
22 | > Anybody that thinks that this race is anything but a tossup right now is such an ideologue, they should be kept away from typewriters, computers, laptops and microphones for the next 10 days, because they're jokes.
23 |
24 |
25 | To support this, you could easily cherry-pick from the polls at the time:
26 |
27 | ```{r, echo=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
28 | library(ggplot2)
29 | library(dplyr)
30 | library(tidyr)
31 | library(pollstR)
32 |
33 | theme_set(theme_bw())
34 |
35 | race2012 <- pollstr_polls(topic = '2012-president', after= as.Date("2012-11-3"), max_pages = Inf)
36 |
37 | polls <- race2012$questions %>%
38 | filter(topic=="2012-president" & state=="US") %>%
39 | select(choice, value, margin_of_error, observations, id) %>%
40 | filter(choice %in% c("Obama","Romney")) %>%
41 | spread(choice, value) %>%
42 | left_join(select(race2012$polls, id, pollster, method), by="id") %>%
43 | filter(method!="Internet") %>%
44 | mutate(diff= Obama - Romney) %>%
45 | select(pollster, diff, Obama, Romney, margin_of_error, observations)
46 |
47 | arrange(polls,diff) %>% rename( n=observations) %>%
48 | mutate(pollster=ifelse(pollster=="PPP (D-Americans United for Change)","PPP",pollster))
49 | polls %>%
50 | mutate(margin_of_error=ifelse(is.na(margin_of_error),0,margin_of_error)) %>%
51 | arrange(diff) %>%
52 | ggplot( aes(seq_along(diff), Obama-Romney,
53 | min=diff-2*margin_of_error,
54 | max=diff+2*margin_of_error,
55 | color=pollster)) +
56 | geom_point() + geom_errorbar() +
57 | xlab("") +
58 | theme(axis.ticks = element_blank(),
59 | axis.text.x = element_blank())
60 |
61 | ```
62 |
63 | Not only did Nate Silver correctly predict that Obama would win, but he also predicted the outcome of all 50 states and DC correctly. Here we will explain how poll aggregators use data science to make these predictions. Our first step is to understand probability. We will learn what _margin of error_ means and why there is variability between different polls.
64 |
65 | ## Relative Frequency
66 |
67 | The word probability is used in everyday language. For example, Google's auto-complete of "What are the chances of" gives us "getting pregnant", "having twins", and "rain tomorrow". Answering such questions precisely is hard, if not impossible. Here we discuss a mathematical definition of _probability_ that does permit us to give precise answers to certain questions.
68 |
69 | For example, if I have 2 red beads and 3 blue beads in a bag and I pick one at random, what is the probability of picking a red one? The answer is 2/5 or 40%. This can be considered a definition: the probability of an event happening is the proportion of times it happens if we repeat the choice over and over, independently and under the same conditions.
70 |
71 |
72 | ### Notation
73 |
74 | We use the notation $\mbox{Pr}(A)$ to denote the probability of event $A$ happening. In data science applications, we will often deal with continuous variables, and events will be of the form $X > a$; in other words, the event is that an observed random quantity is larger than a constant $a$. We will say more on this below.
75 |
76 |
77 | ## Monte Carlo Simulations
78 |
79 | Computers provide a way to actually perform the experiment described above. Random number generators permit us to mimic the process of picking at random. An example is the `sample` function in R. Let's recreate the example above. We first use the function `rep` to generate the beads.
80 |
81 | ```{r}
82 | beads <- rep( c("red", "blue"), times = c(2,3))
83 | ```
84 |
85 | To pick a bead at random we simply type
86 |
87 | ```{r}
88 | sample( beads, 1)
89 | ```
90 |
91 | Now, above we used the phrase "over and over".
Technically, we can't repeat this over and over, but we can repeat the experiment a large enough number of times to make it practically equivalent. This is referred to as a _Monte Carlo_ simulation. The `replicate` function permits us to repeat the same task, say, $B$ times:
92 |
93 | ```{r}
94 | B <- 10^5
95 | events <- replicate( B, sample( beads, 1))
96 | ```
97 |
98 | We can now see if, in fact, our definition is in agreement with this simulation. We introduce two new useful functions, `table` and `prop.table`. The function `table` quickly tabulates the outcomes
99 |
100 | ```{r}
101 | tab <- table(events)
102 | tab
103 | ```
104 |
105 | and `prop.table` gives us the proportions:
106 |
107 | ```{r}
108 | prop.table(tab)
109 | ```
110 |
111 | Now, before we continue, let's point out that `sample` can pick more than one element. However, this selection occurs _without replacement_. Note what happens when we ask to select five beads:
112 |
113 | ```{r}
114 | sample(beads, 5)
115 | sample(beads, 5)
116 | ```
117 |
118 | This results in a rearrangement of the beads: once a bead is selected it is not returned, so each of the five beads appears exactly once. To repeat the same experiment of picking one out of 5, over and over, we need to sample _with replacement_. We can tell `sample` to do this as many times as we want:
119 |
120 | ```{r}
121 | events <- sample(beads, B, replace = TRUE) ##default is FALSE
122 | prop.table( table( events ) )
123 | ```
124 |
125 | This code is equivalent to what we did above with `replicate`. When the same exact experiment is conducted and one outcome does not affect the other, we say that the events are _independent_.
126 |
127 |
128 | ## Probability Distributions
129 |
130 | We previously described distributions. In particular we defined the height distribution for our class. There is a connection between distributions and probabilities. We introduced the empirical cumulative distribution function and the notation $F(a)$, which represents the proportion of values in our list below or equal to a value $a$. Suppose our list of numbers is:
131 |
132 | ```{r}
133 | x <- as.numeric(beads == "blue")
134 | x
135 | ```
136 |
137 | We can quickly see that, for example, $F(0) = 2/5$:
138 |
139 | ```{r}
140 | mean(x <= 0)
141 | ```
142 |
143 |
144 | If we pick an element at random from this list:
145 |
146 | ```{r}
147 | X <- sample(x,1)
148 | ```
149 |
150 | then intuitively we see that the probability of the observed value $X$ being smaller than or equal to $0$ is $\mbox{Pr}(X \leq 0) = \mbox{Pr}(X = 0) = F(0)$. We also see that
151 | $\mbox{Pr}(X=1) = F(1) - F(0) = 3/5$. The cumulative distribution function defines the probabilities of elements picked at random from that list.
152 |
153 | Assessment: If we pick a student from this class at random, what is the probability that this student is taller than 6 feet?
154 |
155 | ```{r,echo=FALSE, warning=FALSE}
156 | ##Your code here
157 | ```
158 |
159 | Note that for categorical outcomes, such as blue and red beads, the concept of a cumulative distribution function is not very intuitive. This is particularly true when we have more than two outcomes. Instead, it is more natural to define the probability of each possible event: $\mbox{Pr}(\mbox{blue bead})=3/5$ and $\mbox{Pr}(\mbox{red bead})=2/5$.
160 |
161 |
162 | ## Conditional Probabilities
163 |
164 | An important concept that we will start seeing more and more relates to _conditional_ distributions and probabilities. We start with an example.
165 |
166 | Assessment:
167 |
168 | 1. I have a deck of cards. I take two cards (without replacement). What is the probability that the first card is the three of hearts? Using the notation above we write: what is $\mbox{Pr}(\mbox{first card is } 3\heartsuit)$?
169 |
170 | 2. Given that the first card is not the three of hearts, what is the probability that the second one is the three of hearts? We introduce new notation:
171 |
172 | $$\mbox{Pr}(\mbox{second card is } 3\heartsuit \mid \mbox{first card is not } 3\heartsuit )$$
173 |
174 |
175 | ### `expand.grid`
176 |
177 | Before continuing, we quickly introduce the `paste` function, which can be quite useful. We use it to create strings by joining smaller strings. For example, if we have the number and suit of a card, we can get the card name like this:
178 |
179 | ```{r}
180 | number <- "Three"
181 | suit <- "Hearts"
182 | paste(number, suit)
183 | ```
184 |
185 | To see why this is useful, we will use the function `expand.grid` to generate all possible combinations. Here is a quick example:
186 |
187 | ```{r}
188 | expand.grid(letters[1:3], c(2,4))
189 | ```
190 |
191 | So here is how we generate a deck of cards:
192 | ```{r}
193 | suits <- c("Diamonds", "Clubs", "Hearts", "Spades")
194 | numbers <- c("Ace", "Deuce", "Three", "Four","Five", "Six", "Seven", "Eight", "Nine", "Ten", "Jack", "Queen", "King")
195 | deck <- expand.grid( number=numbers, suit=suits)
196 | deck <- paste( deck$number, deck$suit)
197 | ```
198 |
199 | To answer the first question above we can simply compute
200 | ```{r}
201 | mean( deck =="Three Hearts")
202 | ```
203 |
204 | which is $1/52$.
205 |
206 | Now let's generate all combinations of picking two cards. For this we use the function `permutations`:
207 |
208 | ```{r}
209 | library(gtools)
210 | index <- permutations(52,2) ## all ordered pairs of two distinct cards
211 | first_card <- deck[index[,1]]
212 | second_card <- deck[index[,2]]
213 | sum(second_card == "Three Hearts") /
214 | sum( first_card != "Three Hearts" )
215 | ```
216 |
217 | which is $1/51$.
218 |
219 | ## Multiplication rule
220 |
221 | If we want to know the probability of two events, say $A$ and $B$, occurring, we can use the multiplication rule
222 |
223 | $$ \mbox{Pr}(A \mbox{ and } B) = \mbox{Pr}(A)\mbox{Pr}(B \mid A)
224 | $$
225 |
226 | We can use induction to expand for more events:
227 |
228 | $$ \mbox{Pr}(A \mbox{ and } B \mbox{ and } C) = \mbox{Pr}(A)\mbox{Pr}(B \mid A)\mbox{Pr}(C \mid A \mbox{ and } B)
229 | $$
230 |
231 |
232 | Above we mentioned _independent_ events. Using mathematical notation, we can define $B$ as being independent from $A$ as
233 |
234 | $$\mbox{Pr}(B \mid A) = \mbox{Pr}(B)$$
235 |
236 | When we have independent events, the multiplication rule becomes simpler:
237 |
238 | $$ \mbox{Pr}(A \mbox{ and } B \mbox{ and } C) = \mbox{Pr}(A)\mbox{Pr}(B)\mbox{Pr}(C)
239 | $$
240 |
241 | But we have to be very careful here, as assuming independence when it does not hold can result in very different answers.
242 |
243 |
244 | ### Birthday Problem
245 |
246 | There are 50 people in this room. What is the chance that at least two people have the same birthday?
247 |
248 | We can use a Monte Carlo simulation.
249 |
250 | Assessment: Use `sample` to randomly select 50 birthdays. Check if they are unique.
251 |
252 | ```{r}
253 | ##your code here
254 | ```
255 |
256 | ```{r birthday-problem}
257 | compute_prob <- function(n, B=10000){
258 | same_day <- replicate(B, {
259 | bdays <- sample(1:365, n, replace=TRUE)
260 | any(duplicated(bdays))
261 | })
262 | mean(same_day)
263 | }
264 | n <- seq(2,60)
265 | prob <- sapply(n, compute_prob)
266 | plot(n, prob)
267 | ```
268 |
269 | In this case it is much faster to actually do the math. It is also exact, so it is more precise.
270 |
271 | Instead of computing the probability of it happening, we will compute the probability of it not happening. Let's start with the first person. The probability that person one has a unique birthday is 1. The probability that the second person has a unique birthday, given that person one already took one, is 364/365. For person three, given that the first two have unique birthdays, there are 363 days to choose from. We continue this way to find the chance of all 50 people having unique birthdays.
272 |
273 | Assessment: Try to compute it using the multiplication rule.
274 |
275 | We can write a function that does this for any number:
276 |
277 | ```{r}
278 | exact_prob <- function(n){
279 | prob_unique <- seq(365,365-n+1)/365
280 | 1 - prod( prob_unique)
281 | }
282 | eprob <- sapply(n, exact_prob)
283 | plot(n, prob)
284 | lines(n,eprob)
285 | ```
286 |
287 |
288 |
289 |
290 |
--------------------------------------------------------------------------------
/lectures/ml/RafaClass_Ensembles_Rose.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/ml/RafaClass_Ensembles_Rose.pdf
--------------------------------------------------------------------------------
/lectures/ml/cross-validation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/ml/cross-validation.pdf
--------------------------------------------------------------------------------
/lectures/ml/cv.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "cv"
3 | output: html_document
4 | ---
5 |
6 | ## Cross validation
7 | ```{r, message=FALSE, warning=FALSE}
8 | library(readr)
9 | library(dplyr)
10 | library(tidyr)
11 | library(ggplot2)
12 | theme_set(theme_bw(base_size = 16))
13 | ```
14 |
15 | ```{r}
16 | plotit <- function(dat, i, n=sqrt(ncol(dat)-1)){
17 | dat <- slice(dat,i)
18 | tmp <- expand.grid(Row=1:n, Column=1:n) %>%
19 | mutate(id=i, label=dat$label,
20 | value = unlist(dat[,-1]))
21 | tmp%>%ggplot(aes(Row, Column, fill=value)) +
22 | geom_raster() +
23 | scale_y_reverse() +
24 | scale_fill_gradient(low="white", high="black") +
25 | ggtitle(tmp$label[1])
26 | }
27 | ```
28 |
29 |
30 | ```{r}
31 | url <- "https://raw.githubusercontent.com/datasciencelabs/data/master/hand-written-digits-train.csv"
32 | original_dat <- read_csv(url)
33 | original_dat <- mutate(original_dat, label = as.factor(label))
34 | ```
35 |
36 | There is a test set with no labels given:
37 |
38 | ```{r}
39 | url <- "https://raw.githubusercontent.com/datasciencelabs/data/master/hand-written-digits-test.csv"
40 | original_test<- read_csv(url)
41 | View(original_test)
42 | ```
43 |
44 | ## Data Exploration
45 |
46 | ```{r}
47 | X <- sample_n(original_dat,200) %>%
48 | arrange(label)
49 |
50 | d <- dist(as.matrix(X[,-1]))
51 | image(as.matrix(d))
52 |
53 |
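## cluster the 200 sampled digits with hierarchical clustering; the dendrogram
## labels are the true digits, so similar-looking digits should group together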
plot(hclust(d), labels=as.character(X$label))
54 | ```
55 |
56 | 784 predictors is too much for us to handle in a demo, so we will compress the predictors by combining groups of 16 pixels (4 x 4 blocks).
57 |
58 | ```{r}
59 | tmp <- slice(original_dat,1:100)
60 | names(tmp) <- gsub("pixel","",names(tmp))
61 | tmp <- tmp %>% mutate(obs = 1:nrow(tmp))
62 | tmp <- tmp %>% gather(feature, value, `0`:`783`)
63 | tmp <- tmp %>% mutate(feature = as.numeric(feature))
64 | tmp <- tmp %>% mutate(row = feature%%28, col =floor(feature/28))
65 | tmp <- tmp %>% mutate(row = floor(row/4), col = floor(col/4))
66 | tmp <- tmp %>% group_by(obs, row, col)
67 | tmp <- tmp %>% summarize(label = label[1], value = mean(value))
68 | tmp <- tmp %>% ungroup
69 | tmp <- tmp %>% mutate(feature = sprintf("X_%02d_%02d",col,row))
70 | tmp <- tmp %>% select(-row, -col)
71 | tmp <- tmp %>% group_by(obs) %>% spread(feature, value) %>% ungroup %>% select(-obs)
72 | ```
73 |
74 | Let's write a function that does this:
75 |
76 | ```{r}
77 | compress <- function(tbl, n=4){
78 | names(tbl) <- gsub("pixel","",names(tbl))
79 | tbl %>% mutate(obs = 1:nrow(tbl)) %>%
80 | gather(feature, value, `0`:`783`) %>%
81 | mutate(feature = as.numeric(feature)) %>%
82 | mutate(row = feature%%28, col =floor(feature/28)) %>%
83 | mutate(row = floor(row/n), col = floor(col/n)) %>%
84 | group_by(obs, row, col) %>%
85 | summarize(label = label[1], value = mean(value)) %>%
86 | ungroup %>%
87 | mutate(feature = sprintf("X_%02d_%02d",col,row)) %>%
88 | select(-row, -col) %>%
89 | group_by(obs) %>% spread(feature, value) %>%
90 | ungroup %>%
91 | select(-obs)
92 | }
93 | ```
94 |
95 | Compress the entire dataset. This will take a bit:
96 |
97 |
98 | ```{r}
99 | dat <- compress(original_dat)
100 | ```
101 |
102 | Note that some features are almost always 0:
103 |
104 | ```{r}
105 | library(caret)
106 | set.seed(1)
107 | inTrain <- createDataPartition(y = dat$label,
108 | p=0.9)$Resample
109 | X <- dat %>% select(-label) %>% slice(inTrain) %>% as.matrix
110 | column_means <- colMeans(X)
111 | plot(table(round(column_means)))
112 | ```
113 |
114 | Let's remove these low-information features:
115 |
116 | ```{r}
117 | keep_columns <- which(column_means>10)
118 | ```
119 |
120 | Let's define the training data and test data:
121 |
122 | ```{r}
123 | train_set <- slice(dat, inTrain) %>%
124 | select(label, keep_columns+1)
125 | test_set <- slice(dat, -inTrain) %>%
126 | select(label, keep_columns+1)
127 | ```
128 |
129 | Note that the distances look a bit cleaner:
130 | ```{r}
131 | X <- sample_n(train_set,200) %>%
132 | arrange(label)
133 |
134 | d <- dist(as.matrix(X[,-1]))
135 | image(as.matrix(d))
136 | plot(hclust(d),labels=as.character(X$label))
137 | ```
138 |
139 |
140 |
141 | ```{r}
142 | tmp <- sample_n(train_set, 5000)
143 |
144 | control <- trainControl(method='cv', number=20) ## 20-fold cross-validation
145 | res <- train(label ~ .,
146 | data = tmp,
147 | method = "knn",
148 | trControl = control,
149 | tuneGrid=data.frame(k=seq(1,15,2)),
150 | metric="Accuracy")
151 |
152 | plot(res)
153 |
154 | fit <- knn3(label~., train_set, k=3)
155 | pred <- predict(fit, newdata = test_set, type="class")
156 |
157 | tab <- table(pred, test_set$label)
158 | confusionMatrix(tab)
159 | ```
160 |
161 |
162 | Compete in Kaggle?
163 |
164 | ```{r}
165 | original_test <- mutate(original_test, label=NA)
166 | test <- compress(original_test)
167 | test <- test %>% select(label, keep_columns+1)
168 | pred <- predict(fit, newdata = test, type="class")
169 |
170 | i <- 11
171 | pred[i]
172 | plotit(original_test,i)
173 |
174 | res <- data.frame(ImageId=1:nrow(test),Label=as.character(pred))
175 | write.csv(res, file="test.csv", row.names=FALSE)
176 | ```
177 |
--------------------------------------------------------------------------------
/lectures/ml/matrices.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | layout: page
3 | title: Matrices in R
4 | ---
5 |
6 | ```{r}
7 | x_1 <- c(1,0,0,1)
8 | x_2 <- c(2,4,6,3)
9 | x_3 <- c(6,4,2,4)
10 | X <- cbind(x_1, x_2, x_3)
11 | X
12 | ```
13 |
14 | Alternatively
15 |
16 | ```{r}
17 | X <- matrix( c(x_1, x_2, x_3), nrow=4, ncol=3)
18 | X
19 | ```
20 |
21 | We can extract specific entries using the `[` character:
22 |
23 | ```{r}
24 | X[2,3]
25 | ```
26 |
27 | We can extract columns like this:
28 |
29 | ```{r}
30 | X[ ,c(1,2)]
31 | ```
32 |
33 | and rows like this:
34 |
35 | ```{r}
36 | X[c(2,3),]
37 | ```
38 |
39 | Be aware that if you select one column or one row, the result is no longer a matrix but a numeric vector.
40 |
41 | ```{r}
42 | X[,1]
43 | class(X[,2:3])
44 | class(X[,1])
45 | ```
46 | However, we can preserve the matrix class by using the argument `drop`:
47 |
48 | ```{r}
49 | X[,1,drop=FALSE]
50 | ```
51 |
52 |
53 | Useful functions:
54 |
55 | ```{r}
56 | rowMeans(X)
57 | ```
58 |
59 | ```{r}
60 | colMeans(X)
61 | ```
62 |
63 | Also in the matrixStats package:
64 |
65 | ```{r}
66 | library(matrixStats)
67 | colSds(X)
68 | colRanks(X)
69 | ```
70 |
71 | and many other useful functions.
72 |
73 | We can apply any function to the rows using apply
74 |
75 | ```{r}
76 | apply(X,1,mean) ##same as rowMeans
77 | ```
78 |
79 | or the columns
80 |
81 |
82 | ```{r}
83 | apply(X,2,mean) ##same as colMeans
84 | ```
85 |
86 | We can also define our own functions:
87 |
88 | ```{r}
89 | apply(X,2,function(x){
90 | c(sum(x[1:2]), sum(x[3:4]))
91 | })
92 | ```
93 |
94 | One approach that is very powerful in its simplicity and speed is to filter columns based on statistics of those columns:
95 |
96 | ```{r}
97 | X[ , colMeans(X)>3]
98 | ```
99 |
100 | We can also do things like this:
101 |
102 | ```{r}
103 | X[X<1] <- 0.5
104 | X
105 | ```
106 |
107 | or like this
108 |
109 | ```{r}
110 | X[2,3]<-NA
111 | X[is.na(X)] <- 0
112 | X
113 | ```
114 |
115 | We can also use this nice trick:
116 |
117 | ```{r}
118 | X - rowMeans(X)
119 | ```
120 |
121 | we can scale each row like this:
122 |
123 | ```{r}
124 | (X - rowMeans(X)) /rowSds(X)
125 | ```
126 |
127 | but be careful, it does not work for columns.
For columns, do this:
128 |
129 | ```{r}
130 | sweep(X, 2, colMeans(X))
131 | ```
132 | or
133 | ```{r}
134 | t( t(X) - colMeans(X) )
135 | ```
136 |
137 |
138 | Finally, if you know linear algebra, here is the transpose:
139 | ```{r}
140 | t(X)
141 | ```
142 |
143 | matrix multiplication:
144 | ```{r}
145 | t(X) %*% X
146 | ```
147 |
148 | cross product:
149 |
150 | ```{r}
151 | crossprod(X)
152 | ```
153 |
154 | inverse:
155 | ```{r}
156 | solve( crossprod(X))
157 | ```
158 |
159 | QR decomposition:
160 |
161 | ```{r}
162 | qr(X)
163 | ```
164 |
165 |
166 |
--------------------------------------------------------------------------------
/lectures/models/bayes.Rpres:
--------------------------------------------------------------------------------
1 | Bayes' Rule
2 | ====
3 | author: Rafael A. Irizarry
4 | transition: none
5 |
6 |
7 | Cystic Fibrosis Test
8 | ====
9 |
10 | * A test for cystic fibrosis has an accuracy of 99%:
11 |
12 | $$\mbox{Prob}(+\mid D)=0.99, \mbox{Prob}(-\mid \mbox{no } D)=0.99$$
13 |
14 |
15 | * If we select a random person and they test positive, what is the probability that they have the disease?
16 |
17 | * We write this as $\mbox{Prob}(D\mid+)?$
18 |
19 | * The cystic fibrosis rate is $\mbox{Prob}(D) \approx 0.00025$
20 |
21 | Bayes Rule
22 | ====
23 |
24 |
25 |
26 | 27 | $$ 28 | \mbox{Pr}(A|B) = \frac{\mbox{Pr}(B|A)\mbox{Pr}(A)}{\mbox{Pr}(B)} 29 | $$ 30 | 31 | 32 | Bayes Rule Applied to Cystic Fibrosis Test 33 | ==== 34 |
35 | 36 | $$ 37 | \begin{eqnarray*} 38 | \mbox{Prob}(D|+) & = & \frac{ P(+|D) \cdot P(D)} {\mbox{Prob}(+)} \\ 39 | & = & \frac{\mbox{Prob}(+|D)\cdot P(D)} {\mbox{Prob}(+|D) \cdot P(D) + \mbox{Prob}(+|\mbox{no } D) \mbox{Prob}(\mbox{no } D)} \\ 40 | \end{eqnarray*} 41 | $$ 42 | 43 | 44 | Bayes Rule 45 | ==== 46 | 47 | $$ 48 | \begin{eqnarray*} 49 | \mbox{Prob}(D|+) & = & \frac{ P(+|D) \cdot P(D)} {\mbox{Prob}(+)} \\ 50 | & = & \frac{\mbox{Prob}(+|D)\cdot P(D)} {\mbox{Prob}(+|D) \cdot P(D) + \mbox{Prob}(+|\mbox{no } D) \mbox{Prob}(\mbox{no } D)} \\ 51 | & = & \frac{0.99 \cdot 0.00025}{0.99 \cdot 0.00025 + 0.01 \cdot (.99975)} \\ 52 | & = & 0.02 \;\;\; \mbox{not} \; \; \; 0.99 53 | \end{eqnarray*} 54 | $$ 55 | 56 | 57 | Monte Carlo Simulation 58 | ==== 59 | 60 | Assessment: Write a Monte Carlo simulation that shows this. Let's go step-by-step. http://goo.gl/forms/JPFdaptfkv 61 | 62 | * Define a population with 1 in 4000 people with CF. 63 | * Take one at random 64 | * Give them the test that is 99% accurate. 65 | * Return one of four: TP, FP, TN, FN 66 | * Repeat 10,000 67 | * Compute TP / P = TP / (TP +FP) 68 | 69 | Simulation 70 | === 71 | 72 | ```{r,echo=FALSE,fig.height=7,fig.width=11} 73 | set.seed(3) 74 | prev <- 1/20 75 | acc <- 0.90 76 | N <- 20; M <- 80 77 | x<-rbinom(N*M,1,p=prev) 78 | cols <- c("grey","red") 79 | people <- expand.grid(1:M,N:1) 80 | people2 <- expand.grid(1:(M/2),N:1) 81 | cols1 <- cols[x+1] 82 | cols2 <- rep(NA,length(cols1));count2<-1 83 | cols3 <- rep(NA,length(cols1));count3<-1 84 | rafalib::mypar() 85 | layout(matrix(c(1,2,1,3),2,2)) 86 | plot(people,col=cols1,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main=paste0("Population: ",round(mean(x)*100),"% are red")) 87 | axis(side=1,M/2,"O",col="black",tick=FALSE,cex.axis=2,line=1.5) 88 | plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Positive") 89 | plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Negative") 90 | 91 | ``` 92 | 93 | ```{r,eval=FALSE,include=FALSE} 94 | library(animation) 95 | file.remove("/Users/ririzarr/myDocuments/teaching/cs109/2014_working/lectures/lecture14/bayes.gif") 96 | saveGIF({ 97 | i=1 98 | while(count3<=(N*M/2) & count2<=(N*M/2)){ 99 | test <- sample(100,1);min=round(100*acc) 100 | rafalib::mypar2() 101 | layout(matrix(c(1,2,1,3),2,2)) 102 | plot(people,col=cols1,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main=paste0("Population: ",round(mean(x)*100),"% are red")) 103 | if(test>min) axis(side=1,M/2,"X",col="red",tick=FALSE,cex.axis=3,line=1.5) else axis(side=1,M/2,"O",col="black",tick=FALSE,cex.axis=2,line=1.5) 104 | points(people[i,],pch=1,cex=1.5) 105 | if(all(is.na(cols2))) plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Positive") else plot(people2,col=cols2,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main=paste0("Tested Positive: ",round(mean(cols2=="red",na.rm=TRUE)*100),"% are red")) 106 | if(all(is.na(cols3))) plot(people2,type="n",pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main="Tested Negative") else plot(people2,col=cols3,pch=16,xaxt="n",yaxt="n",xlab="",ylab="",main=paste0("Tested Negative: ",round(mean(cols3=="red",na.rm=TRUE)*100,1),"% are red")) 107 | outcome <- ifelse(x[i]==1, as.numeric(test<=min), as.numeric(test>min)) 108 | if(outcome==0) {cols3[count3]<-cols1[i];count3<-count3+1} else {cols2[count2]<-cols1[i];count2<-count2+1} 109 | i<-i+1 110 | }},'bayes.gif', interval = .1, ani.width = 800, ani.height = 500,outdir="/Users/ririzarr/myDocuments/teaching/cs109/2014_working/lectures/lecture14") 
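## NOTE: this chunk is set to eval=FALSE, include=FALSE; it was run once
## offline with the animation package to render bayes.gif from these frames.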
111 | ```
112 |
113 | Simulation
114 | ====
115 |
116 | $\mbox{Prob}(D)$ = % red in the top panel; an X marks a testing error, e.g. a $+$ call when $\mbox{no } D$.
117 | $\mbox{Prob}(D \mid +)$ = % red in the bottom-left (tested positive) panel; $\mbox{Prob}(D \mid -)$ = % red in the bottom-right (tested negative) panel.
118 |
119 |
120 | Application in Baseball
121 | =====
122 |
123 | José Iglesias 2013
124 | ===
125 |
126 | ![iglesias](http://upload.wikimedia.org/wikipedia/commons/thumb/9/98/Jos%C3%A9_Iglesias_on_September_28%2C_2012.jpg/902px-Jos%C3%A9_Iglesias_on_September_28%2C_2012.jpg)
127 |
128 |
129 | ***
130 |
131 |
132 | | Month | At Bats | H | AVG |
133 | |-------|---------|---|-----|
134 | | April | 20 | 9 | .450 |
135 |
136 | What is your prediction for his average in October?
137 |
138 | The standard error is
139 | $$
140 | \sqrt{\frac{.450 (1-.450)}{20}}=.111
141 | $$
142 |
143 | The CI is $.450 \pm .222 = .228$ to $.672$.
144 |
145 | No one has batted .400 since Ted Williams in
146 | 1941!
147 |
148 |
149 | Distribution of AVG
150 | ===
151 | This is for all players (>500 AB) in 2012, 2013, 2014
152 |
153 | ```{r,fig.align="center",echo=FALSE,fig.width=14,fig.height=6}
154 | library(Lahman)
155 | library(dplyr)
156 | library(ggplot2)
157 | theme_set(theme_bw(base_size = 24))
158 | Batting %>%
159 | filter(yearID%in%2012:2014 & AB>500) %>%
160 | mutate(AVG=H/AB*1000) %>%
161 | ggplot(aes(AVG)) +
162 | geom_histogram(bins=9) +
163 | facet_grid(.~yearID)
164 | ```
165 |
166 | Average is about 275 and SD is about 27
167 | ====
168 | ```{r}
169 | options(digits = 3)
170 | filter(Batting, yearID%in%2011:2014 & AB>500) %>%
171 | mutate(AVG=H/AB*1000) %>%
172 | group_by(yearID) %>%
173 | summarize(mean(AVG), sd(AVG))
174 | ```
175 |
176 | And the distribution appears Normal
177 | ====
178 |
179 | ```{r,fig.align="center",cache=TRUE,echo=FALSE,fig.width=14,fig.height=6}
180 | filter(Batting, yearID%in%2012:2014 & AB>500) %>%
181 | mutate(AVG=H/AB*1000) %>%
182 | ggplot(aes(sample=(AVG-mean(AVG))/sd(AVG)))+
183 | geom_qq() + geom_abline(slope=1,intercept = 0) +
184 | facet_grid(.~yearID)
185 | ```
186 |
187 |
188 | Hierarchical Model
189 | ===
190 |
191 | Pick a random player. How do we model their batting average after 20 AB?
192 |
193 | $$\begin{eqnarray*}
194 | \theta &\sim& N(\mu, \tau^2) \\
195 | Y | \theta &\sim& N(\theta, \sigma^2)
196 | \end{eqnarray*}$$
197 |
198 | Two levels of variability:
199 |
200 | * Player to player variability
201 | * Variability due to luck when batting
202 |
203 | Hierarchical Model
204 | ===
205 |
206 | $$\begin{eqnarray*}
207 | \theta &\sim& N(\mu, \tau^2) \mbox{ is called a prior}\\
208 | Y | \theta &\sim& N(\theta, \sigma^2) \mbox{ is called a sampling distribution}
209 | \end{eqnarray*}$$
210 |
211 | * $\theta$ is our player's "intrinsic" average value
212 | * $\mu$ is the average of all players
213 | * $\tau$ is the SD of all players
214 | * $Y$ is the observed average
215 | * $\sigma$ is the variability due to luck at each AB
216 |
217 |
218 | Hierarchical Model
219 | ===
220 |
221 | Here are the equations with our data:
222 |
223 | $$\begin{eqnarray*}
224 | \theta &\sim& N(.275, .027^2) \\
225 | Y | \theta &\sim& N(\theta, .110^2)
226 | \end{eqnarray*}$$
227 |
228 |
229 | Posterior Distribution
230 | ====
231 |
232 | The continuous version of Bayes rule can be used here:
233 |
234 | $$
235 | \begin{eqnarray*}
236 | f_{\theta|Y}(\theta|Y)&=&\frac{f_{Y|\theta}(Y|\theta) f_{\theta}(\theta)}{f_Y(Y)}\\
237 | &=&\frac{f_{Y|\theta}(Y|\theta) f_{\theta}(\theta)}{\int_{\theta}f_{Y|\theta}(Y|\theta)f_{\theta}(\theta)d\theta}
238 | \end{eqnarray*}
239 | $$
240 |
241 | We are particularly interested in the $\theta$ that maximizes $f_{\theta|Y}(\theta|Y)$.
242 |
243 | We can show the posterior is normal, so all we need are:
244 |
245 | $$\mbox{E}(\theta|y) \mbox{ and } \mbox{var}(\theta|y)$$
246 |
247 |
248 | Posterior Distribution
249 | ====
250 |
251 |
252 |
253 |
254 | $$
255 | \begin{eqnarray*}
256 | \mbox{E}(\theta|y) &=& B \mu + (1-B) Y\\
257 | &=& \mu + (1-B)(Y-\mu)\\
258 | \mbox{with } B &=& \frac{\sigma^2}{\sigma^2+\tau^2}\\
259 | \\
260 | \mbox{var}(\theta\mid y) &=& \frac{1}{1/\sigma^2+1/\tau^2}
261 | \end{eqnarray*}
262 | $$
263 |
264 |
265 | Posterior Distribution
266 | ====
267 |
268 | In the case of José Iglesias, we have:
269 |
270 | $$
271 | \begin{eqnarray*}
272 | E(\theta | Y=.450) &=& B \times .275 + (1 - B) \times .450 \\
273 | &=& .275 + (1 - B)(.450 - .275) \\
274 | &\approx& .285 \\
275 | \mbox{with } B &=& \frac{.110^2}{.110^2 + .027^2} = 0.943
276 | \end{eqnarray*}
277 | $$
278 |
279 |
280 | $$
281 | \begin{eqnarray*}
282 | \mbox{var}(\theta\mid y) &=& \frac{1}{1/.110^2 + 1/.027^2}\\ &\approx& 0.00069
283 | \end{eqnarray*}
284 | $$
285 |
286 |
287 | The SD is 0.026. The 95% _credible interval_ is (.233,.337).
288 |
289 |
290 | Results
291 | ====
292 |
293 | |Month|At Bat| Hits| AVG |
294 | |-----|------|-----|-----|
295 | |April|20|9|.450|
296 | |May|26|11|.423|
297 | |June|86|34|.395|
298 | |July|83|17|.205|
299 | |August|85|25|.294|
300 | |September|50|10|.200|
301 | |Total w/o April|330|97|.293|
302 |
303 | Frequentist confidence interval = .450 $\pm$ 0.220
304 |
305 | Empirical Bayes credible interval = .285 $\pm$ 0.052
306 |
307 | Actual = .293
308 |
309 |
310 | FiveThirtyEight
311 | ====
312 |
313 | ![538confidenceintervals](http://espnfivethirtyeight.files.wordpress.com/2014/09/screen-shot-2014-09-16-at-5-09-30-pm.png)
314 |
315 |
316 | Primaries
317 | ====
318 |
319 | [Florida predictions](http://projects.fivethirtyeight.com/election-2016/primary-forecast/florida-republican/)
320 |
321 | Hierarchical Model to Predict Polls
322 | ===
323 |
324 | You build a model (a prior) before seeing polls:
325 | $$
326 | \theta \sim N(\mu, \tau^2)
327 | $$
328 |
329 | Then you start seeing poll data:
330 |
331 | $$
332 | Y | \theta \sim N(\theta, \sigma^2)
333 | $$
334 |
335 | We update our prediction:
336 |
337 | $$
338 | \begin{eqnarray*}
339 | \mbox{E}(\theta|y) &=& \mu + (1-B)(Y-\mu)
340 | \mbox{ with } B = \frac{\sigma^2}{\sigma^2+\tau^2}\\
341 | \mbox{var}(\theta\mid y) &=& \frac{1}{1/\sigma^2+1/\tau^2}
342 | \end{eqnarray*}
343 | $$
344 |
345 |
--------------------------------------------------------------------------------
/lectures/models/bayes.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/models/bayes.gif
--------------------------------------------------------------------------------
/lectures/regression/baseball.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Baseball"
3 | output: html_document
4 | fontsize: 12pt
5 | ---
6 |
7 | # Regression
8 |
9 | ## Motivating Example: Moneyball
10 |
11 | _Moneyball: The Art of Winning an Unfair Game_ is a book by Michael Lewis about the Oakland Athletics baseball team and its general manager, the person tasked with building the team, Billy Beane. During Billy Beane's tenure as general manager, ownership cut the budget drastically, leaving the general manager with one of the lowest payrolls in baseball. _Moneyball_ tells the story of how Billy Beane used analysts to find inefficiencies in the market. Specifically, his team used data science to find low-cost players that the data predicted would help the team win.
In this lab we will go back to 2002 and try to build a baseball team with a limited budget. Note that the Yankees' $125,928,583 payroll more than tripled the Oakland A's $39,679,746 [budget](http://www.baseballchronology.com/Baseball/Years/2002/Payroll.asp). 12 | 13 | Statistics have been used in baseball since its beginnings. Note that the dataset we will be using, included in the `Lahman` library, goes back to the 19th century. Batting average, for example, has been used to summarize a batter's success for decades. [Other statistics](http://mlb.mlb.com/stats/league_leaders.jsp) such as home runs, runs batted in (RBI) and stolen bases have been reported and players rewarded for high numbers. However, until [Bill James](https://en.wikipedia.org/wiki/Bill_James) introduced [sabermetrics](https://en.wikipedia.org/wiki/Sabermetrics), careful analyses had not been done to determine if these statistics actually help a team win. To simplify the exercise we will focus on scoring runs and ignore pitching and fielding. 14 | 15 | 16 | 17 | 18 | We will see how regression analysis can help develop strategies to build a competitive baseball team with a constrained budget. The approach can be divided into two separate data analyses. In the first we determine which recorded player-specific statistics predict runs. In the second we examine if players were undervalued based on what our first analysis predicts. 19 | 20 | ### Baseball Basics 21 | 22 | We actually don't need to understand all the details about the game of baseball, which has over 100 rules, to use Moneyball for motivation. Here we distill the problem down to the basic knowledge one needs to attack the data science problem. 23 | 24 | The goal of a baseball game is to score more runs (points) than the other team. Each team has 9 batters that bat in a predetermined order. After the 9th batter hits, we start with the first again. Each time they come to bat we call it a plate appearance (PA). At each PA, the other team's _pitcher_ throws the ball and you try to hit it. The PA ends with a binary outcome: you either make an _out_ (failure) and sit back down or you don't (success) and you get to run around the bases. Each team gets nine tries, referred to as _innings_, to score runs, and each inning ends after three outs. 25 | 26 | Here is a [success](https://www.youtube.com/watch?v=HL-XjMCPfio) and here is a [failure](https://www.youtube.com/watch?v=NeloljCx-1g). From the videos we see how luck is involved in the process. 27 | 28 | Now there are several ways to succeed. Understanding this distinction will be important for our analysis. When you hit the ball you want to pass as many _bases_ as possible. There are four bases, with the fourth one called _home plate_, which is where you start as well. 29 | 30 | 31 | 32 | If you get home you score a run. We are simplifying a bit, but the ways you can succeed (not make an out) are the following: 33 | 34 | - Bases on balls (BB) - the pitcher does not pitch well, so you get to first base. 35 | - Single - You hit the ball and get to first base. 36 | - Double (X2B) - You hit the ball and get to second base. 37 | - Triple (X3B) - You hit the ball and get to third base. 38 | - Home Run (HR) - You hit the ball, go home and score a run. [Here](https://www.youtube.com/watch?v=xYxSZJ9GZ-w) is an example. 39 | 40 | If you get to a base, you still have the chance of getting home and scoring a run if the next batter hits successfully. While you are _on base_ you can also try to steal a base (SB).
If you run fast enough you can try to go from first to second without the other team tagging you. [Here](https://www.youtube.com/watch?v=JSE5kfxkzfk) is a stolen base. 41 | 42 | ### Why were BB undervalued? 43 | 44 | Of the five ways to be successful listed above, the last four are considered hits (H). Historically, the batting average has been considered the most important offensive statistic. 45 | 46 | 47 | 48 | This is defined as hits divided by at bats (AB). An AB is the number of times you either get a hit or make an out. This implies that PA is approximately BB+AB. 49 | 50 | One of Bill James's first important insights is that statistics such as the batting average ignore BB. But a BB is a success. So a player that gets many more BB than the average might not be noticed if he does not excel in batting average. But how good is this player at producing runs? In contrast, stolen bases were considered important. This seems arbitrary. Can we use data science to determine if it's better to pay for BB or SB? 51 | 52 | ### Bases on Balls or Stolen Bases? 53 | 54 | One of the challenges in this analysis is that it is not obvious how to determine if a player produces runs, because so much depends on his teammates. We keep track of the number of runs scored by a player. But note that if you hit after someone who hits many HR you will score a lot. But these runs don't necessarily happen if we hire this player but not his HR-hitting teammate. However, we can examine team-level statistics. How do teams with many SB compare to teams with few? How about BB? We have data! Let's examine some. 55 | 56 | Let's start with an obvious one: HR. Do teams that hit more home runs score more runs? 57 | ```{r, message=FALSE, warning=FALSE} 58 | library(Lahman) 59 | library(dplyr) 60 | library(ggplot2) 61 | library(broom) 62 | theme_set(theme_bw(base_size = 16)) 63 | 64 | Teams %>% filter(yearID %in% 1961:2001 ) %>% 65 | mutate(R_per_game = R / G, HR_per_game = HR / G) %>% 66 | ggplot(aes(HR_per_game, R_per_game)) + geom_point() 67 | ``` 68 | 69 | ```{r, fig.width=10, fig.height=5} 70 | library(gridExtra) 71 | tab <- Teams %>% filter(yearID %in% 1961:2001 ) %>% 72 | mutate(R_per_game = R / G, SB_per_game = SB / G, BB_per_game = BB/G) 73 | 74 | g1 <- tab %>% ggplot(aes(SB_per_game, R_per_game)) + geom_point() 75 | g2 <- tab %>% ggplot(aes(BB_per_game, R_per_game)) + geom_point() 76 | grid.arrange(g1, g2, nrow = 1) 77 | ``` 78 | 79 | But wait, association is not causation. In fact, it looks like home run hitters get more BB: 80 | 81 | ```{r} 82 | Batting %>% filter(yearID %in% 1961:2001) %>% group_by(playerID) %>% 83 | summarize(HR_tot=sum(HR), BB_tot=sum(BB), PA = sum(BB+AB)) %>% 84 | mutate(HR_per_PA = HR_tot/PA, BB_per_PA=BB_tot/PA) %>% 85 | filter(PA >1000) %>% 86 | ggplot(aes(HR_per_PA, BB_per_PA)) + geom_point() 87 | ``` 88 | 89 | Linear regression will help us parse all this out and help us determine what players to recruit.
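As a preview of where this is headed, here is a minimal sketch (our own, not part of the original lab) of a multiple regression that estimates the BB effect while holding HR fixed, using the same `Teams` data and libraries loaded above:

```{r}
# Sketch: estimate the per-game BB effect on runs while adjusting for HR.
# Same team-level data and year range as the plots above.
Teams %>%
  filter(yearID %in% 1961:2001) %>%
  mutate(R_per_game = R / G, BB_per_game = BB / G, HR_per_game = HR / G) %>%
  lm(R_per_game ~ BB_per_game + HR_per_game, data = .) %>%
  tidy()
```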
90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /lectures/regression/regression-broom.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression with broom and dplyr" 3 | author: "David Robinson" 4 | date: "March 23, 2016" 5 | output: html_document 6 | --- 7 | 8 | ```{r echo = FALSE} 9 | library(knitr) 10 | opts_chunk$set(message = FALSE) 11 | ``` 12 | 13 | [broom](https://github.com/dgrtwo/broom) is a package for converting R model objects into "tidy" data frames that can be recombined, manipulated, and visualized using tools like dplyr and ggplot2. 14 | 15 | Install it with: 16 | 17 | ```{r, eval = FALSE} 18 | install.packages("broom") 19 | ``` 20 | 21 | I'll start by listing some worthwhile resources for dplyr and broom: 22 | 23 | * [broom and dplyr vignette](https://cran.r-project.org/web/packages/broom/vignettes/broom_and_dplyr.html) 24 | * [broom manuscript](http://arxiv.org/abs/1412.3565) 25 | 26 | ### mtcars data 27 | 28 | The mtcars dataset comes built into R, and comes with information about 32 cars: 29 | 30 | ```{r} 31 | mtcars 32 | ``` 33 | 34 | (You can do `help(mtcars)` for more about each column). Let's look at two columns of interest: 35 | 36 | * `qsec` shows the speed and acceleration of the car, tested by the number of seconds it takes to drive a quarter of a mile. 37 | * `mpg` represents the gas efficiency, in miles per gallon 38 | 39 | Suppose we wanted to examine the relationship between the gas efficiency and the speed. We'd probably start with a graph: 40 | 41 | ```{r} 42 | library(ggplot2) 43 | ggplot(mtcars, aes(qsec, mpg)) + 44 | geom_point() 45 | ``` 46 | 47 | A relationship between the two (faster cars get fewer miles per gallon) is certainly plausible, though not certain. To test this statistically, we might perform a linear regression: 48 | 49 | ```{r} 50 | fit <- lm(mpg ~ qsec, mtcars) 51 | ``` 52 | 53 | We can display a linear fit using the default print method: 54 | 55 | ```{r} 56 | fit 57 | ``` 58 | 59 | This shows just the estimated intercept and slope. We can get a bit more information with summary: 60 | 61 | ```{r} 62 | summary(fit) 63 | ``` 64 | 65 | This shows each of the estimates, standard errors, and p-values for the intercept and the slope. It also shows an overall $R^2$ value (variation in Y explained by X), and lots of other information. 66 | 67 | One of the problems with these coefficients, however, is that it's difficult to get them out into a format that we can manipulate with dplyr, plot with ggplot2, or otherwise wrangle. 68 | 69 | For that, we use the broom package: 70 | 71 | ```{r} 72 | library(broom) 73 | tidy(fit) 74 | ``` 75 | 76 | Notice that this is the same per-coefficient data that appears in the `summary(fit)` method. But instead of being displayed in a summary, it is in the form of 77 | 78 | 1. A data frame 79 | 2. Without rownames (they've been moved into the `term` column) 80 | 3. With names that do not have spaces or punctuation (besides `.`). 81 | 82 | This makes the dataset ideal for further manipulation and recombination. broom works on many types of models (see the [README](https://github.com/dgrtwo/broom) for an extensive list), and all of its outputs are always in this form.
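Because `tidy` returns an ordinary data frame, its output drops straight into a dplyr/ggplot2 pipeline. For instance (a small sketch of our own, reusing the `fit` object from above):

```{r}
library(dplyr)

# tidy() output is a regular data frame, so we can filter and plot it.
# Here: drop the intercept and plot the slope estimate with rough +/- 2 SE bars.
tidy(fit) %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(term, estimate,
             ymin = estimate - 2 * std.error,
             ymax = estimate + 2 * std.error)) +
  geom_point() +
  geom_errorbar()
```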
83 | 84 | One more neat trick with `tidy` is that it can extract the confidence intervals for each coefficient at the same time: 85 | 86 | ```{r} 87 | tidy(fit, conf.int = TRUE) 88 | ``` 89 | 90 | (See `help(tidy.lm)` for more that can be done with it). 91 | 92 | `tidy` is one of the three tidying functions provided by broom. The other two are `augment` and `glance`: 93 | 94 | ```{r} 95 | augmented <- augment(fit) 96 | 97 | head(augmented) 98 | ``` 99 | 100 | The other, `glance`, gets out per-model values like R-squared (it's not relevant for HW4 so we're not going over it today). 101 | 102 | ### Graph within groups using do and tidy 103 | 104 | Correlation does not imply causation. It's entirely possible that `mpg` and `qsec` are both affected by another confounding factor. 105 | 106 | Consider another column: `cyl`: this denotes the number of cylinders in the engine. Could this affect both acceleration and miles per gallon, such that their relationship otherwise disappears? 107 | 108 | ```{r} 109 | ggplot(mtcars, aes(qsec, mpg, color = factor(cyl))) + 110 | geom_point() 111 | ``` 112 | 113 | That certainly looks like it might be a confounding factor: cars with more cylinders are faster but have less gas mileage. 114 | 115 | Instead of performing one big regression, let's perform a regression within each group defined by the number of cylinders. We can do that with dplyr's `do`: 116 | 117 | ```{r} 118 | library(dplyr) 119 | 120 | fits <- mtcars %>% 121 | group_by(cyl) %>% 122 | do(mod = lm(mpg ~ qsec, data = .)) 123 | 124 | fits 125 | ``` 126 | 127 | The relevant expression is `lm(mpg ~ qsec, data = .)` - notice that we are performing the same regression as above, but this time we give `data = .`. The `.` means "the data frame within the current `group_by` group." 128 | 129 | We now have a list column called `mod` that contains the three per-group fits. We can extract out the terms and coefficients using `tidy`, which has a nice shortcut for working with list columns: 130 | 131 | ```{r} 132 | tidy(fits, mod) 133 | ``` 134 | 135 | Notice that we've combined the three models, which each appear alongside a `cyl` showing what group they were in. 136 | 137 | We can also extract out the `augment` results, such as the residuals and fitted values: 138 | 139 | ```{r} 140 | augment(fits, mod) 141 | ``` 142 | 143 | Finally (this is important for the homework), note that this last result drops all the columns from the original data besides `cyl`, `mpg`, and `qsec` (the ones involved in the model or the grouping). What if we wanted to keep those alongside the residuals or fitted values? 144 | 145 | We can fix this by doing two things: first, we remove the `mod =` in the original `do` statement - then we are no longer saving to a column, but rather expanding the data. Second, we add a `data = .` argument *to augment* alongside the `data = .` argument to `lm` - that tells augment to keep all the columns. 146 | 147 | ```{r} 148 | augments <- mtcars %>% 149 | group_by(cyl) %>% 150 | do(augment(lm(mpg ~ qsec, data = .), data = .)) 151 | 152 | augments 153 | ``` 154 | 155 | Notice that all the columns from the original, like `disp`, `hp`, and `wt` are included now.
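As a quick illustration of what the fully augmented data is good for (a sketch of our own, not required for the homework), we can check the within-group fits:

```{r}
# Residuals vs. fitted values, one panel per cylinder group.
# Systematic patterns here would suggest the per-group linear fits are off.
ggplot(augments, aes(.fitted, .resid)) +
  geom_point() +
  facet_wrap(~ cyl)
```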
156 | -------------------------------------------------------------------------------- /lectures/regression/regression-in-practice.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression in Practice" 3 | output: html_document 4 | --- 5 | 6 | # Regression 7 | 8 | ```{r, message=FALSE, warning=FALSE} 9 | library(dplyr) 10 | library(tidyr) 11 | library(broom) 12 | library(ggplot2) 13 | theme_set(theme_bw(base_size = 16)) 14 | ``` 15 | 16 | ## Sophomore Slump 17 | 18 | Wikipedia defines the _sophomore slump_ as 19 | 20 | > A sophomore slump or sophomore jinx or sophomore jitters refers to an instance in which a second, or sophomore, effort fails to live up to the standards of the first effort. It is commonly used to refer to the apathy of students (second year of high school, college or university), the performance of athletes (second season of play), singers/bands (second album), television shows (second seasons) and movies (sequels/prequels). 21 | 22 | In baseball, the phrase is used to describe the observation that players that win Rookie of the Year don't do as well during their second year. 23 | "Will MLB's tremendous rookie class of 2015 suffer a sophomore slump?" asks this [article](http://www.foxsports.com/mlb/story/kris-bryant-carlos-correa-rookies-of-year-award-matt-duffy-francisco-lindor-kang-sano-120715) about the sophomore slump in baseball. 24 | 25 | Examining the data for batting average we see that this observation holds true. 26 | 27 | ```{r} 28 | library(Lahman) 29 | ###we remove pitchers 30 | playerInfo <- group_by(Fielding, playerID) %>% summarize(POS=POS[1]) %>% left_join(Master, by="playerID") %>% select(playerID, nameFirst, nameLast, POS) 31 | ROY <- filter(AwardsPlayers, awardID == "Rookie of the Year") %>% 32 | left_join(playerInfo, by="playerID") %>% filter(POS!="P") %>% 33 | rename(rookieYear = yearID) %>% 34 | right_join(Batting, by="playerID") %>% 35 | mutate(AVG=H/AB) %>% 36 | filter(yearID==rookieYear | yearID==rookieYear+1) %>% group_by(playerID) %>% mutate(rookie = ifelse(yearID==min(yearID), "rookie", "sophomore")) %>% 37 | filter(n()==2) %>% ungroup %>% 38 | select(playerID, rookieYear, rookie, nameFirst, nameLast, AVG) %>% 39 | spread(rookie, AVG) 40 | options(digits = 3) 41 | arrange(ROY, desc(rookie)) 42 | cat(mean(ROY$sophomore - ROY$rookie <= 0)*100,"% performed worse.", sep="") 43 | ``` 44 | 45 | So is it "jitters" or "jinx"? Let's look at all players in the 2013 and 2014 seasons that batted at least 130 times (the minimum required to win Rookie of the Year). 46 | 47 | ```{r} 48 | dat <- Batting %>% filter(yearID%in%2013:2014) %>% group_by(playerID,yearID) %>% filter(sum(AB)>=130) %>% summarize(AVG=sum(H)/sum(AB)) %>% ungroup %>% 49 | spread(yearID, AVG) %>% filter(!is.na(`2013`) & !is.na(`2014`)) 50 | dat <- right_join(playerInfo, dat, by="playerID") %>% filter(POS!="P") %>% select(-POS) 51 | arrange(dat, desc(`2013`)) %>% select(-playerID) 52 | ``` 53 | 54 | Note that the same pattern arises. Batting averages go down. But look at what happens for the worst hitters of 2013: 55 | 56 | ```{r} 57 | arrange(dat, `2013`) %>% select(-playerID) 58 | ``` 59 | 60 | ### Assessment 61 | 62 | Miguel Cabrera hit .348 in 2013. Use linear regression to predict his 2014 average. Use the `broom` package to get a prediction for all players. In general, how does the prediction for 2014 compare to the 2013 performance?
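One way to get Cabrera's prediction directly is to fit the model and plug in .348 by hand (a sketch; `fit_slump` is our own name, and the backticked column names come from the `spread` call above). The `broom` approach below then produces fitted values for every player at once.

```{r}
# Sketch: regress the 2014 average on the 2013 average, then predict by hand
# for a .348 hitter using the fitted intercept and slope.
fit_slump <- lm(`2014` ~ `2013`, data = dat)
coef(fit_slump)[1] + coef(fit_slump)[2] * 0.348
```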
63 | 64 | ```{r} 65 | fit <- dat %>% do(augment(lm(`2014`~`2013`, data=.), data=.)) 66 | select(fit, nameFirst, nameLast, X2013, .fitted, X2014) %>% arrange(desc(X2013)) %>% tbl_df 67 | ``` 68 | 69 | ## Regression Fallacy 70 | 71 | In the assessment we noted that the correlation is not 1: 72 | 73 | ```{r} 74 | summarize(dat, cor(`2013`,`2014`)) 75 | ``` 76 | 77 | We can see this in a scatterplot of the standardized values: 78 | 79 | ```{r} 80 | dat %>% ggplot(aes(scale(`2013`), scale(`2014`))) + geom_point() + geom_smooth(method = "lm", se=FALSE, col="blue") + geom_abline(intercept = 0, slope = 1) + xlab("2013") + ylab("2014") + scale_x_continuous(limits=c(-3,3)) + scale_y_continuous(limits=c(-3,3)) 81 | ``` 82 | 83 | The data look very much like a bivariate normal distribution, which means we predict the 2014 batting average $Y$ for any given player that had 2013 batting average $X$ with: 84 | 85 | $$ \frac{Y - .255}{.032} = 0.46 \left( \frac{X - .261}{.023}\right) $$ 86 | 87 | Because the correlation is not perfect, regression tells us that, on average, we expect high performers from 2013 to do a bit worse in 2014. It's not a jinx; it's just due to chance. In the context of linear models, this is called _regression to the mean_, which is where the term "regression" comes from. 88 | 89 | ## Correlation is not causation 90 | 91 | That correlation is not causation is perhaps the most important lesson one learns in a statistics class. Here is just one example that underscores this. 92 | 93 | ```{r, echo=FALSE} 94 | ## Extracted from http://www.tylervigen.com/spurious-correlations 95 | library(dplyr) 96 | library(ggplot2) 97 | theme_set(theme_bw(base_size = 16)) 98 | dat <- data.frame( divorce_rate_maine = c(5, 4.7, 4.6, 4.4, 4.3, 4.1, 4.2, 4.2, 4.2, 3.7)/1000, 99 | margarine_consumption_per_capita = c(8.2, 7, 6.5, 5.3, 5.2, 4, 4.6, 4.5, 4.2, 4.1), 100 | year = 2000:2009) 101 | dat %>% ggplot(aes( margarine_consumption_per_capita, divorce_rate_maine*1000)) + geom_point(cex=3) + geom_smooth(method = "lm") + 102 | ggtitle(paste("Correlation =", round(cor(dat$margarine_consumption_per_capita, dat$divorce_rate_maine),2))) + 103 | xlab("Margarine Consumption per Capita (lbs)") + ylab("Divorce rate in Maine (per 1000)") 104 | ``` 105 | 106 | You can see many more examples of these spurious correlations [here](http://tylervigen.com/spurious-correlations). There are many reasons that a variable $X$ can be correlated with a variable $Y$ without being the cause. One obvious one is when $Y$ is actually the cause of $X$. Another is when we have a third variable $Z$ that affects both $X$ and $Y$. We call these _confounders_. In some cases we can use linear models to account for confounders. 107 | 108 | 109 | #### Do Bases on Balls predict runs? 110 | 111 | 112 | Bases on balls (BB) and runs (R) correlate at the team level: 113 | 114 | The data looks bivariate normal and a linear regression analysis tells us that BB does indeed predict runs. 115 | ```{r} 116 | Teams %>% filter(yearID > 1961 & G==162) %>% lm(R ~ BB, data = .) %>% 117 | tidy(conf.int = TRUE) 118 | ``` 119 | 120 | If you have experience watching baseball you know that HR hitters tend to receive many BBs. And we know that HR predict runs. The following plot confirms what we are pointing out. We stratify players by the number of HRs they hit and examine the distribution of BB. We only consider players with more than 500 plate appearances.
121 | 122 | ```{r} 123 | Batting %>% 124 | filter(yearID >= 1961 & BB+AB > 500 & !is.na(HR) & !is.na(BB)) %>% 125 | mutate(HR = factor(pmin(HR, 40))) %>% 126 | ggplot(aes(HR, BB)) + 127 | geom_boxplot() 128 | ``` 129 | 130 | So we have observed or know the following: 131 | 132 | * BB are associated with Runs 133 | * HR cause runs 134 | * Players that hit many HR are expensive. 135 | * Players with many BB without many HR are not that expensive. 136 | 137 | This seems to imply we should search for players with few HR but many BB, as they will be inexpensive and produce runs. But what if the association between BB and R is completely explained by HR? 138 | 139 | One way we can answer this question is by keeping HR fixed and examining the relationship within the strata. 140 | 141 | #### Assessment 142 | 143 | We can't perform this analysis on a single year, because there are not enough teams to obtain strata with more than one or two teams. Instead we will combine data from all years since 1961. 144 | 145 | Start by creating a data set with R, BB and HR for every team from 1961 to today. What is the correlation for each of the three pairs? 146 | 147 | ```{r} 148 | Teams %>% filter(yearID >= 1961 & G==162) %>% 149 | summarize(cor(R,BB), cor(R,HR), cor(BB,HR)) 150 | ``` 151 | 152 | #### Assessment 153 | 154 | One way to eliminate the possibility that the BB effect on runs is driven by confounding with HR is to keep HR fixed and then examine the relationship. Give a confidence interval for the effect of BB on R for teams with 120-130 HRs. Make a plot to see if the relationship appears linear. 155 | Consider only teams from 1961 and beyond. 156 | 157 | Compare to the coefficient when we don't stratify. 158 | 159 | ```{r} 160 | my_data <- Teams %>% filter(yearID >= 1961 & G==162 & HR %in% 120:130) 161 | my_data %>% ggplot(aes(BB,R)) + geom_point() + geom_smooth(method = "lm") 162 | my_data %>% lm(R ~ BB, data = .) %>% tidy(conf.int = TRUE) 163 | ``` 164 | 165 | #### Assessment 166 | 167 | Now let's see if this holds for other strata. 168 | 169 | Here is an example of how you can use the functions `quantile` and `cut` to assign each team to a stratum, like this: 170 | 171 | ```{r} 172 | x <- rnorm(100) 173 | qs <- quantile(x, prob=seq(0,1,.2)) 174 | group <- cut(x, qs, include.lowest = TRUE) 175 | table(group) 176 | ``` 177 | 178 | Use this idea to make a scatter plot of R versus BB for 10 HR strata. Hint: use `facet_wrap`. 179 | 180 | ```{r} 181 | my_data <- Teams %>% filter(yearID >= 1961 & G==162) %>% 182 | mutate(group = cut(HR, quantile(HR, prob = seq(0,1,.1)), include.lowest=TRUE)) 183 | my_data %>% 184 | ggplot(aes(BB, R)) + 185 | geom_point() + 186 | geom_smooth(method = "lm") + 187 | facet_wrap(~group) 188 | ``` 189 | 190 | #### Assessment 191 | 192 | The BB effect is certainly lower than before we controlled for HR, but it still appears to be there. To check more formally we can fit a linear model to each stratum.
193 | 194 | ```{r} 195 | res <- my_data %>% group_by(group) %>% 196 | do(tidy(lm(R ~ BB, data = .))) %>% ungroup 197 | ``` 198 | 199 | Note that the intercept increases linearly and that the BB effect seems about constant: 200 | 201 | ```{r} 202 | res %>% select( group, term, estimate, std.error) %>% 203 | filter(term=="BB") %>% 204 | ggplot(aes(group, estimate, 205 | ymin=estimate-2*std.error, 206 | ymax=estimate+2*std.error)) + 207 | geom_point() + 208 | geom_errorbar() 209 | ``` 210 | 211 | If the effect of BB is linear and the same for all HR strata, and HR has a linear effect, then we might try this model: 212 | 213 | $$ 214 | \mbox{Runs} = \beta_0 + \beta_{BB} \mbox{BB} + \beta_{HR} \mbox{HR} + \varepsilon 215 | $$ 216 | 217 | In this model, we _adjust_ for HRs by including it as a linear term. Note that we have already shown data that support this model. In general, simply fitting such a model does not necessarily adjust for a possible confounder. The model must be approximately correct. 218 | 219 | We can fit this model like this: 220 | 221 | ```{r} 222 | Teams %>% 223 | filter(yearID >= 1961 & G==162) %>% 224 | lm(R~BB+HR, data=.) %>% 225 | tidy 226 | 227 | ``` 228 | 229 | We can check residual plots: 230 | 231 | ```{r} 232 | fit <- Teams %>% 233 | filter(yearID >= 1961 & G==162) %>% 234 | do(augment(lm(R~BB+HR, data=.), data=.)) 235 | qqnorm(scale(fit$.resid)) 236 | abline(0,1) 237 | 238 | g1 <- fit %>% ggplot(aes(HR,.resid)) + geom_point() + geom_smooth() 239 | g2 <- fit %>% ggplot(aes(BB,.resid)) + geom_point() + geom_smooth() 240 | library(gridExtra) 241 | grid.arrange(g1, g2, nrow = 1) 242 | ``` 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /lectures/wrangling/data-wrangling-with-tidyr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data Wrangling with `tidyr` 3 | author: Stephanie Hicks, Rafael Irizarry 4 | --- 5 | 6 | The data analysis process can be thought of in four parts: 7 | 8 | 1. Data cleaning 9 | 2. Data transformation 10 | 3. Data visualization 11 | 4. Modeling 12 | 13 | where each of these steps needs its own tools and software. 14 | 15 | ![Bottlenecks in data analysis](http://r4ds.had.co.nz/diagrams/data-science.png) 16 | 17 | As we have seen in class, one of the most time-consuming aspects of 18 | the data analysis process is "data wrangling". This is also known 19 | as "data munging", which is a trendy term for 20 | _cleaning up a messy data set_. This refers to the first two steps in the 21 | data analysis process: 22 | 23 | 1. Data cleaning (or tidying data) 24 | 2. Data transformation 25 | 26 | It can take a long time to clean and transform messy data into a format 27 | that is useful for data visualization and modeling, but there are tools 28 | that can help turn messy data into clean data. 29 | 30 | ### Defining data structures 31 | 32 | There are many ways to define the structure of a data set. 33 | Most data frames are made up of **rows** and **columns** where the columns 34 | are almost always labeled and the rows are *sometimes* labeled.
35 | 36 | For example, a data set could be structured in the following way: 37 | 38 | * each row represents one company (row names are companies) 39 | * each column represents one time point 40 | * the stock prices are defined for each row/column pair 41 | 42 | ![stocks by company](pics/stocks-by-company.png) 43 | 44 | Alternatively, a data set can be structured in the following way: 45 | 46 | * each row represents one time point (but no row names) 47 | * the first column defines the time variable and the last three columns contain the stock prices for three companies 48 | 49 | ![stocks by time](pics/stocks-by-time.png) 50 | 51 | In both cases, the data is the same, but the structure is different. This 52 | can be _frustrating_ to deal with because the meaning of the 53 | values (rows and columns) in the two data sets is different. Providing a 54 | standardized way of organizing values within a data set would alleviate 55 | a major portion of this frustration. 56 | 57 | 58 | ### Defining tidy data 59 | 60 | Now, we will introduce the concept of **tidy** data. Tidy data is a 61 | standard way of mapping the meaning of a dataset to its structure. 62 | The properties of a tidy data set are based on: 63 | 64 | * Each column is a variable 65 | * Each row is an observation 66 | 67 | Working with tidy data is useful because it creates a structured way of 68 | organizing data values within a data set. This makes the data analysis 69 | process more efficient and simplifies the development of data analysis tools 70 | that work together. In this way, you can focus on the problem you are 71 | investigating, rather than the uninteresting logistics of data. 72 | 73 | 74 | ### What is `tidyr`? 75 | 76 | [`tidyr`](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html) 77 | is an R package that transforms data sets to a tidy format. 78 | 79 | There are two main functions in `tidyr`: 80 | 81 | * `gather()` = takes multiple columns, and gathers them into key-value pairs 82 | (it makes "wide" data longer) 83 | * `spread()` = takes a key-value pair and spreads it across multiple columns 84 | (it makes "long" data wider) 85 | 86 | We'll explore what it means to go between a "wide" and "long" 87 | data format using `gather()` and `spread()` next. (The supporting function 88 | `separate()`, which splits one column into several, is covered at the end.) 89 | 90 | ### How do I get tidyr? 91 | 92 | To install `tidyr` 93 | 94 | ```{r, eval=FALSE} 95 | install.packages("tidyr") 96 | ``` 97 | 98 | To load `tidyr` (we will also need `dplyr`): 99 | 100 | ```{r, message=FALSE} 101 | library(tidyr) 102 | library(dplyr) 103 | ``` 104 | 105 | For motivation, a tidy version of the stock data we looked at above 106 | looks like this: (we'll learn how the functions work in just a moment) 107 | 108 | ![stocks tidy](pics/stocks-tidy.png) 109 | 110 | In this "tidy" data set, we have three columns representing three variables 111 | (time, company name and stock price). Every row contains one 112 | stock price from a particular time and for a specific company. 113 | 114 | ### Pipe operator: %>% 115 | 116 | We have introduced the operator: `%>%`. 117 | dplyr imports this operator from another package (`magrittr` [see help file here](http://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html)). 118 | This operator allows you to pipe the output from one function to 119 | the input of another function. Instead of nesting functions (reading 120 | from the inside to the outside), the idea of piping is to read the 121 | functions from left to right.
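A minimal illustration with toy numbers of our own:

```{r}
# Nested call, read inside-out:
sum(sqrt(c(1, 4, 9)))

# The same computation piped, read left to right:
c(1, 4, 9) %>% sqrt() %>% sum()
```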
122 | 123 | Now in this case, we pipe the `stocks` data frame to the function that will 124 | gather multiple columns into key-value pairs. 125 | 126 | 127 | # Data 128 | 129 | ## 2016 Iowa Presidential Caucus 130 | 131 | We will explore [public poll data from HuffPost Pollster](http://elections.huffingtonpost.com/pollster) 132 | from the 2016 Iowa [Democratic](http://elections.huffingtonpost.com/pollster/2016-iowa-presidential-democratic-caucus) and [Republican](http://elections.huffingtonpost.com/pollster/2016-iowa-presidential-republican-caucus) 133 | Presidential Caucus. 134 | 135 | First we will read in the data: 136 | ```{r} 137 | library(readr) 138 | dem_polls = read_csv("http://elections.huffingtonpost.com/pollster/2016-iowa-presidential-democratic-caucus.csv") 139 | rep_polls = read_csv("http://elections.huffingtonpost.com/pollster/2016-iowa-presidential-republican-caucus.csv") 140 | ``` 141 | 142 | Let's take a look at the data: 143 | ```{r, eval=FALSE} 144 | View(dem_polls) 145 | View(rep_polls) 146 | 147 | glimpse(dem_polls) 148 | glimpse(rep_polls) 149 | ``` 150 | 151 | We see there is a lot of information in each data frame. First let's use 152 | `dplyr` to select a subset of the columns. 153 | ```{r} 154 | dem_polls <- dem_polls %>% 155 | select(Pollster, `End Date`, Clinton:Undecided) 156 | 157 | rep_polls <- rep_polls %>% 158 | select(Pollster, `End Date`, Trump:Walker) 159 | ``` 160 | 161 | In the democratic and republican polling data sets, there is one column 162 | representing the polling percentages for each candidate, 163 | similar to the stock price data set with multiple columns 164 | representing different companies. To **tidy** it, we need to *gather* these 165 | columns into a two-column *key-value* pair. This is often described as 166 | transforming a _wide_ data set into a _long_ data set. 167 | 168 | 169 | 170 | 171 | # gather() 172 | 173 | This function gathers multiple columns and collapses them into new 174 | *key-value* pairs. This transforms data from a _wide_ format into 175 | a _long_ format. 176 | 177 | * The `key` is the name of the _new_ column that you are creating which 178 | contains the values of the column headings that you are gathering 179 | * The `value` is the name of the _new_ column that will contain the values 180 | themselves 181 | * The third argument defines the columns to gather 182 | 183 | ```{r} 184 | dem_polls %>% 185 | gather(key = candidate, value = percentage, Clinton:Undecided) 186 | ``` 187 | 188 | To select a range of columns by name, use the ":" (colon) operator. 189 | 190 | #### Assessment 191 | Using the democratic poll data, apply the `gather()` function 192 | to tidy the poll data by _excluding_ the Pollster and End Date columns, 193 | rather than directly providing the column names to gather. 194 | 195 | Hint: Look at the `gather()` help file on how to exclude column names. 196 | 197 | ```{r} 198 | ## Provide your code here 199 | 200 | dem_polls %>% 201 | gather(key = candidate, value = percentage, -c(Pollster, `End Date`)) 202 | 203 | ## To select all the columns *except* a specific column, 204 | ## use the "-" (subtraction) operator (also known as negative indexing) 205 | ``` 206 | 207 | 208 | #### Assessment 209 | Using the "tidy" democratic poll data, use dplyr to filter for only 210 | the following candidates (Clinton, Sanders, O'Malley) and for polls 211 | only ending after May 1, 2015.
212 | 213 | ```{r} 214 | ## Provide your code here 215 | 216 | dem_polls %>% 217 | gather(key = candidate, value = percentage, Clinton:Undecided) %>% 218 | filter(candidate %in% c("Clinton", "Sanders", "O'Malley") & 219 | `End Date` >= "2015-05-01") 220 | ``` 221 | 222 | #### Assessment (optional) 223 | Using the tidy and filtered democratic poll data set, 224 | use `ggplot2` to plot the results from each poll (percentage) for each 225 | of the candidates. Color the lines by the candidate. 226 | 227 | ```{r} 228 | ## Provide your code here 229 | 230 | library(ggplot2) 231 | 232 | dem_polls %>% 233 | gather(key = candidate, value = percentage, Clinton:Undecided) %>% 234 | filter(candidate %in% c("Clinton", "Sanders", "O'Malley") & 235 | `End Date` >= "2015-05-01") %>% 236 | ggplot(aes(x=`End Date`, y = percentage, color = candidate)) + 237 | geom_line() 238 | ``` 239 | 240 | 241 | #### Assessment (optional) 242 | Repeat this analysis using the republican poll data. 243 | Filter for candidates (Trump, Cruz, Rubio, Carson, Bush) 244 | and for polls ending only after May 1, 2015. Color the lines by candidate. 245 | 246 | ```{r} 247 | ## Provide your code here 248 | 249 | rep_polls %>% 250 | gather(key = candidate, value = percentage, Trump:Walker) %>% 251 | filter(candidate %in% c("Trump", "Cruz", "Rubio", "Carson", "Bush") & 252 | `End Date` >= "2015-05-01") %>% 253 | ggplot(aes(x=`End Date`, y = percentage, color = candidate)) + 254 | geom_line() 255 | ``` 256 | 257 | 258 | 259 | 260 | # spread() 261 | 262 | In contrast to *gathering* multiple columns into key-value pairs, we can 263 | *spread* a key-value pair across multiple columns. 264 | 265 | The function `spread()` does just that. It transforms data from a _long_ 266 | format into a _wide_ format. 267 | 268 | * The `key` is the name of the column in your data set that 269 | contains the values of the column headings that you are spreading across 270 | multiple columns 271 | * The `value` is the name of the column that contains the values for the 272 | multiple columns 273 | 274 | 275 | ```{r} 276 | dem_polls_gathered <- dem_polls %>% 277 | gather(key = candidate, value = percentage, 278 | Clinton:Undecided) 279 | dem_polls_gathered 280 | 281 | dem_polls_gathered %>% 282 | spread(key = candidate, value = percentage) 283 | ``` 284 | 285 | 286 | ## Other supporting functions in tidyr 287 | 288 | * `separate()` = separate one column into multiple columns 289 | * `unite()` = unite multiple columns into one 290 | 291 | ```{r} 292 | dem_polls_separate <- dem_polls %>% 293 | separate(col = `End Date`, into = c("y", "m", "d")) 294 | dem_polls_separate 295 | ``` 296 | 297 | #### Assessment 298 | 299 | Use the `unite()` function to create a new column titled "end_date" that 300 | combines the columns `y`, `m` and `d` together into a single column separated 301 | by the "/" character.
302 | 303 | ```{r} 304 | ## Provide your code here 305 | 306 | dem_polls_separate %>% 307 | unite(col = end_date, y, m, d, sep = "/") 308 | ``` 309 | 310 | 311 | # Cheatsheets 312 | 313 | * [Data Wrangling with dplyr and tidyr from RStudio](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf) 314 | 315 | 316 | 317 | -------------------------------------------------------------------------------- /lectures/wrangling/data-wrangling.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Wrangling" 3 | output: html_document 4 | --- 5 | 6 | ## Data Wrangling 7 | 8 | In the real world, data science projects rarely involve data that can be easily imported ready for analysis. According to Wikipedia: 9 | 10 | >Data munging or data wrangling is loosely the process of manually converting or mapping data from one "raw" form into another format that allows for more convenient consumption of the data with the help of semi-automated tools. 11 | 12 | Our dataset provides an example: 13 | 14 | ```{r} 15 | url <- "https://raw.githubusercontent.com/datasciencelabs/data/master/bio260-heights.csv" 16 | dat <- read.csv(url) 17 | ``` 18 | 19 | First note how we make assignments in R: we use `<-`. We can also use the equal sign `=`, although here we try to stick to `<-` to make it very clear it is an assignment and not a logical statement. 20 | 21 | We also note that we have put the content of what comes out of `read.csv` into an _object_. We picked the object name `dat`. 22 | 23 | So what is `dat` exactly? We can get a quick summary of what an object is with the function `str` (stands for structure): 24 | 25 | ```{r} 26 | str(dat) 27 | ``` 28 | 29 | Here we see that this object is a `data.frame`. These are one of the most widely used data types in R. They are particularly useful for storing tables. 30 | 31 | To see more of this object we can type its name. 32 | 33 | 34 | Now we want to describe the heights. We could simply report the list of numbers. But there is a problem. Take a look at the entries: 35 | ```{r,eval=FALSE} 36 | View(dat) 37 | ``` 38 | 39 | Notice that not all entries are numbers. Furthermore, they are not all in inches. So what do we do? We need to wrangle. 40 | 41 | #### Extracting columns 42 | 43 | To extract columns from the data.frame we use the `$` character like this: 44 | 45 | ```{r, eval=FALSE} 46 | dat$Timestamp 47 | ``` 48 | 49 | This now gives us a vector. We can access elements of the vector using the `[` symbol: 50 | 51 | ```{r} 52 | dat$Timestamp[2] 53 | ``` 54 | 55 | #### Quick Review of Vectors 56 | 57 | Vectors are sequences of data elements of the same type. Many of the operations used to analyze data are applied to vectors. In R, vectors can be numeric, character, or logical. 58 | 59 | The most basic way to create a vector is with the function `c`: 60 | ```{r} 61 | x <- c(1,2,3,4,5) 62 | ``` 63 | 64 | Two very common ways of generating vectors are using `:` or the `seq` function: 65 | 66 | ```{r} 67 | x <- 1:5 68 | x <- seq(1,5) 69 | ``` 70 | 71 | Vectors can have names: 72 | 73 | ```{r} 74 | names(x) <- letters[1:5] 75 | x 76 | ``` 77 | 78 | 79 | #### Coercion 80 | 81 | Vectors need to be homogeneous. But when R is instructed to create a vector of different types, it does not give an error. Instead it tries to _coerce_ values to be the same. Here is an example: 82 | 83 | ```{r} 84 | height <- c(60, 59, 55, "5'5", 70) 85 | height 86 | ``` 87 | 88 | Note that no warning or error was given.
It simply changed everything to a character. This is important to know because sometimes we make a mistake in entering data and receive no error message. 89 | 90 | 91 | ## Data Manipulation with `dplyr` 92 | 93 | R provides an incredibly powerful and flexible language for data manipulation. However, the syntax is somewhat hard to get used to. We will therefore introduce a package that makes the syntax read much more like English. This package is `dplyr`, which you should install if you have not done so already. 94 | 95 | ```{r, message=FALSE} 96 | library(dplyr) 97 | ``` 98 | 99 | When using `dplyr` we recommend reading in data with the functions in the `readr` package: 100 | 101 | ```{r} 102 | library(readr) 103 | dat <- read_csv("https://raw.githubusercontent.com/datasciencelabs/data/master/bio260-heights.csv") 104 | ``` 105 | 106 | This object is now a special type of `data.frame` called `tbl_df` that has a nicer printing method. We can now simply evaluate an expression with just the object and see a meaningful summary instead of everything. 107 | 108 | ```{r} 109 | dat 110 | ``` 111 | 112 | #### Selecting columns 113 | 114 | Right, we are interested in looking at heights. We can select just that column using: 115 | 116 | ```{r} 117 | select(dat, contains("height")) 118 | ``` 119 | 120 | We have a problem: this is a `character`. We want numbers. 121 | 122 | ## Renaming columns 123 | 124 | Before we continue it will be convenient to change the names of our columns to something shorter. 125 | 126 | ```{r} 127 | names(dat) <- c("time","gender","height") 128 | ``` 129 | 130 | ## Vectorization 131 | 132 | ```{r} 133 | height <- c(60, 59, 55, "5'5", 70) 134 | height[3] 135 | as.numeric(height[3]) 136 | ``` 137 | 138 | One powerful feature of R is that we can _vectorize_ 139 | most operations: 140 | 141 | ```{r} 142 | as.numeric(height) 143 | ``` 144 | Note that now we do receive a warning. This is because R has no idea how to convert "5'5" to a number. 145 | 146 | ## Missing values 147 | 148 | Note the `NA` value in the object above. 149 | 150 | These are missing values. We can find out which values are missing using the function `is.na`: 151 | 152 | ```{r,eval=FALSE} 153 | ?is.na 154 | ``` 155 | 156 | ## Adding columns 157 | ```{r} 158 | dat <- mutate(dat, numeric_height=as.numeric(height), 159 | original=height) 160 | ``` 161 | 162 | ## Subsetting Observations 163 | 164 | To see all the rows in which we have problems: 165 | 166 | ```{r} 167 | filter(dat, is.na(numeric_height)) 168 | ``` 169 | 170 | ## The Pipe 171 | 172 | ```{r} 173 | filter(dat, is.na(numeric_height)) %>% select(height) 174 | ``` 175 | 176 | Let's see more 177 | 178 | ```{r} 179 | filter(dat, is.na(numeric_height)) %>% select(height) %>% print(n=21) 180 | ``` 181 | 182 | #### `gsub` 183 | 184 | One of the most useful functions for data wrangling is `gsub`. It lets us search for characters and substitute them with others. More generally, it searches for regular expressions. We will learn about those later. 185 | 186 | Here is an example: 187 | 188 | ```{r} 189 | x <- dat$height[109:116] 190 | x 191 | ``` 192 | 193 | Note that the data uses both `'` and `ft` to mean the same thing. To simplify the problem we want to substitute one for the other. `gsub` does the trick: 194 | 195 | ```{r} 196 | x <- gsub("ft", "'", x) 197 | x 198 | ``` 199 | 200 | The word inches is not doing anything here so we might as well remove it.
```{r} 203 | x <- gsub("inches","",x) 204 | x 205 | ``` 206 | 207 | 208 | We are now ready to start fixing the height data: 209 | 210 | ```{r} 211 | dat <- mutate(dat, height= gsub("ft","'",height) ) %>% 212 | mutate(height= gsub("\"|inches| |''","",height) ) 213 | ``` 214 | 215 | 216 | 217 | ## Functions 218 | 219 | Up to now we have used prebuilt functions. However, many times we have to construct our own. We can do this in R using `function`: 220 | 221 | ```{r} 222 | avg <- function(x){ 223 | return( sum(x) / length(x) ) 224 | } 225 | avg( 1:5 ) 226 | ``` 227 | 228 | Assessment: Construct a function that computes the variance defined as follows for a vector $x_1,\dots,x_n$: 229 | 230 | $$ \frac{1}{n} \sum_{i=1}^n ( x_i - \mu)^2 \mbox{ with } \mu=\frac{1}{n}\sum_{i=1}^n x_i 231 | $$ 232 | 233 | What is the variance of `1:5` ? 234 | 235 | Assessment: Write a function `convert` that takes two arguments, feet and inches, as characters and returns the height in inches. 236 | 237 | Here we construct a more complicated function that converts 5'4 to `5*12+4`: 238 | ```{r} 239 | fixheight <- function(x){ 240 | y <- strsplit(x, "'") # split "5'4" into c("5", "4") 241 | ret <- sapply(y, function(z){ 242 | ifelse( length(z)>1, as.numeric(z[1])*12 + as.numeric(z[2]) , # feet and inches 243 | as.numeric(z[1])) # already in inches 244 | }) 245 | return(ret) 246 | } 247 | ``` 248 | 249 | We can now test the function 250 | ```{r} 251 | fixheight( "70") 252 | fixheight( "5'10") 253 | fixheight( c("5'9","70","5'11")) 254 | ``` 255 | 256 | Finally we can mutate our data: 257 | 258 | ```{r} 259 | dat <- mutate(dat, height=fixheight(height)) %>% select(-numeric_height) 260 | ``` 261 | 262 | The last call to select removes the now unnecessary column `numeric_height`. Let's see the result: 263 | 264 | ```{r} 265 | filter(dat, is.na(height)) %>% select(height) 266 | ``` 267 | 268 | We have removed all the NAs. 269 | -------------------------------------------------------------------------------- /lectures/wrangling/pics/dplyr_binding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/dplyr_binding.png -------------------------------------------------------------------------------- /lectures/wrangling/pics/dplyr_filtering_joins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/dplyr_filtering_joins.png -------------------------------------------------------------------------------- /lectures/wrangling/pics/dplyr_mutating_joins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/dplyr_mutating_joins.png -------------------------------------------------------------------------------- /lectures/wrangling/pics/dplyr_set_operations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/dplyr_set_operations.png -------------------------------------------------------------------------------- /lectures/wrangling/pics/dplyr_two_tables.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/dplyr_two_tables.png -------------------------------------------------------------------------------- /lectures/wrangling/pics/stocks-by-company.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/stocks-by-company.png -------------------------------------------------------------------------------- /lectures/wrangling/pics/stocks-by-time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/stocks-by-time.png -------------------------------------------------------------------------------- /lectures/wrangling/pics/stocks-tidy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datasciencelabs/2016/660e3cf0677c165130dee2a9dc4e80c958b87475/lectures/wrangling/pics/stocks-tidy.png -------------------------------------------------------------------------------- /lectures/wrangling/rvest-scraping.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Web scraping/processing with rvest and stringr" 3 | author: "David Robinson" 4 | date: "April 11, 2016" 5 | output: html_document 6 | --- 7 | 8 | ```{r echo = FALSE} 9 | library(knitr) 10 | opts_chunk$set(message = FALSE) 11 | ``` 12 | 13 | We've learned how to process ready-made datasets, as well as read them in. But what if your data is on a website, formatted to be read by humans rather than by R? 14 | 15 | We're going to learn to extract data from regular web pages so that it can be analyzed in R. This process is sometimes called "web-scraping" or "screen-scraping", and the rvest package is a powerful tool for doing it. 16 | 17 | ### Resources 18 | 19 | #### rvest/CSS Selectors 20 | 21 | * [rvest package](https://github.com/hadley/rvest) 22 | * [SelectorGadget tool](http://selectorgadget.com/) 23 | * [rvest and SelectorGadget guide](https://cran.r-project.org/web/packages/rvest/vignettes/selectorgadget.html) 24 | * [Awesome tutorial for CSS Selectors](http://flukeout.github.io/#) 25 | 26 | #### stringr/regular expressions 27 | 28 | * [Introduction to stringr](https://cran.r-project.org/web/packages/stringr/vignettes/stringr.html) 29 | * [Regular Expressions/stringr tutorial](https://stat545-ubc.github.io/block022_regular-expression.html) 30 | * [Regular Expression online tester](https://regex101.com/#python)- explains a regular expression as it is built, and confirms live whether and how it matches particular text. 31 | 32 | ### Amazon Reviews 33 | 34 | We're going to be scraping [this page](http://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/product-reviews/0387981403/ref=cm_cr_dp_qt_see_all_top?ie=UTF8&showViewpoints=1&sortBy=helpful): it just contains the (first page of) reviews of the ggplot2 book by Hadley Wickham. 35 | 36 | ```{r} 37 | library(dplyr) 38 | library(stringr) 39 | 40 | url <- "http://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/product-reviews/0387981403/ref=cm_cr_dp_qt_see_all_top?ie=UTF8&showViewpoints=1&sortBy=helpful" 41 | ``` 42 | 43 | We use the rvest package to download this page.
44 | 45 | ```{r} 46 | library(rvest) 47 | 48 | h <- read_html(url) 49 | ``` 50 | 51 | Now `h` is an `xml_document` that contains the contents of the page: 52 | 53 | ```{r} 54 | h 55 | ``` 56 | 57 | How can you actually pull the interesting information out? That's where CSS selectors come in. 58 | 59 | ### CSS Selectors 60 | 61 | CSS selectors are a way to specify a subset of nodes (that is, units of content) on a web page (e.g., just getting the titles of reviews). CSS selectors are very powerful and not too challenging to master - here's [a great tutorial](http://flukeout.github.io/#). But honestly, you can get a lot done even with very little understanding by using a tool called SelectorGadget. 62 | 63 | Install the [SelectorGadget](http://selectorgadget.com/) extension in your web browser. (If you use Chrome you can use the Chrome extension, otherwise drag the provided link into your bookmarks bar). [Here's a guide for how to use it with rvest to "point-and-click" your way to a working selector](http://selectorgadget.com/). 64 | 65 | For example, if you just want the titles, you'll end up with a selector that looks something like `.a-color-base`. You can pipe your HTML object along with that selector into the `html_nodes` function, to select just those nodes: 66 | 67 | ```{r} 68 | h %>% 69 | html_nodes(".a-color-base") 70 | ``` 71 | 72 | But you need the text from each of these, not the full tags. Pipe to the `html_text` function to pull these out: 73 | 74 | ```{r} 75 | review_titles <- h %>% 76 | html_nodes(".a-color-base") %>% 77 | html_text() 78 | 79 | review_titles 80 | ``` 81 | 82 | Now we've extracted something useful! Similarly, let's grab the format (hardcover or paperback). Some experimentation with SelectorGadget shows it's: 83 | 84 | ```{r} 85 | h %>% 86 | html_nodes(".a-size-mini.a-color-secondary") %>% 87 | html_text() 88 | ``` 89 | 90 | Now, we may be annoyed that it always starts with `Format: `. Let's introduce the `stringr` package. 91 | 92 | ```{r} 93 | library(stringr) 94 | 95 | formats <- h %>% 96 | html_nodes(".a-size-mini.a-color-secondary") %>% 97 | html_text() %>% 98 | str_replace("Format: ", "") 99 | 100 | formats 101 | ``` 102 | 103 | ### Number of stars 104 | 105 | Next, let's get the number of stars. Some clicking with SelectorGadget finds a selector expression that will work: 106 | 107 | ```{r} 108 | h %>% 109 | html_nodes("#cm_cr-review_list .review-rating") 110 | ``` 111 | 112 | We can confirm these are the right tags (and there are ten of them, just like there are ten titles - a good start). There's more going on in these that we don't need to worry about (they aren't just text, they're replaced with images in the web page), but using `html_text` still gets out relevant text: 113 | 114 | ```{r} 115 | h %>% 116 | html_nodes("#cm_cr-review_list .review-rating") %>% 117 | html_text() 118 | ``` 119 | 120 | Now we need to pull out just the digit, 1-5. This can be done with regular expressions. Regular expressions are very powerful tools for working with text through "patterns"- see [here](http://www.regular-expressions.info/) for one resource. 121 | 122 | We'll use the [stringr](https://cran.r-project.org/web/packages/stringr/vignettes/stringr.html) package: 123 | 124 | ```{r} 125 | h %>% 126 | html_nodes("#cm_cr-review_list .review-rating") %>% 127 | html_text() %>% 128 | str_extract("\\d") 129 | ``` 130 | 131 | Note that we piped the character vector to `str_extract`, which pulls out the parts of a string that match a pattern.
The `\\d` pattern means a digit (that is, 0-9). 132 | 133 | Finally, we have to turn them from a character vector to a numeric vector: 134 | 135 | ```{r} 136 | number_stars <- h %>% 137 | html_nodes("#cm_cr-review_list .review-rating") %>% 138 | html_text() %>% 139 | str_extract("\\d") %>% 140 | as.numeric() 141 | 142 | number_stars 143 | ``` 144 | 145 | The same applies to the number of people who found a review useful. Let's collect that too: 146 | 147 | ```{r} 148 | h %>% 149 | html_nodes("#cm_cr-review_list .review-votes") %>% 150 | html_text() 151 | ``` 152 | 153 | The difference is that here we don't want just one digit - there could be multiple. We can add a `+` (meaning "one or more") after the `\\d` in the regular expression to match that: 154 | 155 | ```{r} 156 | h %>% 157 | html_nodes("#cm_cr-review_list .review-votes") %>% 158 | html_text() %>% 159 | str_extract("\\d+") 160 | ``` 161 | 162 | You'll still need `as.numeric()`: 163 | 164 | ```{r} 165 | number_helpful <- h %>% 166 | html_nodes("#cm_cr-review_list .review-votes") %>% 167 | html_text() %>% 168 | str_extract("\\d+") %>% 169 | as.numeric() 170 | 171 | number_helpful 172 | ``` 173 | 174 | Now we have all our data, from the first page: 175 | 176 | ```{r} 177 | ret <- data_frame(review_titles, formats, number_stars, number_helpful) 178 | 179 | ret 180 | ``` 181 | 182 | ### Multiple pages 183 | 184 | Take a look at the URL for the second page: 185 | 186 | http://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/product-reviews/0387981403/ref=undefined_2?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber=2 187 | 188 | Notice the `pageNumber=2` at the end? Try adding a few values there. We see we can get all 5 URLs easily. 189 | 190 | ```{r} 191 | url_base <- "http://www.amazon.com/ggplot2-Elegant-Graphics-Data-Analysis/product-reviews/0387981403/ref=undefined_2?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber=" 192 | urls <- paste0(url_base, 1:5) 193 | urls 194 | ``` 195 | 196 | We may then want to scrape and combine all reviews. The way I like to do this is to create a `read_page_reviews` function, then to use `lapply` and dplyr's `bind_rows` to combine them: 197 | 198 | ```{r} 199 | read_page_reviews <- function(url) { 200 | # Read this page's HTML (the original version omitted this and reused the global `h`) 201 | h <- read_html(url) 202 | 203 | title <- h %>% 204 | html_nodes(".a-color-base") %>% 205 | html_text() 206 | 207 | format <- h %>% 208 | html_nodes(".a-size-mini.a-color-secondary") %>% 209 | html_text() 210 | 211 | helpful <- h %>% 212 | html_nodes("#cm_cr-review_list .review-votes") %>% 213 | html_text() %>% 214 | str_extract("\\d+") %>% 215 | as.numeric() 216 | 217 | stars <- h %>% 218 | html_nodes("#cm_cr-review_list .review-rating") %>% 219 | html_text() %>% 220 | str_extract("\\d") %>% 221 | as.numeric() 222 | 223 | data_frame(title, format, stars, helpful) 224 | } 225 | 226 | ggplot2_reviews <- bind_rows(lapply(urls, read_page_reviews)) 227 | 228 | knitr::kable(ggplot2_reviews) 229 | ``` 230 | --------------------------------------------------------------------------------