├── Employee_Earnings_Report_2014.csv
├── Lab1.R
├── README.md
├── bostonpayroll2013.csv
├── excel-to-r.R
├── excel-to-r.Rproj
└── function.R


/Lab1.R:
--------------------------------------------------------------------------------
  1 | ## Introduction to Data Journalism
  2 | ## Coding Lab 1
  3 | 
  4 | # working with R --------------------------
  5 | x <- c(4, 4, 5, 6, 7, 2, 9)  # c() combines the values into one numeric vector
  6 | length(x) ; mean(x)  # ";" separates two expressions written on one line
  7 | plot(x)  # plot the vector
  8 | 
  9 | 
 10 | # look at the women dataset
 11 | data(women)
 12 | class(women)
 13 | print(women)
 14 | summary(women)
 15 | plot(women)
 16 | 
 17 | # fit a regression of weight on height
 18 | women.lm <- lm(weight~height, data=women)
 19 | class(women.lm)
 20 | print(women.lm)
 21 | summary(women.lm)
 22 | plot(women.lm)
 23 | 
 24 | # managing the workspace
 25 | getwd()  # current working directory
 26 | ls()     # names of the objects in the workspace
 27 | rm(x)    # remove x from the workspace
 28 | 
 29 | # getting help
 30 | help(median)
 31 | ??median  # shorthand for help.search("median")
 32 | 
 33 | # working with packages
 34 | install.packages("vcd")  # requires an internet connection
 35 | library(vcd)
 36 | help(package="vcd")  # help index of the package
 37 | data(package="vcd")  # data sets shipped with the package
 38 | help(Arthritis)
 39 | Arthritis
 40 | example(Arthritis)
 41 | 
 42 | # importing data (NOTE(review): neither input file below appears in this repository's file listing)
 43 | districts <- read.csv("district means grade equivalent std.csv")
 44 | 
 45 | library(readxl)
 46 | tstops <- read_excel("tstops_jan2014.xlsx")
 47 | 
 48 | # working with data sets
 49 | dim(tstops)
 50 | ncol(tstops)
 51 | nrow(tstops)
 52 | str(tstops)
 53 | summary(tstops)
 54 | View(tstops)
 55 | save(tstops, file="tstops.rdata")  # write the object to disk
 56 | rm(tstops)                         # drop it from the workspace
 57 | load(file="tstops.rdata")          # restore it from the saved file
 58 | 
 59 | # data structures --------------------------
 60 | 
 61 | # vectors (one type per vector)
 62 | a <- c(1, 2, 5, 3, 6, -2, 4)  # numeric
 63 | b <- c("one", "two", "three")  # character
 64 | c <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE)  # logical; NOTE(review): this name shadows base c()
 65 | 
 66 | # identifying vector elements (positions start at 1)
 67 | a <- c(1, 2, 5, 3, 6, -2, 4)
 68 | a[3]  # third element
 69 | a[c(1, 3, 5)]  # first, third and fifth elements
 70 | a[2:6]  # second through sixth elements
 71 | 
 72 | # data frames: equal-length vectors become the columns
 73 | patientID   <- c(111, 208, 113, 408)
 74 | age         <- c(25, 34, 28, 52)
 75 | sex         <- c(1, 2, 1, 1)
 76 | diabetes    <- c("Type1", "Type2", "Type1", "Type1")
 77 | status      <- c(1, 2, 3, 1)
 78 | patientdata <- data.frame(patientID, age, sex, diabetes, status)
 79 | patientdata
 80 | 
 81 | # specifying the elements of a data frame
 82 | patientdata[1:2]  # columns 1 and 2
 83 | patientdata[c("diabetes", "status")]  # columns selected by name
 84 | patientdata$age  # a single column, returned as a vector
 85 | 
 86 | patientdata[2:3, 1:2]  # rows 2-3 of columns 1-2
 87 | 
 88 | # factors: recode the numeric codes as labelled categories
 89 | patientdata$sex <- factor(patientdata$sex, 
 90 |              levels=c(1, 2),
 91 |              labels=c("Male", "Female"))
 92 | 
 93 | patientdata$status <- factor(patientdata$status, ordered=TRUE,  # ordered: Poor < Improved < Excellent
 94 |                          levels=c(1, 2, 3),
 95 |                          labels=c("Poor", "Improved", "Excellent"))
 96 | 
 97 | patientdata
 98 | str(patientdata)
 99 | 
100 | # lists: components may have different types and sizes
101 | g <- "My First List"
102 | h <- c(25, 26, 18, 39)
103 | j <- matrix(1:10, nrow = 5)
104 | k <- c("one", "two", "three")
105 | 
106 | mylist <- list(title = g, ages = h, j, k)  # only the first two components are named
107 |  
108 | mylist
109 | 
110 | # specifying components and elements of a list
111 | mylist[[2]]  # second component (ages)
112 | mylist[["ages"]]  # the same component, selected by name
113 | mylist[[2]][2]  # second element of ages
114 | mylist[[3]][2,2]  # row 2, column 2 of the matrix j
115 | 
116 | 
117 | # data management with dplyr -------------------------
118 | library(dplyr)
119 | 
120 | # subset data by selecting rows
121 | df1 <- filter(mtcars, cyl == 4, mpg > 20)   # comma-separated conditions are ANDed
122 | df2 <- filter(mtcars, cyl == 4 & mpg > 20)  # same
123 | df3 <- filter(mtcars, cyl %in% c(4, 6) | am == 1)
124 | 
125 | # subset data by selecting columns (variables)
126 | df1 <- select(mtcars, mpg, cyl, wt)    # by name
127 | df2 <- select(mtcars, mpg:qsec, carb)  # a range of columns plus one more
128 | df3 <- select(mtcars, -am, -carb)      # everything except am and carb
129 | 
130 | # reorder rows
131 | df1 <- arrange(mtcars, cyl)
132 | df2 <- arrange(mtcars, cyl, mpg)
133 | df3 <- arrange(mtcars, cyl, desc(mpg))  # desc() sorts mpg high-to-low within cyl
134 | 
135 | # create new variables (add new columns)
136 | df1 <- mutate(mtcars,
137 |               power = disp * hp,
138 |               am = factor(am,
139 |                           levels = c(0, 1),
140 |                           labels = c("automatic", "manual"))
141 | )
142 | 
143 | # rename variables (columns): new_name = old_name
144 | df1 <- rename(mtcars,
145 |               displacement = disp,
146 |               transmission = am)
147 | 
148 | 
149 | # aggregate data by groups
150 | df <- group_by(mtcars, cyl, gear)
151 | df2 <- summarise(df,
152 |                  disp_n = n(),
153 |                  disp_mean = mean(disp),
154 |                  disp_sd = sd(disp)
155 | )
156 | # across() applies functions to every non-grouping column (replaces summarise_each()/funs())
157 | df2 <- summarise(df, across(everything(), mean))
158 | df3 <- summarise(df, across(everything(), list(min = min, max = max)))
159 | View(df3)  # result columns are named <column>_<function>, e.g. mpg_min
160 | 
161 | # putting it all together
162 | df <- select(mtcars, cyl, disp, mpg)
163 | df <- filter(df, mpg > 20)
164 | df <- arrange(df, cyl, desc(mpg))
165 | 
166 | df <- select(mtcars, cyl, disp, mpg) %>%  # same three steps chained with the pipe
167 |   filter(mpg > 20) %>%
168 |   arrange(cyl, desc(mpg))
169 | 
170 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # excel_to_r
2 | Equivalent Excel methods as applied to R
3 | 
4 | Intro
5 | http://bit.ly/intro_to_r
6 | 
7 | First Data Analysis Steps Using R 
8 | http://bit.ly/excel_and_r
9 | 


--------------------------------------------------------------------------------
/excel-to-r.R:
--------------------------------------------------------------------------------
 1 | # Excel-style analysis steps in R, applied to the 2014 earnings report
 2 | 
 3 | # Opening a file
 4 | earnings <- read.csv("Employee_Earnings_Report_2014.csv",
 5 |                      stringsAsFactors = FALSE)
 6 | 
 7 | # How many rows?
 8 | nrow(earnings)
 9 | 
10 | # Look at the first six rows in the Console (head() shows six by default)
11 | head(earnings)
12 | 
13 | # Investigate the structure of the dataframe
14 | str(earnings)
15 | 
16 | # Change column to number format. Currency text such as "$1,234.56" must
17 | # lose both the "$" and the "," first: as.numeric() turns any string that
18 | # still contains either one into NA.
19 | earnings$TOTAL.EARNINGS <- gsub("[$,]", "", earnings$TOTAL.EARNINGS)
20 | earnings$TOTAL.EARNINGS <- as.numeric(earnings$TOTAL.EARNINGS)
21 | 
22 | # Sort by column TOTAL.EARNINGS descending
23 | earnings <- earnings[order(-earnings$TOTAL.EARNINGS), ]
24 | 
25 | # Create new column with a formula (convert OT column into numeric first)
26 | earnings$OVERTIME <- gsub("[$,]", "", earnings$OVERTIME)
27 | earnings$OVERTIME <- as.numeric(earnings$OVERTIME)
28 | 
29 | # FORMULA TIME
30 | earnings$Total.minus.OT <- earnings$TOTAL.EARNINGS - earnings$OVERTIME
31 | 
32 | # Keep only the rows that match a value (in R, it's called "subset")
33 | fire_dept <- subset(earnings, DEPARTMENT.NAME == "Boston Fire Department")
34 | 
35 | # Calculations on columns
36 | earnings_total <- sum(earnings$TOTAL.EARNINGS)
37 | earnings_avg <- mean(earnings$TOTAL.EARNINGS)
38 | earnings_median <- median(earnings$TOTAL.EARNINGS)
39 | 
40 | ## DATA TO COLUMNS IN R
41 | # The steps below treat NAME as "Last,First Middle".
42 | # Last name: everything before the comma
43 | earnings$Last.Name <- sub(",.*", "", earnings$NAME)
44 | # Given names: everything after the comma; trimws() drops a space that may
45 | # follow the comma so it is not mistaken for the first/middle separator
46 | earnings$First.Name <- trimws(sub(".*,", "", earnings$NAME))
47 | 
48 | # Middle name: whatever follows the last space in the given names, or ""
49 | # when there is no space. grepl(), sub() and ifelse() work on the whole
50 | # column at once, so no row-by-row loop is needed.
51 | has_middle <- grepl(" ", earnings$First.Name)
52 | earnings$Middle <- ifelse(has_middle,
53 |                           sub(".* ", "", earnings$First.Name),
54 |                           "")
55 | 
56 | # Cleaning First.Name column by deleting after space
57 | earnings$First.Name <- sub(" .*", "", earnings$First.Name)
58 | 
59 | # Simple Pivot table to count number of employees per Department
60 | Department_Workers <- data.frame(table(earnings$DEPARTMENT.NAME))
61 | 
62 | # Sort it
63 | Department_Workers <- Department_Workers[order(-Department_Workers$Freq), ]
64 | 
65 | # Rename Columns
66 | colnames(Department_Workers) <- c("Department", "Employees")
67 | 
68 | # Advanced calculations: total earnings per department
69 | income <- tapply(earnings$TOTAL.EARNINGS, earnings$DEPARTMENT.NAME, sum)
70 | 
71 | # Convert the table into a dataframe
72 | income <- data.frame(income)
73 | 
74 | # Create a column based on row names
75 | income$Department <- rownames(income)
76 | 
77 | # Need the column of row names to merge it with the department workers count
78 | merged <- merge(Department_Workers, income, by = "Department")
79 | 
80 | # Sort it one more time by income
81 | merged <- merged[order(-merged$income), ]
82 | 
83 | # Save it as a csv
84 | write.csv(merged, "merged.csv")


--------------------------------------------------------------------------------
/excel-to-r.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/function.R:
--------------------------------------------------------------------------------
 1 | # Payroll analysis wrapped in a reusable function
 2 | 
 3 | #' Clean a payroll CSV and write a per-department earnings summary
 4 | #'
 5 | #' Reads the CSV at `x`, converts the TOTAL.EARNINGS and OVERTIME currency
 6 | #' text to numbers, splits NAME into last, first and middle parts, and writes
 7 | #' one row per department (employee count and summed TOTAL.EARNINGS, largest
 8 | #' sum first) to `<x>_analyzed.csv`.
 9 | #'
10 | #' @param x Path to a CSV file with NAME, TOTAL.EARNINGS and OVERTIME columns
11 | #'   plus a department column.
12 | #' @param department_col Name of the department column.
13 | #' @return Invisibly, a list with `earnings` (the cleaned rows, lowest
14 | #'   TOTAL.EARNINGS first), `fire_dept` (the "Boston Fire Department" rows),
15 | #'   `totals` (sum, mean and median of TOTAL.EARNINGS) and `by_department`
16 | #'   (the table that was written to disk).
17 | earnings_function <- function(x, department_col = "DEPARTMENT") {
18 |   earnings <- read.csv(x, stringsAsFactors = FALSE)
19 | 
20 |   # Fail early with one readable message instead of a cryptic error later
21 |   required <- c("NAME", "TOTAL.EARNINGS", "OVERTIME", department_col)
22 |   absent <- setdiff(required, names(earnings))
23 |   if (length(absent) > 0) {
24 |     stop("Missing column(s) in ", x, ": ",
25 |          paste(absent, collapse = ", "), call. = FALSE)
26 |   }
27 | 
28 |   # Values are only auto-printed at top level, so inside a function the
29 |   # row count and preview have to be printed explicitly
30 |   print(nrow(earnings))
31 |   print(head(earnings))
32 |   str(earnings)
33 | 
34 |   # Currency text such as "$1,234.56" must lose "$" and "," before
35 |   # as.numeric() can parse it
36 |   to_number <- function(amount) as.numeric(gsub("[$,]", "", amount))
37 |   earnings$TOTAL.EARNINGS <- to_number(earnings$TOTAL.EARNINGS)
38 |   earnings$OVERTIME <- to_number(earnings$OVERTIME)
39 | 
40 |   # Sort by column TOTAL.EARNINGS, lowest first
41 |   earnings <- earnings[order(earnings$TOTAL.EARNINGS), ]
42 | 
43 |   # FORMULA TIME
44 |   earnings$Total.minus.OT <- earnings$TOTAL.EARNINGS - earnings$OVERTIME
45 | 
46 |   # Keep only the fire department rows; %in% never yields NA, so rows with
47 |   # a missing department are dropped rather than returned as NA rows
48 |   department <- earnings[[department_col]]
49 |   fire_dept <- earnings[department %in% "Boston Fire Department", ,
50 |                         drop = FALSE]
51 | 
52 |   # Calculations on columns
53 |   totals <- c(
54 |     total = sum(earnings$TOTAL.EARNINGS),
55 |     mean = mean(earnings$TOTAL.EARNINGS),
56 |     median = median(earnings$TOTAL.EARNINGS)
57 |   )
58 | 
59 |   ## DATA TO COLUMNS IN R
60 |   # The steps below treat NAME as "Last,First Middle"
61 |   earnings$Last.Name <- sub(",.*", "", earnings$NAME)
62 |   earnings$First.Name <- trimws(sub(".*,", "", earnings$NAME))
63 | 
64 |   # Middle name: text after the last space of the given names, or "" when
65 |   # there is no space (sub() alone would copy the first name into Middle)
66 |   has_middle <- grepl(" ", earnings$First.Name)
67 |   earnings$Middle <- ifelse(has_middle,
68 |                             sub(".* ", "", earnings$First.Name),
69 |                             "")
70 | 
71 |   # Modify First.Name column by deleting after space
72 |   earnings$First.Name <- sub(" .*", "", earnings$First.Name)
73 | 
74 |   # Simple pivot table to count number of employees per department
75 |   Department_Workers <- data.frame(table(department))
76 |   Department_Workers <- Department_Workers[order(-Department_Workers$Freq), ]
77 |   colnames(Department_Workers) <- c("Department", "Employees")
78 | 
79 |   # Total earnings per department; tapply() returns a named array, and
80 |   # data.frame() turns those names into row names
81 |   income <- tapply(earnings$TOTAL.EARNINGS, department, sum)
82 |   income <- data.frame(income)
83 |   income$Department <- rownames(income)
84 | 
85 |   # Join counts and sums on the department name, largest sum first
86 |   merged <- merge(Department_Workers, income, by = "Department")
87 |   merged <- merged[order(-merged$income), ]
88 | 
89 |   # Save it as a csv next to the input file
90 |   filename <- paste0(x, "_analyzed.csv")
91 |   write.csv(merged, filename)
92 | 
93 |   invisible(list(
94 |     earnings = earnings,
95 |     fire_dept = fire_dept,
96 |     totals = totals,
97 |     by_department = merged
98 |   ))
99 | }


--------------------------------------------------------------------------------