├── Employee_Earnings_Report_2014.csv ├── Lab1.R ├── README.md ├── bostonpayroll2013.csv ├── excel-to-r.R ├── excel-to-r.Rproj └── function.R /Lab1.R: --------------------------------------------------------------------------------
## Introduction to Data Journalism
## Coding Lab 1

# working with R --------------------------
x <- c(4, 4, 5, 6, 7, 2, 9)
length(x)
mean(x)
plot(x) # plot the vector


# look at the women dataset
data(women)
class(women)
print(women)
summary(women)
plot(women)

# fit a regression
women.lm <- lm(weight ~ height, data = women)
class(women.lm)
print(women.lm)
summary(women.lm)
plot(women.lm)

# managing the workspace
getwd()
ls()
rm(x)

# getting help
help(median)
??median

# working with packages
# install vcd only when it is missing, so re-running the lab does not
# re-download it every time (installing requires an internet connection)
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd")
}
library(vcd)
help(package = "vcd")
data(package = "vcd")
help(Arthritis)
Arthritis
example(Arthritis)

# importing data
districts <- read.csv("district means grade equivalent std.csv")

library(readxl)
tstops <- read_excel("tstops_jan2014.xlsx")

# working with data sets
dim(tstops)
ncol(tstops)
nrow(tstops)
str(tstops)
summary(tstops)
View(tstops)
save(tstops, file = "tstops.rdata")
rm(tstops)
load(file = "tstops.rdata")

# data structures --------------------------

# vectors
a <- c(1, 2, 5, 3, 6, -2, 4)
b <- c("one", "two", "three")
c <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE)

# identifying vector elements
a <- c(1, 2, 5, 3, 6, -2, 4)
a[3]
a[c(1, 3, 5)]
a[2:6]

# data frames
patientID <- c(111, 208, 113, 408)
age <- c(25, 34, 28, 52)
sex <- c(1, 2, 1, 1)
diabetes <- c("Type1", "Type2", "Type1", "Type1")
status <- c(1, 2, 3, 1)
patientdata <- data.frame(patientID, age, sex, diabetes, status)
patientdata

# specifying the elements of a data frame
patientdata[1:2]
patientdata[c("diabetes", "status")]
patientdata$age

patientdata[2:3, 1:2]

# factors
patientdata$sex <- factor(patientdata$sex,
  levels = c(1, 2),
  labels = c("Male", "Female")
)

patientdata$status <- factor(patientdata$status,
  ordered = TRUE,
  levels = c(1, 2, 3),
  labels = c("Poor", "Improved", "Excellent")
)

patientdata
str(patientdata)

# lists
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow = 5)
k <- c("one", "two", "three")

mylist <- list(title = g, ages = h, j, k)

mylist

# specifying components and elements of a list
mylist[[2]]
mylist[["ages"]]
mylist[[2]][2]
mylist[[3]][2, 2]


# data management with dplyr -------------------------
library(dplyr)

# subset data by selecting rows
df1 <- filter(mtcars, cyl == 4, mpg > 20)
df2 <- filter(mtcars, cyl == 4 & mpg > 20) # same
df3 <- filter(mtcars, cyl %in% c(4, 6) | am == 1)

# subset data by selecting columns (variables)
df1 <- select(mtcars, mpg, cyl, wt)
df2 <- select(mtcars, mpg:qsec, carb)
df3 <- select(mtcars, -am, -carb)

# reorder rows
df1 <- arrange(mtcars, cyl)
df2 <- arrange(mtcars, cyl, mpg)
df3 <- arrange(mtcars, cyl, desc(mpg))

# create new variables (add new columns)
df1 <- mutate(mtcars,
  power = disp * hp,
  am = factor(am,
    levels = c(0, 1),
    labels = c("automatic", "manual")
  )
)

# rename variables (columns)
df1 <- rename(mtcars,
  displacement = disp,
  transmission = am
)


# aggregate data by groups
df <- group_by(mtcars, cyl, gear)
df2 <- summarise(df,
  disp_n = n(),
  disp_mean = mean(disp),
  disp_sd = sd(disp)
)

# apply one or several summary functions to every non-grouping column;
# across() replaces the deprecated summarise_each() / funs() pair
df2 <- summarise(df, across(everything(), mean))
df3 <- summarise(df, across(everything(), list(min = min, max = max)))
View(df3)

# putting it all together
df <- select(mtcars, cyl, disp, mpg)
df <- filter(df, mpg > 20)
df <- arrange(df, cyl, desc(mpg))

# the same three steps written as one pipeline
df <- select(mtcars, cyl, disp, mpg) %>%
  filter(mpg > 20) %>%
  arrange(cyl, desc(mpg))

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # excel_to_r 2 | Equivalent Excel methods as applied to R 3 | 4 | Intro 5 | http://bit.ly/intro_to_r 6 | 7 | First Data Analysis Steps Using R 8 | http://bit.ly/excel_and_r 9 | -------------------------------------------------------------------------------- /excel-to-r.R: --------------------------------------------------------------------------------
# Opening a file

earnings <- read.csv("Employee_Earnings_Report_2014.csv", stringsAsFactors = FALSE)

# How many rows?
nrow(earnings)

# Look at the first six rows in the Console (head() shows 6 rows by default)
head(earnings)

# Investigate the structure of the dataframe
str(earnings)

# Change column to number format.
# Strip the "$" AND the thousands separator "," first: as.numeric() returns NA
# for text such as "$1,234.56" if the comma is left in.
earnings$TOTAL.EARNINGS <- as.numeric(gsub("[$,]", "", earnings$TOTAL.EARNINGS))

# Sort by column TOTAL.EARNINGS descending
earnings <- earnings[order(-earnings$TOTAL.EARNINGS), ]

# Create new column with a formula (convert OT column into numeric first)
earnings$OVERTIME <- as.numeric(gsub("[$,]", "", earnings$OVERTIME))

# FORMULA TIME
earnings$Total.minus.OT <- earnings$TOTAL.EARNINGS - earnings$OVERTIME

# Filter rows (in R, it's called "subset")
fire_dept <- subset(earnings, DEPARTMENT.NAME == "Boston Fire Department")

# Calculations on columns
earnings_total <- sum(earnings$TOTAL.EARNINGS)
earnings_avg <- mean(earnings$TOTAL.EARNINGS)
earnings_median <- median(earnings$TOTAL.EARNINGS)

## DATA TO COLUMNS IN R
# Create new columns based on the NAME column ("Last,First Middle"):
# Last.Name is everything before the comma, First.Name everything after it.
# trimws() drops a space that may follow the comma ("Last, First").
earnings$Last.Name <- sub(",.*", "", earnings$NAME)
earnings$First.Name <- trimws(sub(".*,", "", earnings$NAME))

# Create Middle name column based on First.Name column.
# Rows whose First.Name contains a space get whatever follows the last space;
# rows without a space have no middle name and stay blank.
# This is done on the whole column at once instead of looping over every row.
has_middle <- grepl(" ", earnings$First.Name, fixed = TRUE)
middle_name <- character(nrow(earnings))
middle_name[has_middle] <- sub(".* ", "", earnings$First.Name[has_middle])
earnings$Middle <- middle_name

# Cleaning First.Name column by deleting after space
earnings$First.Name <- sub(" .*", "", earnings$First.Name)

# Simple Pivot table to count number of employees per Department
Department_Workers <- data.frame(table(earnings$DEPARTMENT.NAME))

# Sort it
Department_Workers <- Department_Workers[order(-Department_Workers$Freq), ]

# Rename Columns
colnames(Department_Workers) <- c("Department", "Employees")

# Advanced calculations: total earnings per department
income <- tapply(earnings$TOTAL.EARNINGS, earnings$DEPARTMENT.NAME, sum)

# Convert the table into a dataframe
income <- data.frame(income)

# Create a column based on row names
income$Department <- rownames(income)

# Need the column of row names to merge it with the department workers count
merged <- merge(Department_Workers, income, by = "Department")

# Sort it one more time by income
merged <- merged[order(-merged$income), ]

# Save it as a csv
write.csv(merged, "merged.csv")
-------------------------------------------------------------------------------- /excel-to-r.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /function.R: 
# Analyze an employee-earnings CSV and write a per-department summary.
#
# Reads the CSV at `x`, converts the currency columns TOTAL.EARNINGS and
# OVERTIME to numbers, adds the columns Total.minus.OT, Last.Name, First.Name
# and Middle, and builds a table with one row per department (Department,
# Employees, income) sorted by income, highest first. That table is written to
# the file "<x>_analyzed.csv". The structure of the data is printed with str().
#
# Arguments:
#   x  Path of the input CSV, a single string. The file must contain the
#      columns NAME, DEPARTMENT, TOTAL.EARNINGS and OVERTIME.
#
# Returns (invisibly) a list with:
#   earnings   the cleaned data frame, sorted ascending by TOTAL.EARNINGS
#   fire_dept  the rows whose DEPARTMENT is "Boston Fire Department"
#   totals     named numeric vector: total, mean and median of TOTAL.EARNINGS
#   merged     the per-department table that was written to disk
#
# Stops with an error when `x` is not a single string or a required column is
# missing.
earnings_function <- function(x) {
  stopifnot(is.character(x), length(x) == 1L)

  # Opening a file
  earnings <- read.csv(x, stringsAsFactors = FALSE)

  required_cols <- c("NAME", "DEPARTMENT", "TOTAL.EARNINGS", "OVERTIME")
  missing_cols <- setdiff(required_cols, names(earnings))
  if (length(missing_cols) > 0) {
    stop(
      "Missing required column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Investigate the structure of the dataframe.
  # (Bare nrow()/head() calls print nothing inside a function, so only str(),
  # which prints explicitly, is kept.)
  str(earnings)

  # Change currency text to numbers: strip the "$" and the thousands
  # separator "," before converting.
  parse_currency <- function(values) {
    as.numeric(gsub("[$,]", "", values))
  }
  earnings$TOTAL.EARNINGS <- parse_currency(earnings$TOTAL.EARNINGS)

  # Sort by column TOTAL.EARNINGS, ascending. An earlier descending sort was
  # immediately overwritten by this one, so only this sort is kept.
  earnings <- earnings[order(earnings$TOTAL.EARNINGS), ]

  # Create new column with a formula (convert OT column into numeric first)
  earnings$OVERTIME <- parse_currency(earnings$OVERTIME)
  earnings$Total.minus.OT <- earnings$TOTAL.EARNINGS - earnings$OVERTIME

  # Filter rows (in R, it's called "subset")
  fire_dept <- subset(earnings, DEPARTMENT == "Boston Fire Department")

  # Calculations on columns
  totals <- c(
    total = sum(earnings$TOTAL.EARNINGS),
    mean = mean(earnings$TOTAL.EARNINGS),
    median = median(earnings$TOTAL.EARNINGS)
  )

  ## DATA TO COLUMNS IN R
  # NAME is "Last,First Middle": Last.Name is everything before the comma,
  # First.Name everything after it. trimws() drops a space that may follow
  # the comma, which would otherwise leave First.Name empty further down.
  earnings$Last.Name <- sub(",.*", "", earnings$NAME)
  earnings$First.Name <- trimws(sub(".*,", "", earnings$NAME))

  # Middle is whatever follows the last space of First.Name. Rows without a
  # space have no middle name and stay blank (a plain sub() would return the
  # whole first name for them, because the pattern does not match).
  has_middle <- grepl(" ", earnings$First.Name, fixed = TRUE)
  middle_name <- character(nrow(earnings))
  middle_name[has_middle] <- sub(".* ", "", earnings$First.Name[has_middle])
  earnings$Middle <- middle_name

  # Modify First.Name column by deleting after space
  earnings$First.Name <- sub(" .*", "", earnings$First.Name)

  # Simple Pivot table to count number of employees per Department
  Department_Workers <- data.frame(table(earnings$DEPARTMENT))
  Department_Workers <- Department_Workers[order(-Department_Workers$Freq), ]
  colnames(Department_Workers) <- c("Department", "Employees")

  # Advanced calculations: total earnings per department, as a data frame
  # whose Department column comes from the row names
  income <- tapply(earnings$TOTAL.EARNINGS, earnings$DEPARTMENT, sum)
  income <- data.frame(income)
  income$Department <- rownames(income)

  # Need the column of row names to merge it with the department workers count
  merged <- merge(Department_Workers, income, by = "Department")

  # Sort it one more time by income
  merged <- merged[order(-merged$income), ]

  # Save it as a csv
  filename <- paste0(x, "_analyzed.csv")
  write.csv(merged, filename)

  invisible(list(
    earnings = earnings,
    fire_dept = fire_dept,
    totals = totals,
    merged = merged
  ))
}