├── .gitignore ├── Data_Analysis ├── CPS_JAGS.R ├── CPS_gradebygrade_dataprep.R ├── CPS_gradebygrade_modelingexploration.R ├── CPS_gradebygrade_script.R ├── DeepDive1EDA_Counts.Rmd ├── DeepDive1EDA_Counts.html ├── README.md ├── dataprep_geocodeschools.R ├── exploratory_data_analysis.R ├── explore_8_to_9_transition.R ├── explore_enroll_totals_from_website.Rmd ├── explore_enroll_totals_from_website.html ├── grade9_transition_matrix.R ├── history_evaluation.R ├── investigate_9thgrade.R ├── investigate_juarez_change.R ├── map_neighborhood_catchment.R ├── prepare_data_and_shapefile_for_tableau.R └── topdown_FINAL.R ├── Data_Pipeline ├── README.md ├── create_features_table.R ├── create_schools_table.R ├── create_students_table.R ├── geocode_addresses.R ├── school_aggregate_features_vw.sql ├── student_ISAT_scores_vw.sql ├── student_attendance_vw.sql └── student_table_with_ninth_snapshot_vw.sql ├── README.md └── Tool_Box ├── README.md ├── cohort_survival.R ├── eval_grade9_prediction.R ├── eval_new_grade9_prediction.R ├── grade9_prediction.R ├── main.R └── model_diagnostics.R /.gitignore: -------------------------------------------------------------------------------- 1 | # Data files 2 | *.csv 3 | 4 | # History files 5 | .Rhistory 6 | 7 | # Example code in package build process 8 | *-Ex.R 9 | 10 | # R data files from past sessions 11 | .Rdata 12 | -------------------------------------------------------------------------------- /Data_Analysis/CPS_JAGS.R: -------------------------------------------------------------------------------- 1 | # --------------------------------------- # 2 | # prepare data for hierarchical model # 3 | # --------------------------------------- # 4 | 5 | # variables: SCHOOL_CODE, GRADE, GROUP (charter / neighborhood), 6 | # for progressive grades: RATIO = [2012 grade k]/[2011 grade k-1] 7 | 8 | setwd("/Volumes/appdata-3/Count Data") 9 | countdata = read.csv("enrollment_byschool_byyear_bygrade.csv", stringsAsFactors = F) 10 | 11 | 
setwd("/Volumes/appdata-3/School Reports") 12 | schools = read.csv("schools.csv", stringsAsFactors = F) 13 | 14 | data1 = subset(countdata, YEAR == 2012 & ENROLLMENT > 0 & GRADE %in% c("X2","X3","X4","X5","X6","X7","X8","X10","X11","X12"), select = c("SCHOOL_CODE","GRADE","ENROLLMENT")) 15 | 16 | gradeorder = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12") 17 | 18 | data1$PREV_GRADE = lapply(data1[,"GRADE"], function(x) gradeorder[which(gradeorder == x)-1]) 19 | data1$PREV_GRADE = as.character(data1$PREV_GRADE) 20 | table(data1$GRADE, data1$PREV_GRADE) 21 | 22 | data11 = subset(countdata, YEAR == 2011) 23 | 24 | data1= merge(data1, data11[,c("SCHOOL_CODE","GRADE","ENROLLMENT")], all.x = T, by.x = c("SCHOOL_CODE","PREV_GRADE"), by.y = c("SCHOOL_CODE","GRADE"), suffixes = c(".THIS_GRADE",".PREV_GRADE")) 25 | 26 | data1$CSRATIO = data1$ENROLLMENT.THIS_GRADE/data1$ENROLLMENT.PREV_GRADE 27 | hist(data1$CSRATIO) 28 | data1 = subset(data1, is.finite(CSRATIO)) 29 | 30 | data1 = merge(data1, schools[,c("SchoolID","SchoolType")], all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID") 31 | 32 | data2 = subset(countdata, YEAR == 2011 & ENROLLMENT > 0 & GRADE %in% c("X2","X3","X4","X5","X6","X7","X8","X10","X11","X12"), select = c("SCHOOL_CODE","GRADE","ENROLLMENT")) 33 | 34 | gradeorder = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12") 35 | 36 | data2$PREV_GRADE = lapply(data2[,"GRADE"], function(x) gradeorder[which(gradeorder == x)-1]) 37 | data2$PREV_GRADE = as.character(data2$PREV_GRADE) 38 | table(data2$GRADE, data2$PREV_GRADE) 39 | 40 | data10 = subset(countdata, YEAR == 2010) 41 | 42 | data2= merge(data2, data10[,c("SCHOOL_CODE","GRADE","ENROLLMENT")], all.x = T, by.x = c("SCHOOL_CODE","PREV_GRADE"), by.y = c("SCHOOL_CODE","GRADE"), suffixes = c(".THIS_GRADE",".PREV_GRADE")) 43 | 44 | data2$CSRATIO = data2$ENROLLMENT.THIS_GRADE/data2$ENROLLMENT.PREV_GRADE 45 | hist(data2$CSRATIO) 46 | data2 = subset(data2, is.finite(CSRATIO)) 47 | 48 | 
data2 = merge(data2, schools[,c("SchoolID","SchoolType")], all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID") 49 | 50 | data = rbind(data1, data2) 51 | 52 | data = subset(data, CSRATIO <= 5) 53 | 54 | data$SCHOOL_GROUP = as.numeric(as.factor(data$SchoolType)) 55 | data$SCHOOL = data$SCHOOL_CODE 56 | data$SCHOOL = as.numeric(as.factor(data$SCHOOL)) 57 | data$GRADE_GROUP = data$GRADE 58 | data$GRADE_GROUP = as.numeric(as.factor(data$GRADE_GROUP)) 59 | 60 | 61 | #install.packages("R2jags") 62 | library("R2jags") 63 | 64 | model<-function(){ 65 | 66 | # ratios 67 | for (i in 1:n.obs){ 68 | y[i] ~ dnorm(gamma[SCHOOL[i]] + delta[GRADE[i]], tau) 69 | } 70 | 71 | tau <- pow(sigma, -2) 72 | sigma ~ dunif(0,100) 73 | 74 | # grades 75 | for (j in 1:n.grades){ 76 | delta[j] ~ dnorm(0, tau.delta) 77 | } 78 | 79 | tau.delta <- pow(sigma.delta, -2) 80 | sigma.delta ~ dunif(0,100) 81 | 82 | # schools 83 | for (k in 1:n.schools){ 84 | gamma[k] ~ dnorm(mu.gamma[SCHOOL_GROUP[k]], tau.gamma[SCHOOL_GROUP[k]]) 85 | } 86 | 87 | # groups 88 | for (l in 1:n.groups){ 89 | mu.gamma[l] ~ dnorm(mu.star, tau.mu) 90 | tau.gamma[l] <- pow(sigma.gamma[l], -2) 91 | sigma.gamma[l] ~ dunif(0, 100) 92 | } 93 | 94 | mu.star ~ dnorm(0, 1/10000) 95 | tau.mu<- pow(sigma.mu, -2) 96 | sigma.mu ~ dunif(0, 100) 97 | } 98 | 99 | schoolgroupmat = unique(data[,c("SCHOOL_CODE","SCHOOL_GROUP")]) 100 | 101 | data_jags=list(y=data$CSRATIO, SCHOOL = data$SCHOOL, GRADE = data$GRADE_GROUP, SCHOOL_GROUP = schoolgroupmat$SCHOOL_GROUP, n.obs = 7546, n.schools = 671, n.grades = 10, n.groups = 5) 102 | inits=function(){list("sigma" = 1, "delta" = c(rep(0, length(unique(data$GRADE)))), "sigma.delta" = 1, "mu.star" = 1, "sigma.gamma" = c(1,1,1,1,1), "sigma.mu" = 1)} 103 | parameters.to.save<-c("delta","sigma", "sigma.delta", "mu.gamma","mu.star", "sigma.gamma", "sigma.mu","gamma") 104 | 105 | sim = jags(data_jags, inits, parameters.to.save, model.file=model, n.chains=3, n.iter=10000) 106 | print(sim) 107 | 108 | tracemat = 
sim$BUGSoutput$sims.matrix 109 | tracemat = as.data.frame(tracemat) 110 | names(tracemat) 111 | 112 | # predictions and credible intervals for 2013/2012 113 | test = unique(data[,c("SCHOOL_CODE","GRADE","SCHOOL","SCHOOL_GROUP","GRADE_GROUP")]) 114 | test$postpredmean = NA 115 | test$postpred025 = NA 116 | test$postpred975 = NA 117 | 118 | for (i in 1:nrow(test)){ 119 | gammatemp = tracemat[,paste('gamma[',test[i,"SCHOOL"],']', sep = "")] 120 | deltatemp = tracemat[,paste('delta[',test[i,"GRADE_GROUP"],']', sep = "")] 121 | sigmatemp = tracemat$sigma 122 | 123 | ytemp = rnorm(3000, gammatemp + deltatemp, sd = sqrt(sigmatemp)) 124 | test[i,"postpredmean"] = mean(ytemp) 125 | test[i,"postpred025"] = quantile(ytemp, .025) 126 | test[i,"postpred975"] = quantile(ytemp,.975) 127 | } 128 | 129 | 130 | data3 = subset(countdata, YEAR == 2013 & ENROLLMENT > 0 & GRADE %in% c("X2","X3","X4","X5","X6","X7","X8","X10","X11","X12"), select = c("SCHOOL_CODE","GRADE","ENROLLMENT")) 131 | 132 | gradeorder = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12") 133 | 134 | data3$PREV_GRADE = lapply(data3[,"GRADE"], function(x) gradeorder[which(gradeorder == x)-1]) 135 | data3$PREV_GRADE = as.character(data3$PREV_GRADE) 136 | table(data3$GRADE, data3$PREV_GRADE) 137 | 138 | data12 = subset(countdata, YEAR == 2012) 139 | 140 | data3= merge(data3, data12[,c("SCHOOL_CODE","GRADE","ENROLLMENT")], all.x = T, by.x = c("SCHOOL_CODE","PREV_GRADE"), by.y = c("SCHOOL_CODE","GRADE"), suffixes = c(".THIS_GRADE",".PREV_GRADE")) 141 | 142 | data3$CSRATIO = data3$ENROLLMENT.THIS_GRADE/data3$ENROLLMENT.PREV_GRADE 143 | hist(data3$CSRATIO) 144 | data3 = subset(data3, is.finite(CSRATIO)) 145 | 146 | data3 = merge(data3, schools[,c("SchoolID","SchoolType")], all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID") 147 | 148 | test = merge(test, data3[,c("SCHOOL_CODE","GRADE","CSRATIO")], all.x = T, by.x = c("SCHOOL_CODE","GRADE"), by.y = c("SCHOOL_CODE","GRADE")) 149 | 150 | test = subset(test, 
CSRATIO <= 2) 151 | plot(test$CSRATIO, test$postpredmean) 152 | 153 | # trace plots of the group-level means; colour mapping must match the legend below (group k = mu.gamma[k], alphabetical SchoolType order) 154 | plot(1:3000, tracemat$'mu.gamma[1]', type ="l", col = colorlist[1]) # fix: was mu.gamma[5], which is already drawn below — group 1 (Alternative) was never plotted and the first curve had no colour 155 | lines(1:3000, tracemat$'mu.gamma[3]', col = colorlist[4]) 156 | lines(1:3000, tracemat$'mu.gamma[2]', col = colorlist[3]) 157 | lines(1:3000, tracemat$'mu.gamma[4]', col = colorlist[5]) 158 | lines(1:3000, tracemat$'mu.gamma[5]', col = colorlist[7]) 159 | 160 | unique(data[,c("SchoolType","SCHOOL_GROUP")]) 161 | legend("topright",c("Alternative","Charter","Contract","Performance","Regular"), col = colorlist[c(1,3,4,5,7)], pch = 16) 162 | 163 | plot(1:3000, tracemat$'delta[1]', type ="l",ylim=c(-.2,.2), col = colorlist[1]) 164 | lines(1:3000, tracemat$'delta[2]', col = colorlist[2]) 165 | lines(1:3000, tracemat$'delta[3]', col = colorlist[3]) 166 | lines(1:3000, tracemat$'delta[4]', col = colorlist[4]) 167 | lines(1:3000, tracemat$'delta[5]', col = colorlist[5]) 168 | lines(1:3000, tracemat$'delta[6]', col = colorlist[6]) 169 | lines(1:3000, tracemat$'delta[7]', col = colorlist[7]) 170 | lines(1:3000, tracemat$'delta[8]', col = colorlist[8]) 171 | lines(1:3000, tracemat$'delta[9]', col = colorlist[9]) 172 | lines(1:3000, tracemat$'delta[10]', col = colorlist[10]) 173 | 174 | legend("topright",c("2nd grade", "3rd grade", "4th grade", "5th grade", "6th grade", "7th grade", "8th grade", "10th grade", "11th grade", "12th grade"), col = colorlist[c(4,5,6,7,8,9,10,1,2,3)], pch = 16) 175 | 176 | unique(data[,c("GRADE","GRADE_GROUP")]) 177 | colorlist = brewer.pal(10, "Set3") # NOTE(review): requires library(RColorBrewer); colorlist is used by the plots ABOVE, so run this line (and load RColorBrewer) before them 178 | -------------------------------------------------------------------------------- /Data_Analysis/CPS_gradebygrade_modelingexploration.R: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------- # 2 | # this code shows many of our exploratory models # 3 | # for the top-down model approach # 4 | # -------------------------------------------------- # 5 | 6 | # run CPS_gradebygrade_script.R 7 | 
# run CPS_gradebygrade_dataprep.R 8 | 9 | datafinal_train = datafinal12 10 | datafinal = datafinal13 11 | 12 | # fill in missing values 13 | for (j in c(7,8,9,11,25,26)){ 14 | datafinal_train[is.na(datafinal_train[,j]),j] = 0 15 | datafinal[is.na(datafinal[,j]),j] = 0 16 | } 17 | 18 | for (j in c(12:20,24)){ 19 | datafinal_train[is.na(datafinal_train[,j]),j] = 'Unknown' 20 | datafinal[is.na(datafinal[,j]),j] = 'Unknown' 21 | } 22 | 23 | # make sure there's no more missing values 24 | apply(datafinal_train, 2, function(x) sum(is.na(x))) 25 | apply(datafinal, 2, function(x) sum(is.na(x))) 26 | 27 | # all schools and grades: rollover 2012 to predict 2013 28 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "rollover", modelformula = NA, grade_overall = 'gradebygrade') 29 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) # 52% 30 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) # 74% 31 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) # 87% 32 | hist(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T), 20) 33 | 34 | # high school only 35 | test = subset(test, (enrollment_to_predict.X9 >= 10)|(enrollment_to_predict.X10 >= 10)|(enrollment_to_predict.X11 >= 10)|(enrollment_to_predict.X12 >= 10)) 36 | test = test[,c("SCHOOL_CODE","enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12","predictions X9","predictions X10","predictions X11","predictions X12")] 37 | 38 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 25) 39 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - 
rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 50) 40 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 100) 41 | 42 | 43 | # Basic linear regression: train on 2012, test on 2013 44 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection + NEW, grade_overall = 'gradebygrade') 45 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) # 50% 46 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) # 72% 47 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) # 89% 48 | hist(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T), 20) 49 | 50 | # high school only 51 | test = subset(test, (enrollment_to_predict.X9 >= 10)|(enrollment_to_predict.X10 >= 10)|(enrollment_to_predict.X11 >= 10)|(enrollment_to_predict.X12 >= 10)) 52 | test = test[,c("SCHOOL_CODE","enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12","predictions X9","predictions X10","predictions X11","predictions X12")] 53 | 54 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 25) 55 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 50) 56 | 
mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 100) 57 | 58 | 59 | # Linear regression with features: train on 2012, test on 2013 60 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection*Rating + (SCHOOL_TYPE == 'Regular') + NEW, grade_overall = 'gradebygrade') 61 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 62 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 63 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) 64 | 65 | # plot by grade 66 | hist(test[,"predictions K"] - test[,"enrollment_to_predict.K"],20) 67 | hist(test[,"predictions X1"] - test[,"enrollment_to_predict.X1"],20) 68 | hist(test[,"predictions X2"] - test[,"enrollment_to_predict.X2"],20) 69 | hist(test[,"predictions X3"] - test[,"enrollment_to_predict.X3"],20) 70 | hist(test[,"predictions X4"] - test[,"enrollment_to_predict.X4"],20) 71 | hist(test[,"predictions X5"] - test[,"enrollment_to_predict.X5"],20) 72 | hist(test[,"predictions X6"] - test[,"enrollment_to_predict.X6"],20) 73 | hist(test[,"predictions X7"] - test[,"enrollment_to_predict.X7"],20) 74 | hist(test[,"predictions X8"] - test[,"enrollment_to_predict.X8"],20) 75 | hist(test[,"predictions X9"] - test[,"enrollment_to_predict.X9"],20) 76 | hist(test[,"predictions X10"] - test[,"enrollment_to_predict.X10"],20) 77 | hist(test[,"predictions X11"] - test[,"enrollment_to_predict.X11"],20) 78 | hist(test[,"predictions X12"] - test[,"enrollment_to_predict.X12"],20) 79 | hist(test[,"predictions LRE"] - test[,"enrollment_to_predict.LRE"],20) 80 | 81 | # 9th grade 82 | plot(test[,"enrollment_to_predict.X9"], test[,"predictions 
X9"],xlim=c(0,1500),ylim=c(0,1500),pch=16, xlab = "Actual 9th grade 2012", ylab = "Predicted 9th grade 2012") 83 | segments(x0 = test[,"enrollment_to_predict.X9"], y0 = test[,"predictions X9"] - 1.96*sqrt(test[,"var X9"]), x1 = test[,"enrollment_to_predict.X9"], y1 = test[,"predictions X9"] + 1.96*sqrt(test[,"var X9"])) 84 | segments(x0=0,y0=0,x1=1500,y1=1500, col = 2) 85 | mean((test[,"enrollment_to_predict.X9"] > test[,"predictions X9"] - 1.96*sqrt(test[,"var X9"]))&(test[,"enrollment_to_predict.X9"] < test[,"predictions X9"] + 1.96*sqrt(test[,"var X9"])), na.rm = T) 86 | 87 | # 2nd grade 88 | plot(test[,"enrollment_to_predict.X2"], test[,"predictions X2"],xlim=c(0,300),ylim=c(0,300),pch=16, xlab = "Actual 2nd grade 2012", ylab = "Predicted 2nd grade 2012") 89 | segments(x0 = test[,"enrollment_to_predict.X2"], y0 = test[,"predictions X2"] - 1.96*sqrt(test[,"var X2"]), x1 = test[,"enrollment_to_predict.X2"], y1 = test[,"predictions X2"] + 1.96*sqrt(test[,"var X2"])) 90 | segments(x0=0,y0=0,x1=1300,y1=1300, col = 2) 91 | mean((test[,"enrollment_to_predict.X2"] > test[,"predictions X2"] - 1.96*sqrt(test[,"var X2"]))&(test[,"enrollment_to_predict.X2"] < test[,"predictions X2"] + 1.96*sqrt(test[,"var X2"])), na.rm = T) 92 | 93 | 94 | # Random forest with features: train on 2012, test on 2013 95 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "randomForest", modelformula = enrollment_to_predict ~ projection + NEW + Rating + SCHOOL_TYPE, grade_overall = 'gradebygrade') 96 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 97 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 98 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) 99 | 100 | 101 | # Experimenting with training data 102 | 103 | # train only with schools that change less than a certain number of students 104 | datafinal_train_stable= 
datafinal_train[(apply(datafinal_train[,c("enrollment_to_predict","projection")], 1, function(x) max(x) - min(x) <= 25))|(datafinal_train$projection == 0),] 105 | 106 | test = predictschool(data_test = datafinal, data_train = datafinal_train_stable, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection*Rating + NEW + (SCHOOL_TYPE == 'Regular') + newnearby, grade_overall = 'gradebygrade') 107 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 108 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 109 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) 110 | 111 | # exclude new schools 112 | datafinal_train_nonew = datafinal_train[datafinal_train$projection > 0,] 113 | datafinal_nonew = datafinal[datafinal$projection > 0,] 114 | 115 | # neighborhood enrollment 116 | setwd("/Volumes/appdata-3/School reports") 117 | schools = read.csv("schools.csv", stringsAsFactors = F) 118 | neighborhoodschools = as.matrix(subset(schools, SchoolStyle == 'Neighborhood Enrollment', select = "SchoolID")) 119 | 120 | datafinal_train_temp = datafinal_train[(datafinal_train$projection > 0) & (datafinal_train$SCHOOL_CODE %in% neighborhoodschools),] 121 | datafinal_temp = datafinal[(datafinal$projection > 0) & (datafinal$SCHOOL_CODE %in% neighborhoodschools),] 122 | 123 | test = predictschool(data_test = datafinal_temp, data_train = datafinal_train_temp, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection*(Rating + Teacher + Safety) + newnearby + GRADE, grade_overall = 'overall') 124 | 125 | schoolslist = unique(datafinal_temp$SCHOOL_CODE) 126 | test = test[test$SCHOOL_CODE %in% schoolslist,] 127 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 128 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 129 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - 
rowSums(test[,2:15],na.rm=T)) <= 100) 130 | -------------------------------------------------------------------------------- /Data_Analysis/CPS_gradebygrade_script.R: -------------------------------------------------------------------------------- 1 | # ================================== # 2 | # grade-by-grade modeling script # 3 | # ================================== # 4 | 5 | 6 | predictschool = function(data_test, data_train, algorithm_of_choice, modelformula, grade_overall){ 7 | # data should have columns named "SCHOOL_CODE", "enrollment_to_predict", "projection" and "GRADE" 8 | # all other columns are main effect predictors 9 | # algorithm_of_choice can be "lm","randomForest","rollover" 10 | # gradebygrade or overall 11 | 12 | # 1. turn character features into factors 13 | for (j in 1:ncol(data_train)){ 14 | if (class(data_train[,j]) == "character"){ 15 | data_train[,j] = as.factor(data_train[,j]) 16 | } 17 | } 18 | 19 | for (j in 1:ncol(data_test)){ 20 | if (class(data_test[,j]) == "character"){ 21 | data_test[,j] = as.factor(data_test[,j]) 22 | } 23 | } 24 | 25 | if (grade_overall == 'gradebygrade'){ 26 | # 2. 
predict each grade 27 | predictschools=reshape(data_test[,c("SCHOOL_CODE","GRADE","enrollment_to_predict")], v.names = "enrollment_to_predict", timevar = "GRADE", idvar = "SCHOOL_CODE", direction = "wide") 28 | 29 | for (grade in levels(data_train$GRADE)){ 30 | 31 | currentnames = names(predictschools) 32 | 33 | temp = predictgrade(data_train, data_test, grade, algorithm_of_choice, modelformula) 34 | 35 | if (algorithm_of_choice == "lm"){ 36 | predictschools = merge(predictschools, temp[,c("SCHOOL_CODE", "test_predictions", "test_var")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 37 | colnames(predictschools) = c(currentnames, paste('predictions', grade), paste('var', grade)) 38 | } 39 | 40 | if ((algorithm_of_choice == "randomForest")|(algorithm_of_choice == "rollover")){ 41 | predictschools = merge(predictschools, temp[,c("SCHOOL_CODE", "test_predictions")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 42 | colnames(predictschools) = c(currentnames, paste('predictions',grade)) 43 | } 44 | } 45 | 46 | # 3. 
output is predictions for test set 47 | predictschools 48 | } 49 | 50 | else if (grade_overall == 'overall'){ 51 | predictschools=reshape(data_test[,c("SCHOOL_CODE","GRADE","enrollment_to_predict")], v.names = "enrollment_to_predict", timevar = "GRADE", idvar = "SCHOOL_CODE", direction = "wide") 52 | 53 | # predict overall 54 | overallpredicts = predictoverall(data_train, data_test, algorithm_of_choice, modelformula) 55 | 56 | # output 57 | merge(predictschools, overallpredicts, all.x = TRUE) 58 | 59 | } 60 | 61 | } 62 | 63 | 64 | 65 | predictgrade = function(data_train, data_test, grade, algorithm_of_choice, modelformula){ 66 | # data should have columns named "SCHOOL_CODE", "enrollment_to_predict", "projection" and "GRADE" 67 | 68 | data_train_grade = subset(data_train, (GRADE == grade)&(enrollment_to_predict > 0)) 69 | data_test_grade = subset(data_test, (GRADE == grade)&(enrollment_to_predict > 0)) 70 | 71 | # algorithm = lm 72 | if (algorithm_of_choice == "lm"){ 73 | lm_train = lm(modelformula, data = data_train_grade) 74 | lm_test = predict(lm_train, newdata = data_test_grade, interval = "prediction", level = .95) 75 | lm_test = as.data.frame(lm_test) 76 | data_test_grade$test_predictions = lm_test$fit 77 | data_test_grade$test_var = ((lm_test$upr - lm_test$lwr)/4)^2 78 | 79 | data_test_grade[,c("SCHOOL_CODE", "test_predictions", "test_var")] # output 80 | } 81 | 82 | 83 | # algorithm = randomForest 84 | else if (algorithm_of_choice == "randomForest"){ 85 | library(randomForest) 86 | rf_train = randomForest(modelformula, data = data_train_grade) 87 | rf_test = predict(rf_train, newdata = data_test_grade) 88 | data_test_grade$test_predictions = rf_test 89 | 90 | data_test_grade[,c("SCHOOL_CODE", "test_predictions")] # output 91 | } 92 | 93 | # algorithm = rollover 94 | else if (algorithm_of_choice == "rollover"){ 95 | data_test_grade$test_predictions = data_test_grade$projection 96 | grademean = mean(data_test_grade[(data_test_grade$test_predictions > 
0),"test_predictions"]) 97 | data_test_grade[(data_test_grade$test_predictions == 0),"test_predictions"] = grademean 98 | 99 | data_test_grade[,c("SCHOOL_CODE", "test_predictions")] # output 100 | } 101 | 102 | 103 | } 104 | 105 | 106 | 107 | predictoverall = function(data_train, data_test, algorithm_of_choice, modelformula){ 108 | # data should have columns named "SCHOOL_CODE", "enrollment_to_predict", "projection" and "GRADE" 109 | 110 | data_train = subset(data_train, (enrollment_to_predict > 0)) 111 | data_test = subset(data_test, (enrollment_to_predict > 0)) 112 | 113 | # algorithm = lm 114 | if (algorithm_of_choice == "lm"){ 115 | lm_train = lm(modelformula, data = data_train) 116 | lm_test = predict(lm_train, newdata = data_test, interval = "prediction", level = .95) 117 | lm_test = as.data.frame(lm_test) 118 | data_test$predictions = lm_test$fit 119 | data_test$var = ((lm_test$upr - lm_test$lwr)/4)^2 120 | 121 | outputtemp = data_test[,c("SCHOOL_CODE", "GRADE", "predictions", "var")] # output 122 | output = reshape(outputtemp, v.names = c("predictions", "var"), timevar = "GRADE", idvar = "SCHOOL_CODE", direction = 'wide',sep=" ") 123 | output 124 | } 125 | 126 | # algorithm = randomForest 127 | else if (algorithm_of_choice == "randomForest"){ 128 | library(randomForest) 129 | rf_train = randomForest(modelformula, data = data_train) 130 | rf_test = predict(rf_train, newdata = data_test) 131 | data_test$predictions = rf_test 132 | 133 | outputtemp = data_test[,c("SCHOOL_CODE", "GRADE", "predictions")] # output 134 | output = reshape(outputtemp, v.names = "predictions", timevar = "GRADE", idvar = "SCHOOL_CODE", direction = 'wide',sep=" ") 135 | output 136 | } 137 | 138 | 139 | } 140 | 141 | -------------------------------------------------------------------------------- /Data_Analysis/DeepDive1EDA_Counts.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "First Deep Dive EDA on Enrollment Counts" 3 | 
author: "Andrew Landgraf" 4 | date: "Tuesday, June 24, 2014" 5 | output: html_document 6 | --- 7 | 8 | Load packages. You need to be connected to the CPS VPN. 9 | ```{r} 10 | library(lubridate) 11 | library(ggplot2) 12 | library(reshape2) 13 | library(knitr) 14 | opts_chunk$set(warning = FALSE) 15 | # setwd("//admin/appdata/DataWarehouse/DSSG/Count Data/") 16 | ``` 17 | 18 | Read count data from the CPS folder. This data has the number of students enrolled in each school and in each grade on the first day of each month. I also format the data a little. 19 | ```{r} 20 | counts = read.csv("//admin/appdata/DataWarehouse/DSSG/Count Data/Enrollment_Counts_Month_Current.csv", stringsAsFactors = FALSE) 21 | counts$Date = as.Date(counts$Date) 22 | counts$STUDENT_ANNUAL_GRADE_CODE = factor(counts$STUDENT_ANNUAL_GRADE_CODE, 23 | c('PE','PK','K','1','2','3','4','5','6','7','8','9','10','11','12')) 24 | 25 | counts$Year = as.numeric(substring(counts$SCHOOL_YEAR,6,9)) 26 | counts$Month = month(counts$Date) 27 | names(counts)[names(counts)=="STUDENT_ANNUAL_GRADE_CODE"] = "Grade" 28 | ``` 29 | 30 | Read the school report data. This was downloaded from the Chicago data portal. Concentrating on just student performance and overall school rating variables. 
31 | ```{r} 32 | report_elem = read.csv("//admin/appdata/DataWarehouse/DSSG/School Reports/CPS_Elementary_Report_20122013.csv") 33 | report_high = read.csv("//admin/appdata/DataWarehouse/DSSG/School Reports/CPS_HighSchool_Report_20122013.csv") 34 | report_elem$Student.Performance.Level = factor(report_elem$Student.Performance.Level, # fix: was levels()<-, which renames the existing alphabetical levels in positional order and mislabels the data; factor(x, levels) only reorders, matching the report_high handling below 35 | c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average")) 36 | report_high$Student.Performance.Level = factor(report_high$Student.Performance.Level, 37 | c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average")) 38 | report_elem$Overall.Rating = factor(report_elem$Overall.Rating, 39 | levels = c("Level 1", "Level 2", "Level 3")) 40 | report_high$Overall.Rating = factor(report_high$Overall.Rating, 41 | levels = c("Level 1", "Level 2", "Level 3")) 42 | 43 | school_reports <- rbind(subset(report_elem, 44 | select = c(School.ID, Student.Performance.Level, Overall.Rating)), 45 | subset(report_high, 46 | select = c(School.ID, Student.Performance.Level, Overall.Rating))) 47 | ``` 48 | 49 | The function below is used for adding grades. You can logically add one grade to `K` and it knows it is `1`. 50 | ```{r} 51 | advance_grade <- function(current_grade, years = 1) { # shifts grade labels by `years` positions in levels(counts$Grade); grades shifted off either end come back as NA 52 | locs = apply(matrix(current_grade),1,function(x) which(x == levels(counts$Grade))) + years 53 | is.na(locs[locs<1 | locs>length(levels(counts$Grade))]) <- TRUE 54 | levels(counts$Grade)[locs] 55 | } 56 | ``` 57 | 58 | Below, I split the data into five snapshots. The first is the enrollments that we want to predict. 59 | ```{r} 60 | pred_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 61 | ``` 62 | 63 | Last year 20th day enrollments. 
64 | ```{r} 65 | prev_20_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 66 | prev_20_enrolls$Year = prev_20_enrolls$Year + 1 67 | names(prev_20_enrolls)[names(prev_20_enrolls)=="Enrollments"] <- "Prev_20_Enrollments" 68 | ``` 69 | 70 | Last year 20th day enrollments for previous grade. 71 | ```{r} 72 | prev_20_enrolls_grade = prev_20_enrolls 73 | prev_20_enrolls_grade$Grade = advance_grade(prev_20_enrolls_grade$Grade, 1) 74 | names(prev_20_enrolls_grade)[names(prev_20_enrolls_grade)=="Prev_20_Enrollments"] <- "Prev_Grade_20_Enrollments" 75 | ``` 76 | 77 | Last year end of year enrollments. 78 | ```{r} 79 | prev_end_enrolls = subset(counts, Month == 6, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 80 | prev_end_enrolls$Year = prev_end_enrolls$Year + 1 81 | names(prev_end_enrolls)[names(prev_end_enrolls)=="Enrollments"] <- "Prev_End_Enrollments" 82 | ``` 83 | 84 | Last year end of year enrollments for previous grade. 85 | ```{r} 86 | prev_end_enrolls_grade = prev_end_enrolls 87 | prev_end_enrolls_grade$Grade = advance_grade(prev_end_enrolls_grade$Grade, 1) 88 | names(prev_end_enrolls_grade)[names(prev_end_enrolls_grade)=="Prev_End_Enrollments"] <- "Prev_Grade_End_Enrollments" 89 | ``` 90 | 91 | Combine the five snapshots together so that we can use previous snapshots to predict current enrollments. 92 | ```{r} 93 | enrolls <- merge(pred_enrolls, prev_20_enrolls, all.x=TRUE) 94 | enrolls <- merge(enrolls, prev_20_enrolls_grade, all.x=TRUE) 95 | enrolls <- merge(enrolls, prev_end_enrolls, all.x=TRUE) 96 | enrolls <- merge(enrolls, prev_end_enrolls_grade, all.x=TRUE) 97 | ``` 98 | 99 | I removed 2009 since there is no prior history and 2014 for testing purposes. 
100 | ```{r} 101 | enrolls_test <- subset(enrolls, Year == 2014) 102 | enrolls <- subset(enrolls, Year >= 2009 & Year < 2014) 103 | ``` 104 | 105 | Set NAs to 0 to replicate real life unknowns, except when there is no previous history for school. I am not doing this right now so that we can only assess good data situations. 106 | ```{r} 107 | # enrolls[is.na(enrolls)] <- 0 108 | # enrolls <- merge(enrolls, school_info, all.x=TRUE) 109 | # set.all.to.na <- which(enrolls$Year == enrolls$Min_Year) 110 | # set.grade.to.na <- which(enrolls$Grade == enrolls$Min_Grade) 111 | # is.na(enrolls[set.all.to.na,7:10]) <- TRUE 112 | # is.na(enrolls[set.grade.to.na,c(8,10)]) <- TRUE 113 | ``` 114 | 115 | Add in some school information from the school reports. 116 | ```{r} 117 | enrolls <- merge(enrolls, school_reports, all.x=TRUE, 118 | by.x = "SCHOOL_CODE", by.y = "School.ID") 119 | ``` 120 | 121 | Look at how enrollments change throughout the year to see if it is predictive of future enrollment changes. 122 | ```{r} 123 | enrolls$Prev_Change = enrolls$Prev_End_Enrollments - enrolls$Prev_20_Enrollments 124 | enrolls$Prev_Grade_Change = enrolls$Prev_Grade_End_Enrollments - enrolls$Prev_Grade_20_Enrollments 125 | ``` 126 | 127 | 128 | Function to calculate correlation of 4 previous year variables with enrollments for a grade. 129 | ```{r} 130 | compare_cor <- function(grade) { 131 | # convert grade to a number with 0=Kindergarten, 1=first grade, etc. 132 | c(Grade = which(grade==levels(counts$Grade))-3, cor(subset(enrolls, Grade == grade)[,6:10], use="pair")[-1,1]) 133 | } 134 | correls = t(sapply(unique(enrolls$Grade), compare_cor)) 135 | correls = as.data.frame(correls) 136 | correls$Grade = factor(advance_grade("K",correls$Grade), levels = levels(counts$Grade)) 137 | ``` 138 | 139 | Plots for the presentation. Using last year's fifth grade enrollment to estimate this year's fifth grade enrollment. There is a fairly strong correlation year over year. 
140 | ```{r} 141 | ggplot(subset(enrolls, Grade == "5"), aes(Prev_End_Enrollments, Enrollments)) + 142 | geom_point(alpha = 0.25) + geom_abline() + geom_smooth(method = lm, size = 1) + 143 | labs(x= "Previous Year 5th Grade Enrollments", y= "This Year 5th Grade Enrollments", title = "5th Grade") 144 | ``` 145 | 146 | In contrast, if we use previous year 4th grade enrollment for the same school, there is a stronger correlation. 147 | ```{r} 148 | ggplot(subset(enrolls, Grade == "5"), aes(Prev_Grade_End_Enrollments, Enrollments)) + 149 | geom_point(alpha = 0.25) + geom_abline() + geom_smooth(method = lm, size = 1) + 150 | labs(x= "Previous Year 4th Grade Enrollments", y= "This Year 5th Grade Enrollments", title = "5th Grade") 151 | ``` 152 | 153 | For eleventh grade, we can look at the same two plots. There is no longer a big difference between using eleventh and tenth grade. 154 | ```{r} 155 | ggplot(subset(enrolls, Grade == "11" & SCHOOL_CODE != "609686"), aes(Prev_End_Enrollments, Enrollments)) + 156 | geom_point(alpha = 0.75) + geom_abline() + geom_smooth(method = lm, size = 1) + 157 | labs(x= "Previous Year 11th Grade Enrollments", y= "This Year 11th Grade Enrollments", title = "11th Grade") 158 | 159 | ggplot(subset(enrolls, Grade == "11" & SCHOOL_CODE != "609686"), aes(Prev_Grade_End_Enrollments, Enrollments)) + 160 | geom_point(alpha = 0.75) + geom_abline() + geom_smooth(method = lm, size = 1) + 161 | labs(x= "Previous Year 10th Grade Enrollments", y= "This Year 11th Grade Enrollments", title = "11th Grade") 162 | ``` 163 | 164 | We can compare the correlations for all grades. Grade school does better than high school for using the same cohort. Also, kindergarten predicts first grade better than first grade, which is contrary to CPS's intuition. 
165 | ```{r} 166 | correls_pres = correls 167 | names(correls_pres)[4:5] <- c("Same Grade", "Previous Grade") 168 | is.na(correls_pres[correls_pres$Grade == "9", 5]) <- TRUE 169 | ggplot(melt(subset(correls_pres[,c(1,4:5)], !(Grade %in% c("PE", "PK"))), id = 1), aes(Grade, value, colour = variable , group = variable)) + 170 | geom_point() + geom_line() + labs(y = "Correlation with Next Year Enrollment", colour = "Method") 171 | ``` 172 | 173 | We broke up each high school by their rating. The best schools are easier to predict because students do not leave. 174 | ```{r} 175 | ggplot(subset(enrolls, Grade %in% paste(9:12)), aes(Prev_Grade_End_Enrollments, Enrollments)) + 176 | geom_point() + geom_abline() + geom_smooth(method = lm) + facet_grid(Overall.Rating ~ Grade) + 177 | labs(x = "Last Year Previous Grade Enrollments", y = "This Year Enrollments") 178 | ``` 179 | 180 | That is not necessarily the same for elementary school. This may have something to do with students not being old enough to drop out of school yet. 181 | ```{r} 182 | ggplot(subset(enrolls, Grade %in% c("K",paste(1:5))), aes(Prev_Grade_End_Enrollments, Enrollments)) + 183 | geom_point(alpha = .5) + geom_abline() + geom_smooth(method = lm, size = 1) + facet_grid(Overall.Rating ~ Grade)+ 184 | labs(x = "Last Year Previous Grade Enrollments", y = "This Year Enrollments") 185 | ``` 186 | 187 | 188 | Other exploratory analysis. 
189 | ```{r} 190 | ggplot(subset(enrolls, Grade != "PE"), aes(Prev_Grade_End_Enrollments, Enrollments)) + 191 | geom_point() + geom_abline() + geom_smooth(method = lm) + facet_wrap(~ Grade, scales="free") 192 | ``` 193 | 194 | ```{r} 195 | ggplot(subset(enrolls, !(Grade %in% c("PE","PK",paste(9:12)))), aes(Prev_Grade_Change, Enrollments - Prev_Grade_20_Enrollments)) + 196 | geom_point(alpha = .25) + geom_smooth(method = lm) + facet_grid(Overall.Rating ~ Grade) 197 | 198 | ggplot(subset(enrolls, Grade %in% paste(9:12)), aes(Prev_Grade_Change, Enrollments - Prev_Grade_20_Enrollments)) + 199 | geom_point(alpha = .25) + geom_smooth(method = lm) + facet_grid(Overall.Rating ~ Grade) 200 | ``` 201 | -------------------------------------------------------------------------------- /Data_Analysis/README.md: -------------------------------------------------------------------------------- 1 | ## Data Analysis for CPS Predicting Student Enrollment ## 2 | 3 | This folder contains our work on top-down school-level modeling and exploratory data analysis. 4 | 5 | ### Top-down school-level model ### 6 | 7 | The code `CPS_gradebygrade_dataprep.R` will create a data file with all of the features we considered in the top-down model. 8 | The code `CPS_gradebygrade_script.R` creates functions that we used to try out different regression models. 9 | The code `CPS_gradebygrade_modelingexploration.R` uses the above two scripts and goes through many different models. 10 | 11 | The code `topdown_FINAL.R` is the final code where we focused on modeling catchment high school enrollment. 12 | 13 | ### Exploratory data analysis ### 14 | 15 | The code `prepare_data_and_shapefile_for_tableau.R` prepares the data for the Tableau workbook that shows where students from one catchment area go to school in 2013. 16 | 17 | The code `map_neighborhood_catchment.R` plots the percent of students who go to their catchment area. 
18 | 19 | The code `grade9_transition_matrix.R` produces our plots of where 9th graders live versus where they go to high school. 20 | 21 | `investigate_9thgrade.R` and `investigate_juarez_change.R` are files we used to explore where 9th grade 22 | students go to school and where they live. In particular we look at how many students at certain schools 23 | come from within their catchment areas or come from outside CPS. 24 | 25 | The scripts `DeepDive1EDA_Counts.Rmd`, `exploratory_data_analysis.R`, `explore_8_to_9_transition.R`, 26 | `explore_enroll_totals_from_website.Rmd` 27 | are some of our initial exploratory files. 28 | 29 | The code `CPS_JAGS.R` documents our exploratory hierarchical modeling of cohort survival ratios. 30 | Through R we use the program called JAGS, which stands for Just Another Gibbs Sampler. 31 | 32 | `dataprep_geocodeschools.R` is where we find the latitude and longitude of each CPS school using the ggmap package in R. 33 | 34 | `history_evaluation.R` is the visualization of the historical error of CPS. 
35 | 36 | -------------------------------------------------------------------------------- /Data_Analysis/dataprep_geocodeschools.R: -------------------------------------------------------------------------------- 1 | # =================================== # 2 | # geocode + prep data for tableau # 3 | # =================================== # 4 | 5 | # geocode school locations from street address --> lat long 6 | library("ggmap") 7 | 8 | setwd("/Volumes/appdata-3/School Reports") 9 | data = read.csv("CSVfileforR_excerptof_FY14SBBALLOCATIONS_FINALSUMMARY_asof_032414.csv", stringsAsFactors = F) 10 | school_locations = read.csv("CPS_SchoolLocations_SY13_14.csv", stringsAsFactors = F) 11 | 12 | schoolmat = matrix(nrow = 623, ncol = 6) 13 | schoolmat = as.data.frame(schoolmat) 14 | colnames(schoolmat) = c("School ID", "Lon", "Lat", "Actual Total", "Projected Total", "Difference") 15 | 16 | for (i in 1:623){ 17 | schoolmat[i,1] = data[i,1] # school ID 18 | 19 | findlocation = school_locations[(school_locations$SchoolID == data[i,1]),] 20 | geocodeschool = geocode(findlocation$Address) 21 | schoolmat[i,2] = geocodeschool$lon # longitude 22 | schoolmat[i,3] = geocodeschool$lat # latitude 23 | schoolmat[i,4] = data[i,22] # actual total 24 | schoolmat[i,5] = data[i,13] # projected total 25 | schoolmat[i,6] = schoolmat[i,5] - schoolmat[i,4] # projected - actual 26 | } 27 | 28 | # add more variables 29 | schoolmat_extra = matrix(nrow = 623, ncol = 5) 30 | schoolmat_extra = as.data.frame(schoolmat_extra) 31 | colnames(schoolmat_extra) = c("School ID", "Category", "Type", "Geo network", "Geographic region") 32 | 33 | for (i in 1:623){ 34 | schoolmat_extra[i,1] = data[i,1] # school ID 35 | 36 | findlocation = school_locations[(school_locations$SchoolID == data[i,1]),] 37 | if (nrow(findlocation) > 0){ 38 | schoolmat_extra[i,2] = findlocation$SchoolCate 39 | schoolmat_extra[i,3] = findlocation$SchoolType 40 | schoolmat_extra[i,4] = findlocation$Geographic 41 | schoolmat_extra[i,5] = 
findlocation$Geograph_1 42 | } 43 | } 44 | 45 | write.csv(schoolmat, file = "school_projections_geocode_fall13.csv") 46 | write.csv(schoolmat_extra, file = "school_projections_geocode_fall13_EXTRA.csv") -------------------------------------------------------------------------------- /Data_Analysis/exploratory_data_analysis.R: -------------------------------------------------------------------------------- 1 | # ---------------------- # 2 | # Exploring CPS data # 3 | # ---------------------- # 4 | 5 | # ======================================================= # 6 | # exploring schools that are far off from projections # 7 | # ======================================================= # 8 | 9 | # greg's data 10 | setwd("/Volumes/appdata-3/School Reports") 11 | greg_data = read.csv("CSVfileforR_excerptof_FY14SBBALLOCATIONS_FINALSUMMARY_asof_032414.csv", header = TRUE) 12 | 13 | # how far off are projections in aggregate for different types of schools? 14 | greg_data$dif = greg_data$Projected.Total - greg_data$Actual.Total 15 | 16 | # look at elementary schools off by more than 50 17 | elem_50off = greg_data[(abs(greg_data$Projected.Total - greg_data$Actual.Total) > 50)&(greg_data$School.Type == "Elementary"),] 18 | elem_50off$dif = elem_50off$Projected.Total - elem_50off$Actual.Total 19 | elem_50off$propbudget = elem_50off$dif/elem_50off$Projected.Total 20 | elem_50off[order(elem_50off$propbudget),c(1,2,13,22,23,24)] 21 | 22 | # high schools off by more than 50 23 | hs_50off = greg_data[(abs(greg_data$Projected.Total - greg_data$Actual.Total) > 50)&(greg_data$School.Type == "High School"),] 24 | hs_50off$dif = hs_50off$Projected.Total - hs_50off$Actual.Total 25 | hs_50off$propbudget = hs_50off$dif/hs_50off$Projected.Total 26 | hs_50off[order(hs_50off$propbudget),c(1,2,13,22,23,24)] 27 | 28 | # charter schools off by more than 50 29 | charter_50off = greg_data[(abs(greg_data$Projected.Total - greg_data$Actual.Total) > 50)&(greg_data$School.Type == "Charter"),] 30 | 
charter_50off$dif = charter_50off$Projected.Total - charter_50off$Actual.Total 31 | charter_50off$propbudget = charter_50off$dif/charter_50off$Projected.Total 32 | charter_50off[order(charter_50off$propbudget),c(1,2,13,22,23,24)] 33 | 34 | 35 | # ============================================================ # 36 | # exploring 2012 enrollments (less extreme year than 2013) # 37 | # ============================================================ # 38 | 39 | library(lubridate) 40 | library(plyr) 41 | 42 | # read count data 43 | setwd("/Volumes/appdata-3/Count Data") 44 | 45 | counts = read.csv("Enrollment_Counts_Month_Current.csv", stringsAsFactors = FALSE) 46 | counts$Date = as.Date(counts$Date) 47 | counts$STUDENT_ANNUAL_GRADE_CODE = factor(counts$STUDENT_ANNUAL_GRADE_CODE, c('PE','PK','K','1','2','3','4','5','6','7','8','9','10','11','12')) 48 | 49 | counts$Year = as.numeric(substring(counts$SCHOOL_YEAR,6,9)) 50 | counts$Month = month(counts$Date) 51 | names(counts)[names(counts)=="STUDENT_ANNUAL_GRADE_CODE"] = "Grade" 52 | 53 | # read school data 54 | setwd("/Volumes/appdata-3/School Reports") 55 | 56 | report_elem = read.csv("CPS_Elementary_Report_20122013.csv") 57 | report_high = read.csv("CPS_HighSchool_Report_20122013.csv") 58 | levels(report_elem$Student.Performance.Level) <- c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average") 59 | report_high$Student.Performance.Level = factor(report_high$Student.Performance.Level, c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average")) 60 | report_elem$Overall.Rating = factor(report_elem$Overall.Rating, levels = c("Level 1", "Level 2", "Level 3")) 61 | report_high$Overall.Rating = factor(report_high$Overall.Rating, levels = c("Level 1", "Level 2", "Level 3")) 62 | 63 | school_reports <- rbind(subset(report_elem, select = c(School.ID, Student.Performance.Level, Overall.Rating)), subset(report_high, select = c(School.ID, Student.Performance.Level, 
Overall.Rating))) 64 | 65 | # function for adding grades 66 | advance_grade <- function(current_grade, years = 1) { 67 | locs = apply(matrix(current_grade),1,function(x) which(x == levels(counts$Grade))) + years 68 | is.na(locs[locs<1 | locs>length(levels(counts$Grade))]) <- TRUE 69 | levels(counts$Grade)[locs] 70 | } 71 | 72 | # Enrollments we want to predict 73 | pred_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 74 | 75 | # Last year 20th day enrollments 76 | prev_20_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 77 | prev_20_enrolls$Year = prev_20_enrolls$Year + 1 78 | names(prev_20_enrolls)[names(prev_20_enrolls)=="Enrollments"] <- "Prev_20_Enrollments" 79 | 80 | # Last year 20th day enrollments for previous grade 81 | prev_20_enrolls_grade = prev_20_enrolls 82 | prev_20_enrolls_grade$Grade = advance_grade(prev_20_enrolls_grade$Grade, 1) 83 | names(prev_20_enrolls_grade)[names(prev_20_enrolls_grade)=="Prev_20_Enrollments"] <- "Prev_Grade_20_Enrollments" 84 | 85 | # Last year end of year enrollments 86 | prev_end_enrolls = subset(counts, Month == 6, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 87 | prev_end_enrolls$Year = prev_end_enrolls$Year + 1 88 | names(prev_end_enrolls)[names(prev_end_enrolls)=="Enrollments"] <- "Prev_End_Enrollments" 89 | 90 | # Last year end of year enrollments for previous grade 91 | prev_end_enrolls_grade = prev_end_enrolls 92 | prev_end_enrolls_grade$Grade = advance_grade(prev_end_enrolls_grade$Grade, 1) 93 | names(prev_end_enrolls_grade)[names(prev_end_enrolls_grade)=="Prev_End_Enrollments"] <- "Prev_Grade_End_Enrollments" 94 | 95 | # combine to one dataset 96 | enrolls <- merge(pred_enrolls, prev_20_enrolls, all.x=TRUE) 97 | enrolls <- merge(enrolls, prev_20_enrolls_grade, all.x=TRUE) 98 | enrolls <- merge(enrolls, prev_end_enrolls, all.x=TRUE) 99 | enrolls <- 
merge(enrolls, prev_end_enrolls_grade, all.x=TRUE) 100 | 101 | # add in some school information 102 | enrolls <- merge(enrolls, school_reports, all.x=TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 103 | 104 | schools_basicdata = read.csv("schools.csv") 105 | 106 | enrolls <- merge(enrolls, schools_basicdata, all.x=TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 107 | 108 | # look at 2011 to 2012 enrollments 109 | schools_fall11to12 = ddply(enrolls, .(SCHOOL_CODE), summarize, fall2011enrollment = sum(Enrollments[Year == 2012]), fall2012enrollment = sum(Enrollments[Year == 2013])) 110 | schools_fall11to12 = subset(schools_fall11to12, (fall2011enrollment > 0)|(fall2012enrollment > 0)) 111 | 112 | offby50 = schools_fall11to12[(abs(schools_fall11to12$fall2012enrollment - schools_fall11to12$fall2011enrollment) >= 100)&(schools_fall11to12$fall2011enrollment > 0)&(schools_fall11to12$fall2012enrollment > 0),] 113 | 114 | school_info = ddply(enrolls, .(SCHOOL_CODE), summarize, Min_Year = min(Year, na.rm=TRUE), Max_Year = max(Year, na.rm=TRUE)) 115 | 116 | newschools_fall2012 = school_info[(school_info$Min_Year == 2013),] 117 | closingschools_fall2012 = school_info[(school_info$Max_Year == 2012),] 118 | 119 | # ------------------------- # 120 | # new schools fall 2013 # 121 | # ------------------------- # 122 | 123 | newschools_fall2013 = school_info[(school_info$Min_Year == 2014),] 124 | 125 | # pick 1 new school from fall 2013 to analyze 126 | 127 | newschool1_fall2013 = newschools_fall2013[13,]$SCHOOL_CODE 128 | newschool1_type = schools_basicdata[(schools_basicdata$SchoolID == newschool1_fall2013),]$SchoolGradeGroup 129 | newschool1_location = schools_basicdata[(schools_basicdata$SchoolID == newschool1_fall2013),c("Longitude","Latitude")] 130 | newschool1_name = schools_basicdata[(schools_basicdata$SchoolID == newschool1_fall2013),]$SchoolName 131 | 132 | dist_school1 = rdist.earth(newschool1_location, schools_basicdata[,c("Longitude","Latitude")], miles = TRUE, R = 
NULL) 133 | schools_within1mile = schools_basicdata[(dist_school1 <= 1),] 134 | schools_within1mile = schools_within1mile[(!is.na(schools_within1mile$SchoolID)),] 135 | 136 | test = merge(schools_within1mile, enrolls, all.x = TRUE, by.x = "SchoolID", by.y = "SCHOOL_CODE") 137 | test = test[(test$SchoolGradeGroup == newschool1_type),] 138 | 139 | table(test[(test$SchoolID == newschool1_fall2013),]$Grade) 140 | 141 | test2 = ddply(test, .(SchoolID, Year, Longitude, Latitude), summarize, total_enr = sum(Enrollments, na.rm = TRUE)) 142 | test2 = test2[(!is.na(test2$Year)),] 143 | 144 | scaleparam = scale_size(limits=c(0,max(test2$total_enr)), range = c(1,15)) 145 | 146 | enr = as.data.frame(test2[(test2$Year == "2012"),]) 147 | qm3 = qmap(location = c(newschool1_location$Longitude, newschool1_location$Latitude) , zoom = 14) + labs(title = '2011-2012', size = "enrollment") +geom_point(data = enr[(enr$SchoolID != newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), show_guide = FALSE) + scaleparam 148 | print(qm3) 149 | 150 | enr = as.data.frame(test2[(test2$Year == "2013"),]) 151 | qm4 = qmap(location = c(newschool1_location$Longitude, newschool1_location$Latitude) , zoom = 14) +labs(title = '2012-2013', size = "enrollment")+ geom_point(data = enr[(enr$SchoolID != newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), show_guide = FALSE) + scaleparam 152 | print(qm4) 153 | 154 | enr = as.data.frame(test2[(test2$Year == "2014"),]) 155 | #qm5 = qmap(location = c(newschool1_location$lon, newschool1_location$lat) , zoom = 14) + labs(title = '2013-2014: newschool', size = "enrollment")+geom_point(data = enr[(enr$SCHOOL_CODE == newschool1_fall2013),], aes(x = lon.x, y = lat.x, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'red', show_guide = FALSE) + geom_point(data = enr[(enr$SCHOOL_CODE != newschool1_fall2013),], aes(x = lon.x, y = lat.x, size = total_enr), show_guide = FALSE) + scaleparam 156 | qm5 
= qmap(location = c(newschool1_location$Longitude, newschool1_location$Latitude) , zoom = 14) + labs(title = '2013-2014: Back of the Yards IB HS', size = "enrollment")+geom_point(data = enr[(enr$SchoolID == newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), color = 'red', show_guide = FALSE) + geom_point(data = enr[(enr$SchoolID != newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), show_guide = FALSE) + scaleparam 157 | print(qm5) 158 | 159 | 160 | # school closings 161 | closeschools_fall2013 = school_info[(school_info$Max_Year == 2013),] 162 | 163 | # pick 1 school that closed in spring 2013 to analyze 164 | closeschool1_fall2013 = sample(unique(closeschools_fall2013$SCHOOL_CODE), size = 1) 165 | closeschool1_type = schools_basicdata[(schools_basicdata$SchoolID == closeschool1_fall2013),]$SchoolGradeGroup 166 | closeschool1_location = report_elem[(report_elem$School.ID == closeschool1_fall2013),c("Longitude","Latitude")] 167 | closeschool1_name = schools_basicdata[(schools_basicdata$SchoolID == closeschool1_fall2013),]$SchoolName 168 | 169 | dist_school1 = rdist.earth(closeschool1_location[c(2,1)], schools_basicdata[,c("Longitude","Latitude")], miles = TRUE, R = NULL) 170 | 171 | schools_within1mile = schools_basicdata[((dist_school1 <= 1)|(schools_basicdata$SchoolID == closeschool1_fall2013)),] 172 | schools_within1mile = schools_within1mile[(!is.na(schools_within1mile$SchoolID)),] 173 | 174 | schools_within1mile[(schools_within1mile$SchoolID == closeschool1_fall2013),]$Longitude = closeschool1_location$Latitude 175 | schools_within1mile[(schools_within1mile$SchoolID == closeschool1_fall2013),]$Latitude = closeschool1_location$Longitude 176 | 177 | test = merge(schools_within1mile, enrolls, all.x = TRUE, by.x = "SchoolID", by.y = "SCHOOL_CODE") 178 | test = test[(test$SchoolGradeGroup == closeschool1_type),] 179 | 180 | table(test[(test$SchoolID == closeschool1_fall2013),]$Grade) # K - 8 181 | test = 
test[-(test$Grade == 'PE'),] 182 | test = test[-(test$Grade == 'PK'),] 183 | 184 | test2 = ddply(test, .(SchoolID, Year, Longitude, Latitude), summarize, total_enr = sum(Enrollments, na.rm = TRUE)) 185 | #test2 = test2[(!is.na(test2$Year)),] 186 | 187 | enr = test2[(test2$Year == "2010"),] 188 | qm1 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude), zoom = 14) + labs(title = '2010') + geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan') + geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 189 | 190 | enr = test2[(test2$Year == "2011"),] 191 | qm2 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) + labs(title = '2011')+geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan')+ geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 192 | 193 | enr = test2[(test2$Year == "2012"),] 194 | qm3 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) + labs(title = '2012') +geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan') +geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 195 | 196 | enr = test2[(test2$Year == "2013"),] 197 | qm4 = 
qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) +labs(title = '2013')+ geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan') +geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 198 | 199 | enr = test2[(test2$Year == "2014"),] 200 | qm5 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) + labs(title = '2014') + geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 201 | 202 | pushViewport(viewport(layout = grid.layout(2, 3))) 203 | print(qm1, vp = viewport(layout.pos.row = 1, layout.pos.col = 1)) 204 | print(qm2, vp = viewport(layout.pos.row = 1, layout.pos.col = 2)) 205 | print(qm3, vp = viewport(layout.pos.row = 1, layout.pos.col = 3)) 206 | print(qm4, vp = viewport(layout.pos.row = 2, layout.pos.col = 1)) 207 | print(qm5, vp = viewport(layout.pos.row = 2, layout.pos.col = 2)) 208 | -------------------------------------------------------------------------------- /Data_Analysis/explore_8_to_9_transition.R: -------------------------------------------------------------------------------- 1 | setwd("//admin/appdata/DataWarehouse/DSSG") 2 | load("Student_Data/students.RData") 3 | library(reshape2) 4 | library(ggplot2) 5 | 6 | students$High_School = ifelse(is.na(students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL), 7 | "None", as.character(students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL)) 8 | students$High_School_Code = ifelse(is.na(students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL_CODE), 9 | 0, students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL_CODE) 10 | 11 | students2010 = subset(students, 
SCHOOL_YEAR == "2010-2011") 12 | 13 | transition_codes = table(students2010$STUDENT_ANNUAL_SCHOOL_CODE, 14 | students2010$High_School_Code) 15 | transition_codes_years = table(students$STUDENT_ANNUAL_SCHOOL_CODE, 16 | students$High_School_Code, 17 | students$SCHOOL_YEAR) 18 | 19 | 20 | 21 | trans_codes_m = melt(transition_codes_years, varnames = c("ES_code", "HS_code", "SCHOOL_YEAR")) 22 | trans_codes_m = trans_codes_m[trans_codes_m$value>0,] 23 | head(trans_codes_m) 24 | 25 | schools = read.csv("School Reports/schools.csv") 26 | schools_loc = subset(schools, select = c("SCHOOL_CODE", "lat", "lon", "SCHOOL_TYPE")) 27 | 28 | trans_codes_m = merge(trans_codes_m, schools_loc[, -4], by.x = "ES_code", by.y = "SCHOOL_CODE", all.x = TRUE) 29 | names(trans_codes_m)[5:6] <- paste("ES", c("lat","lon"), sep = "_") 30 | 31 | trans_codes_m = merge(trans_codes_m, schools_loc, by.x = "HS_code", by.y = "SCHOOL_CODE", all.x = TRUE) 32 | names(trans_codes_m)[7:8] <- paste("HS", c("lat","lon"), sep = "_") 33 | 34 | # plot on map for some schools #### 35 | library(ggmap) 36 | pdf("Visualizations/R plots/explore_8_to_9.pdf") 37 | source_school_codes = sample(unique(trans_codes_m$ES_code),20) 38 | # map = get_googlemap('chicago', zoom = 11) 39 | for (sc in source_school_codes) { 40 | es_trans = as.data.frame(trans_codes_m[trans_codes_m$ES_code == sc & trans_codes_m$SCHOOL_YEAR == "2010-2011",]) 41 | if (!is.na(es_trans$ES_lat[1])) { 42 | school_name = as.character(schools$SCHOOL_SHORT_NAME[schools$SCHOOL_CODE == es_trans$ES_code[1]]) 43 | qm <- qmap("Chicago", zoom = 10) + labs(size = "# Students", title = school_name) + 44 | geom_point(data = es_trans[1,], aes(x = ES_lon, y = ES_lat), shape = "x", colour = "black", size = 10) + 45 | geom_point(data = es_trans, aes(x = HS_lon, y = HS_lat, size = value), colour = "red") 46 | print(qm) 47 | } 48 | } 49 | dev.off() 50 | 51 | # plot with "background singers" #### 52 | high_schools = 
as.data.frame(unique(trans_codes_m[!is.na(trans_codes_m$HS_lat), 7:9])) 53 | plot_base = ggplot(high_schools, aes(HS_lon, HS_lat)) + geom_point(aes(colour = SCHOOL_TYPE), shape = 1) + coord_map() 54 | 55 | source_school_codes = sample(unique(trans_codes_m$ES_code),20) 56 | for (sc in source_school_codes) { 57 | es_trans = as.data.frame(trans_codes_m[trans_codes_m$ES_code == sc, ]) 58 | es_trans[es_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(es_trans$HS_code == 0)) 59 | if (!is.na(es_trans$ES_lat[1])) { 60 | school_name = as.character(schools$SCHOOL_SHORT_NAME[schools$SCHOOL_CODE == es_trans$ES_code[1]]) 61 | school_type = as.character(schools$SCHOOL_TYPE[schools$SCHOOL_CODE == es_trans$ES_code[1]]) 62 | p <- plot_base + geom_point(data = es_trans, aes(size = value, colour = SCHOOL_TYPE)) + 63 | geom_point(data = es_trans, aes(x = ES_lon, y = ES_lat, colour = school_type), shape = "x", size = 10) + 64 | labs(size = "# Students", title = paste0(school_name, " (", school_type, ")"), x = "Latitude", y = "Longitude") + 65 | facet_wrap( ~ SCHOOL_YEAR, nrow = 1) 66 | print(p) 67 | } 68 | } 69 | 70 | # distance between elem school and high schools #### 71 | rdist.earth.vec <- function(x1, x2, miles = TRUE, R = NULL) { 72 | # modified from fields::rdist.earth 73 | if (is.null(R)) { 74 | if (miles) 75 | R <- 3963.34 76 | else R <- 6378.388 77 | } 78 | coslat1 <- cos((x1[, 2] * pi)/180) 79 | sinlat1 <- sin((x1[, 2] * pi)/180) 80 | coslon1 <- cos((x1[, 1] * pi)/180) 81 | sinlon1 <- sin((x1[, 1] * pi)/180) 82 | 83 | coslat2 <- cos((x2[, 2] * pi)/180) 84 | sinlat2 <- sin((x2[, 2] * pi)/180) 85 | coslon2 <- cos((x2[, 1] * pi)/180) 86 | sinlon2 <- sin((x2[, 1] * pi)/180) 87 | 88 | pp <- cbind(coslat1 * coslon1, coslat1 * sinlon1, sinlat1) * 89 | cbind(coslat2 * coslon2, coslat2 * sinlon2, sinlat2) 90 | pp = rowSums(pp) 91 | return(R * acos(ifelse(abs(pp) > 1, 1 * sign(pp), pp))) 92 | } 93 | 94 | trans_codes_m$Distance = with(trans_codes_m, 
rdist.earth.vec(cbind(ES_lon, ES_lat), cbind(HS_lon, HS_lat))) 95 | 96 | trans_codes_m2010 = subset(trans_codes_m, SCHOOL_YEAR == "2010-2011" & !is.na(Distance)) 97 | ggplot(trans_codes_m2010, aes(Distance, weight = value)) + geom_histogram(binwidth=1) + facet_wrap( ~ SCHOOL_TYPE, scales = "free_y") 98 | 99 | wquantile <- function(v,w=rep(1,length(v)),p=.5) { 100 | # from https://stat.ethz.ch/pipermail/r-help/2009-February/188762.html 101 | if (!is.numeric(v) || !is.numeric(w) || length(v) != length(w)) 102 | stop("Values and weights must be equal-length numeric vectors") 103 | if ( !is.numeric(p) || any( p<0 | p>1 ) ) 104 | stop("Quantiles must be 0<=p<=1") 105 | ranking <- order(v) 106 | sumw <- cumsum(w[ranking]) 107 | if ( is.na(w[1]) || w[1]<0 ) stop("Weights must be non-negative numbers") 108 | plist <- sumw/sumw[length(sumw)] 109 | sapply(p, function(p) v [ ranking [ which.max( plist >= p ) ] ]) 110 | } 111 | 112 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Regular"), 113 | wquantile(Distance, value, p = .9)) 114 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Charter"), 115 | wquantile(Distance, value, p = .9)) 116 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Performance"), 117 | wquantile(Distance, value, p = .9)) 118 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Contract"), 119 | wquantile(Distance, value, p = .9)) 120 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Alternative"), 121 | wquantile(Distance, value, p = .9)) 122 | with(subset(trans_codes_m2010), 123 | wquantile(Distance, value, p = 0.8569962)) 124 | 125 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Regular"), 126 | sum(value[Distance<6.27])/sum(value)) 127 | 128 | 129 | # open Illinois zip code shape files #### 130 | setwd("//admin/appdata/DataWarehouse/DSSG/Other Datasets/IllinoisZipCodes") 131 | # http://geocommons.com/overlays/305198 132 | library(rgdal) 133 | library(maptools) 134 | library(rgeos) 135 | 136 | ilzips = readOGR(dsn=".", layer="Zip_Codes") 137 | 
# Attach row ids, repair invalid polygon geometry (zero-width buffer), and
# flatten the zip-code SpatialPolygonsDataFrame into a ggplot-friendly frame.
ilzips@data$id = rownames(ilzips@data)
ilzips = gBuffer(ilzips, width=0, byid=TRUE)
ilzips.points = fortify(ilzips, region="id")
ilzips.df = plyr::join(ilzips.points, ilzips@data, by="id")


# look at zip code source to high school ####
# Count students by (home zip, destination high school, school year).
setwd("//admin/appdata/DataWarehouse/DSSG")
transition_zips_years = table(students$STUDENT_POSTAL_CODE,
                              students$High_School_Code,
                              students$SCHOOL_YEAR)

# Long format; keep only observed (zip, HS, year) combinations.
trans_codes_m = melt(transition_zips_years, varnames = c("ZipCode", "HS_code", "SCHOOL_YEAR"))
trans_codes_m = trans_codes_m[trans_codes_m$value>0,]
head(trans_codes_m)

schools = read.csv("School Reports/schools.csv")
schools_loc = subset(schools, select = c("SCHOOL_CODE", "lat", "lon", "SCHOOL_TYPE"))

library(zipcode)
data(zipcode)

# Attach zip centroid coordinates, then destination-school coordinates.
# NOTE(review): the positional renames [5:6] and [7:8] depend on the exact
# column order these merges produce -- fragile if upstream columns change.
trans_codes_m = merge(trans_codes_m, zipcode[, c(1, 4, 5)], by.x = "ZipCode", by.y = "zip", all.x = TRUE)
names(trans_codes_m)[5:6] <- paste("zip", c("lat","lon"), sep = "_")

trans_codes_m = merge(trans_codes_m, schools_loc, by.x = "HS_code", by.y = "SCHOOL_CODE", all.x = TRUE)
names(trans_codes_m)[7:8] <- paste("HS", c("lat","lon"), sep = "_")

library(dplyr)
# Zip codes sending at least 1000 students over the whole period, largest first.
big_zips = tbl_df(trans_codes_m) %>%
  group_by(ZipCode) %>%
  summarise(Students = sum(value)) %>%
  filter(Students >= 1000) %>%
  arrange(desc(Students))

# Per-(zip, year) totals -> percent of a zip's students going to each HS.
total_per_zip_year = tbl_df(trans_codes_m) %>%
  group_by(ZipCode, SCHOOL_YEAR) %>%
  summarise(TotalStudents = sum(value))
trans_codes_m = merge(trans_codes_m, total_per_zip_year)
trans_codes_m$Percent = trans_codes_m$value / trans_codes_m$TotalStudents * 100
# trans_codes_m$Distance = with(trans_codes_m, rdist.earth.vec(cbind(zip_lon, zip_lat), cbind(HS_lon, HS_lat)))


# plot
# source_zips = sort(sample(as.character(unique(trans_codes_m$ZipCode))[-1],20))
library(maps)
# One row per (year, HS location, type) for the background point layer.
high_schools =
  as.data.frame(unique(subset(trans_codes_m, !is.na(HS_lat),
                              c("SCHOOL_YEAR", "HS_lat", "HS_lon", "SCHOOL_TYPE"))))

source_zips = big_zips[1:10, 1]
z = source_zips[2]
size_range = range(trans_codes_m[trans_codes_m$ZipCode %in% source_zips, "Percent"])


# Shared base map: HS locations over a state outline, faceted by year.
plot_base = ggplot(high_schools, aes(HS_lon, HS_lat)) +
  labs(size = "% of Students", y = "Latitude", x = "Longitude", colour = "High School Type") +
  scale_size_continuous(breaks = seq(10, floor(size_range[2]/10)*10, by = 10), limit = size_range, range = c(2, 6)) +
  facet_wrap( ~ SCHOOL_YEAR, nrow = 1) + geom_path( data=map_data("state"), aes(x=long, y=lat, group = group),colour="black") +
  coord_map(xlim = range(high_schools$HS_lon), ylim = range(high_schools$HS_lat)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
# One PDF page per large source zip.
pdf("Visualizations/R plots/explore_zip_to_9.pdf", hei=7, wid = 11)
for (z in source_zips) {
  cat(which(z == source_zips), " ")  # progress indicator
  zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
  # HS_code 0: no CPS high school; give those rows a dummy plotting location.
  if (sum(zip_trans$HS_code == 0)>0)
    zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))
  if (!is.na(zip_trans$zip_lat[1])) {
    zip_code = as.character(z)
    zip_poly = geom_polygon(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), fill="grey", alpha = .8)
    # zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
    p <- plot_base + zip_poly + geom_point(aes(colour = SCHOOL_TYPE), shape = 1) +
      geom_point(data = zip_trans, aes(size = Percent, colour = SCHOOL_TYPE)) +
      ggtitle(paste0("Zip Code = ", zip_code))
    # geom_point(data = zip_trans, aes(x = zip_lon, y = zip_lat), shape = "x", size = 10)
    print(p)
  }
}
dev.off()


# same as above with google map in background
# library(ggmap)
# plot_base = qmap("Chicago", zoom = 10) +
#   labs(size = "% of Students", x = "Latitude", y = "Longitude", colour = "High School Type") +
#   scale_size_continuous(breaks = seq(10, floor(size_range[2]/10)*10, by = 10), limit = size_range, range = c(3, 10)) +
#   facet_wrap( ~ SCHOOL_YEAR, nrow = 1) # + xlim(range(high_schools$HS_lat)) + ylim(range(high_schools$HS_lon))
# for (z in source_zips) {
#   zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
#   if (sum(zip_trans$HS_code == 0)>0)
#     zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))
#   if (!is.na(zip_trans$zip_lat[1])) {
#     zip_code = as.character(z)
#     zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
#     p <- plot_base + zip_poly +
#       geom_point(data = high_schools, aes(HS_lon, HS_lat, colour = SCHOOL_TYPE), shape = 1, size = 3) +
#       geom_point(data = zip_trans, aes(HS_lon, HS_lat, size = Percent, colour = SCHOOL_TYPE)) +
#       ggtitle(paste0("Zip Code = ", zip_code))
#     # geom_point(data = zip_trans, aes(x = zip_lon, y = zip_lat), shape = "x", size = 10)
#     print(p)
#   }
# }

# plot for OpenGov Hack Night
# Restrict to the most recent year; student counts (not percents) drive point size.
trans_codes_m = subset(trans_codes_m, SCHOOL_YEAR == "2012-2013")

high_schools = as.data.frame(unique(subset(trans_codes_m, !is.na(HS_lat),
                                           c("SCHOOL_YEAR", "HS_lat", "HS_lon", "SCHOOL_TYPE"))))

source_zips = big_zips[1:10, 1]
size_range = range(trans_codes_m[trans_codes_m$ZipCode %in% source_zips, "Percent"])


plot_base = ggplot(high_schools, aes(HS_lon, HS_lat)) +
  labs(size = "# of Students", y = "Latitude", x = "Longitude") +
  scale_size_continuous(range = c(2, 6)) + # breaks = seq(5, 25, by = 5), limit = size_range,
  geom_path( data=map_data("state"), aes(x=long, y=lat, group = group),colour="black") +
  coord_map(xlim = range(high_schools$HS_lon), ylim = range(high_schools$HS_lat)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

# Single-zip demo plot for the second-largest source zip.
z = source_zips[2]
zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
if (sum(zip_trans$HS_code == 0)>0)
  zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))
if (!is.na(zip_trans$zip_lat[1])) {
  zip_code = as.character(z)
  zip_poly = geom_polygon(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), fill="grey", alpha = .8)
  # zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
  p <- plot_base + zip_poly + geom_point(shape = 1) +
    geom_point(data = zip_trans, aes(size = value)) +
    ggtitle(paste0("Zip Code = ", zip_code))
  # geom_point(data = zip_trans, aes(x = zip_lon, y = zip_lat), shape = "x", size = 10)
  print(p)
}


library(ggmap)

# plot_base = qmap("Chicago", zoom = 10, maptype = "roadmap") +
# colMeans(high_schools[,c(3,2)])
# Google-map background centered on Chicago.
plot_base = qmap(location = c(-87.70831, 41.84566), zoom = 11, maptype = "terrain") +
  labs(size = "Students", x = "Latitude", y = "Longitude")
# + xlim(range(high_schools$HS_lat)) + ylim(range(high_schools$HS_lon))

z = big_zips[2,1]
# One map per large source zip: zip polygon, all HS locations, flows sized by count.
pdf("Visualizations/R plots/zip_to_HS_maps.pdf")
for (z in big_zips[,1]) {
  print(z)
  zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
  if (sum(zip_trans$HS_code == 0)>0)
    zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))

  zip_code = as.character(z)
  # zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
  zip_poly = geom_polygon(data = subset(ilzips.df, ZCTA5CE10 == z),
                          aes(long,lat,group=group), fill="blue", alpha = .5)
  p <- plot_base + zip_poly +
    geom_point(data = high_schools, aes(HS_lon, HS_lat), shape = 1, size = 3) +
    geom_point(data = zip_trans, aes(HS_lon, HS_lat, size = value)) +
    scale_size_continuous(range = c(3, 9)) +
    ggtitle(paste0("Zip Code = ", zip_code)) + theme(legend.text = element_text(size = 16), legend.title = element_text(size = 16))
  print(p)
}
dev.off()
--------------------------------------------------------------------------------
/Data_Analysis/explore_enroll_totals_from_website.Rmd:
--------------------------------------------------------------------------------
Exploratory Data Analysis of Aggregate Enrollments
========================================================

Andrew Landgraf

First, pull in the data created by Zhou, which was downloaded from [here](http://www.cps.edu/SchoolData/Pages/SchoolData.aspx). Also, look at the structure of the data.

```{r}
# setwd("~/../github/predicting_student_enrollment/")
load("enrollment_all_data.Rdata")
# load("enrollment_data.Rdata")
str(output)
head(output[,,1])
```

Transform the data into a data frame, with one row per school and year.

```{r}
library(plyr)
options(stringsAsFactors = FALSE)
# adply over the 3rd margin: one data-frame row per year slice
dat=adply(output, 3, rbind.fill)
str(dat)
```

Edit the variables types and add a `Year` variable.

```{r}
dat$Year = as.numeric(dat$X1) + 2005
dat$School_ID = as.numeric(dat$School_ID)
# dat$School_Name = as.factor(dat$School_Name)
dat$School_Type = as.factor(dat$School_Type)

# columns 5-20 are the per-grade enrollment counts
for (col in 5:20) {
  dat[, col] = as.numeric(dat[, col])
}
```

Add the total enrollments of the school for the first year of the data set (2006) in order to make comparisons.

```{r}
# Baseline (2006) totals, merged onto every year for ratio comparisons.
dat2006 = subset(dat, Year == 2006, select = c("School_ID", "Total"))
colnames(dat2006)[2] = "Total_2006"
dat = merge(dat, dat2006)
```

The total enrollments should match the sum of the enrollments by grade, for each school and year.

```{r}
# Sanity check: per-grade sums vs. reported totals (max discrepancy should be 0).
plot(rowSums(subset(dat,select=c(PE:G12))), dat$Total)
max( abs( rowSums(subset(dat,select=c(PE:G12))) - dat$Total))
```


Plot the total enrollments by year of each school. Also plot the ratio of the yearly enrollments and the enrollments in the first year (2006). These are broken up by charter and non-charter schools.

```{r fig.width=8, fig.height=6}
library(ggplot2)
ggplot(subset(dat), aes(Year, Total, group = School_ID)) + facet_wrap( ~ School_Type) + geom_line()

ggplot(subset(dat), aes(Year, Total / Total_2006, group = School_ID)) + geom_line(aes(alpha = Total_2006)) +
  facet_wrap( ~ School_Type) + scale_y_log10()
```

Summarize the schools by their average yearly enrollment and the SD of the yearly enrollment. I did this to find a school that consistently has lots of students.
```{r}
head(ddply(dat, .(School_ID), summarize, mean = mean(Total), sd = sd(Total)))
```

For one school (Portage Park), plot some yearly enrollment by grade.

```{r fig.width=8, fig.height=6}
# Portage Park (School_ID 610135): per-grade enrollment over time.
dat_school = subset(dat, School_ID == 610135, select=c(Year,PK:G08))
dat_school = dat_school[order(dat_school$Year), ]

library(reshape2)
school_m = melt(dat_school, id = "Year")
ggplot(school_m, aes(Year, value, colour = variable)) + geom_line() + facet_wrap( ~ variable)
ggplot(school_m, aes(Year, variable, fill = cut(value,8))) + geom_tile() +
  scale_fill_brewer(type = "div", palette = 1) + labs(title = "Portage Park", y = "Grade", fill = "# Enrolled")
```

From the tile plot, you can see some trends of high and low enrollments following each year.
--------------------------------------------------------------------------------
/Data_Analysis/grade9_transition_matrix.R:
--------------------------------------------------------------------------------
library("plyr")
library("GISTools")
library("ggplot2")

# 9th-grade to/from counts for 2013 plus the school lookup table.
setwd("/Volumes/appdata-3/School Reports")
data = read.csv("tbl_9thgrade_toandfrom_tracy.csv", stringsAsFactors = F)
data2013 = subset(data, year == 2013)
schoolstable = read.csv("schools.csv")

# High-school attendance boundaries; keep only boundaries covering grade 9.
setwd("/Volumes/appdata-3/Other Datasets/public neighborhood data/CPS High School Attendance Boundaries SY13 14")
HS_boundaries = readShapePoly(fn = "geo_aeud-d3nt-1")
HS_boundaries_data = slot(HS_boundaries, "data")
HS_boundaries_data = subset(HS_boundaries_data, BoundaryGr %in% c("9, 10", "9, 10, 11, 12"))
HS_catchment1314 = unique(HS_boundaries_data$SchoolID)
HS_catchment1314_keys = subset(schoolstable, SchoolID %in% HS_catchment1314, select = SchoolKey)

# Per-catchment totals, used as the denominator for transition proportions.
catchmentsum = ddply(data2013, .(CATCHMENT_SCHOOL_KEY), summarize, catchmentsum = sum(Freq))

data2013 = merge(data2013, catchmentsum, all.x = T, by.x = "CATCHMENT_SCHOOL_KEY", by.y = "CATCHMENT_SCHOOL_KEY")

# Keep only students whose catchment school actually serves grade 9.
data2013 = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

# merge lat long
# Attach the destination school's coordinates (used to sort schools N to S).
data2013 = merge(data2013, schoolstable[,c("SchoolKey","Longitude","Latitude")], all.x = T, by.x = "NEXT_GRADE_SCHOOL_KEY", by.y = "SchoolKey")
data2013 = subset(data2013, !is.na(CATCHMENT_SCHOOL_NAME))
data2013 = subset(data2013, !is.na(NEXT_GRADE_SCHOOL_NAME))

# Proportion of the catchment's students going to each destination, and a
# 0/1 flag for "stayed at the catchment school".
data2013$catchmentprop = data2013$Freq/data2013$catchmentsum
data2013$iscatchment = 1*(data2013$NEXT_GRADE_SCHOOL_KEY == data2013$CATCHMENT_SCHOOL_KEY)

# all together sorted N to S
data2013$NEXT_GRADE_SCHOOL_NAME = with(data2013, reorder(NEXT_GRADE_SCHOOL_NAME, Latitude))
data2013$CATCHMENT_SCHOOL_NAME = factor(data2013$CATCHMENT_SCHOOL_NAME, levels = intersect(levels(data2013$NEXT_GRADE_SCHOOL_NAME), unique(data2013$CATCHMENT_SCHOOL_NAME)))

# Transition-matrix heat map (sqrt scale to make small flows visible).
p <- ggplot(data2013, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_y_discrete(limits = rev(levels(data2013$CATCHMENT_SCHOOL_NAME)))
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# catchment schools only
data2013_catch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_catch = subset(data2013_catch, NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

data2013_catch$NEXT_GRADE_SCHOOL_NAME = with(data2013_catch, reorder(NEXT_GRADE_SCHOOL_NAME, Latitude))
data2013_catch$CATCHMENT_SCHOOL_NAME = factor(data2013_catch$CATCHMENT_SCHOOL_NAME, levels = levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))

p <- ggplot(data2013_catch,
            aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = 8, angle = 270, hjust = 0, colour = "grey50"))
# without names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# Same matrix, but highlighting only the diagonal "stayed in catchment" cells.
p2 <- ggplot(data2013_catch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = iscatchment), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p2 + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = 8, angle = 270, hjust = 0, colour = "grey50"))
# without names
p2 + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# non-catchment schools only
data2013_noncatch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_noncatch = subset(data2013_noncatch, !
  NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

data2013_noncatch$NEXT_GRADE_SCHOOL_NAME = with(data2013_noncatch, reorder(NEXT_GRADE_SCHOOL_NAME, Latitude))
data2013_noncatch$CATCHMENT_SCHOOL_NAME = factor(data2013_noncatch$CATCHMENT_SCHOOL_NAME, levels = levels(data2013_catch$CATCHMENT_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))


p <- ggplot(data2013_noncatch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_noncatch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = 8, angle = 270, hjust = 0, colour = "grey50"))
# without names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# all schools, by rating
setwd("/Volumes/appdata-3/Master_Data")
schoolfeatures = read.csv("schools_changing_features_2013.csv", stringsAsFactors = F)

# Attach the catchment school's rating/safety/mobility features.
data2013 = merge(data2013, schoolstable[,c("SchoolKey", "SchoolID")], all.x = T, by.x = "CATCHMENT_SCHOOL_KEY", by.y = "SchoolKey")
data2013 = merge(data2013, schoolfeatures[,c("SchoolID","Rating","Safety","Mobility")], all.x = T, by.x = "SchoolID", by.y = "SchoolID")
table(data2013$Rating)
data2013$Rating2 = as.numeric(as.factor(data2013$Rating))

# catchment schools only
data2013_catch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_catch = subset(data2013_catch, NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

# Reorder both axes by the catchment school's numeric rating.
data2013_catch$CATCHMENT_SCHOOL_NAME = with(data2013_catch, reorder(CATCHMENT_SCHOOL_NAME, Rating2))
data2013_catch$NEXT_GRADE_SCHOOL_NAME = factor(data2013_catch$NEXT_GRADE_SCHOOL_NAME, levels = levels(data2013_catch$CATCHMENT_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))

p <- ggplot(data2013_catch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME)))
# with names
# Hairline geom_rects mark rating-group boundaries (positions hard-coded for
# the 47 catchment schools in this snapshot).
p + theme(legend.position = "none", axis.ticks = element_blank(),
          axis.title.x = element_blank(), axis.title.y = element_blank(),
          axis.text.x = element_text(size = 8, angle = 270, hjust = 0,
                                     colour = "grey50")) +
  geom_rect(aes(xmin = 0.5, xmax = 47.5, ymin = 5.5, ymax = 5.5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(xmin = 0.5, xmax = 47.5, ymin = 30.5, ymax = 30.5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(ymin = 0.5, ymax = 47.5, xmin = 47.5-5, xmax = 47.5-5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(ymin = 0.5, ymax = 47.5, xmin = 47.5-30, xmax = 47.5-30),
            fill = "transparent", color = "black", size = 1)

# non-catchment
data2013_noncatch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_noncatch = subset(data2013_noncatch, !
  NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

data2013_noncatch$CATCHMENT_SCHOOL_NAME = factor(data2013_noncatch$CATCHMENT_SCHOOL_NAME, levels = levels(data2013_catch$CATCHMENT_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))


p2 <- ggplot(data2013_noncatch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_noncatch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p2 + theme(legend.position = "none", axis.ticks = element_blank(),
           axis.title.x = element_blank(), axis.title.y = element_blank(),
           axis.text.x = element_text(size = 8, angle = 270, hjust = 0,
                                      colour = "grey50")) +
  geom_rect(aes(xmin = 0.5, xmax = 103.5, ymin = 5.5, ymax = 5.5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(xmin = 0.5, xmax = 103.5, ymin = 30.5, ymax = 30.5),
            fill = "transparent", color = "black", size = 1)


# without names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())
--------------------------------------------------------------------------------
/Data_Analysis/history_evaluation.R:
--------------------------------------------------------------------------------
######################################
###Historical Projection Evaluation###
######################################

###CPS Team###
###07/21/2014###

rm(list=ls())
setwd("/Volumes/DSSG/Master_Data/")
suppressMessages(library(ggplot2))
suppressMessages(library(reshape2))
suppressMessages(library(gridExtra))
# CPS projections (csp*) and realized enrollments (true*) for 2009-2013,
# plus the hand-edited projection for 2013.
pred2009 <- read.csv("csp2009.csv")
pred2010 <- read.csv("csp2010.csv")
pred2011 <- read.csv("csp2011.csv")
pred2012 <- read.csv("csp2012.csv")
pred2013 <- read.csv("csp2013.csv")
true2009 <- read.csv("true2009.csv")
true2010 <- read.csv("true2010.csv")
true2011 <- read.csv("true2011.csv")
true2012 <- read.csv("true2012.csv")
true2013 <- read.csv("true2013.csv")
edit <- read.csv("cps_adjust.csv")

###Process Data###
# Per-school misprojection (projection minus actual) for the schools present
# in both the prediction and the truth table. Rows are aligned by SchoolID.
mis_projection <- function(pred, real) {
  id <- intersect(pred$SchoolID, real$SchoolID)
  pred <- subset(pred, pred$SchoolID%in%id)
  real <- subset(real, real$SchoolID%in%id)
  pred <- pred[order(pred$SchoolID),]
  real <- real[order(real$SchoolID),]
  data <- pred-real #projection-actual
  data$SchoolID <- sort(id)
  return(data)
}
data2009 <- mis_projection(pred2009, true2009)
data2010 <- mis_projection(pred2010, true2010)
data2011 <- mis_projection(pred2011, true2011)
data2012 <- mis_projection(pred2012, true2012)
data2013 <- mis_projection(pred2013, true2013)
data2009$year <- 2009
data2010$year <- 2010
data2011$year <- 2011
data2012$year <- 2012
data2013$year <- 2013

###Evaluation###
# Bin one misprojection column (`name`) into five labelled categories around
# +/-25 and +/-`threshold`.
data_plot <- function(data, name, threshold) {
  temp <- data.frame(year=data$year, miss=unlist(data[name]))
  temp$indicator <- NA
  # FIX: the "Over-projected By 25" branch was lost when this file was
  # mangled (the factor levels below prove it existed); reconstructed here.
  # Also switched to vectorized index assignment, which is a no-op when a
  # category is empty (temp[cond, ]$indicator <- ... errors on zero rows).
  temp$indicator[temp$miss>=threshold] <- "Over-projected By 100"
  temp$indicator[temp$miss>25 & temp$miss<threshold] <- "Over-projected By 25"
  temp$indicator[temp$miss>= -25 & temp$miss<=25] <- "Tolerably Mis-projected"
  temp$indicator[temp$miss< -25 & temp$miss>-threshold] <- "Under-projected By 25"
  temp$indicator[temp$miss<=-threshold] <- "Under-projected By 100"
  temp$indicator <- factor(temp$indicator, levels=c("Under-projected By 100", "Under-projected By 25", "Tolerably Mis-projected", "Over-projected By 25", "Over-projected By 100"))
  return(temp)
}
# Per-year counts of schools and of schools missed by at least `threshold`.
data_error <- function(data, threshold) {
  err <- data.frame(Year=c(2009, 2010, 2011, 2012, 2013), Total=NA, Error=NA)
  for (i in 1:nrow(err)) {
    temp <- subset(data, data$year==err[i,]$Year)
    err[i,]$Total <- nrow(temp)
    err[i,]$Error <- length(which(abs(temp$miss)>=threshold))
  }
  return(err)
}
#school
data2009_new <- data_plot(data2009, "ATOT", 100)
data2010_new <- data_plot(data2010, "ATOT", 100)
data2011_new <- data_plot(data2011, "ATOT", 100)
data2012_new <- data_plot(data2012, "ATOT", 100)
data2013_new <- data_plot(data2013, "ATOT", 100)
data_all <- do.call("rbind", list(data2009_new, data2010_new, data2011_new, data2012_new, data2013_new))
data_err <- data_error(data_all, 100)
# Stacked bar of misprojection categories per year, annotated with the count
# of schools missed by 100+.
ggplot()+
  geom_bar(data=data_all, aes(x=factor(year), fill=indicator), binwidth=0.5, position="stack")+
  geom_text(data=data_err, aes(x=factor(Year), y=Total+10, label=Error), vjust=0)+
  xlab("Year")+ylab("Number of Schools")

###Serious Mis-projection By Year###
# Per grade and year: number of schools whose grade projection was off by 25+.
name <- c("K", "Grade1", "Grade2", "Grade3", "Grade4", "Grade5", "Grade6", "Grade7", "Grade8", "Grade9", "Grade10", "Grade11", "Grade12")
grade <- data.frame(matrix(NA, 5, length(name)+1))
colnames(grade) <- c("Year", name)
grade$Year <- 2009:2013
for (i in 1:length(name)) {
  grade[1,i+1] <- length(which(abs(unlist(data2009[name[i]]))>=25))
  grade[2,i+1] <- length(which(abs(unlist(data2010[name[i]]))>=25))
  grade[3,i+1] <- length(which(abs(unlist(data2011[name[i]]))>=25))
  grade[4,i+1] <- length(which(abs(unlist(data2012[name[i]]))>=25))
  grade[5,i+1] <- length(which(abs(unlist(data2013[name[i]]))>=25))
}
grade_melted <- melt(grade, id=1)
ggplot(data=grade_melted, aes(x=Year, y=value, colour=variable))+geom_line()+xlab("Year")+ylab("Number of Schools")

###Compare CSM And Edit###
real <-
data.frame(SchoolID=true2013$SchoolID, Total=true2013$ATOT)
csmp <- data.frame(SchoolID=pred2013$SchoolID, ATOT=pred2013$ATOT)
before <- merge(real, csmp)
after <- merge(real, edit)
# Misprojection histograms before vs. after the manual edit, side by side.
before_plot <- ggplot(before, aes(x=ATOT-Total))+geom_histogram(binwidth=5)+xlab("Mis-project")+ylab("Number of Schools")+xlim(-500, 500)+ggtitle("Before Edit")
after_plot <- ggplot(after, aes(x=ATOT-Total))+geom_histogram(binwidth=5)+xlab("Mis-project")+ylab("Number of Schools")+xlim(-500, 500)+ggtitle("After Edit")
grid.arrange(before_plot, after_plot, ncol=2)

####Last Year Prediction Error####
# Per grade: number of schools missed by 30+ in the 2013 projection.
name <- c("K", paste0("Grade", 1:12))
temp <- data2013[,names(data2013)%in%name]
grade_graph <- data.frame(Grade=c("K", as.character(1:12)), Misprojection=colSums(abs(temp)>=30))
grade_graph$Grade <- factor(grade_graph$Grade, levels=c("K", as.character(1:12)))
ggplot(grade_graph, aes(x=Grade, y=Misprojection))+geom_bar(stat="identity", fill="slategray4")+theme_bw(base_size=20)+labs(x=NULL, y="# Schools with Large Error")




--------------------------------------------------------------------------------
/Data_Analysis/investigate_9thgrade.R:
--------------------------------------------------------------------------------
# source('C:/Users/Andrew/github/predicting_student_enrollment/estimation-sandbox/feeder_school_prediction_89.R')
setwd("/Volumes/appdata-3")
load("/Volumes/appdata-3/Student_Data/students_89_snap.RData")
schools = read.csv("/Volumes/appdata-3/School Reports/schools.csv")

library(fields)

# Drop special-education students; treat missing school keys as 0 (outside CPS).
students = subset(students, !(STUDENT_EDUCATIONAL_EXCEPT_TYP %in% c("21 - 60%", "61 - 100%")))
students$THIS_GRADE_SCHOOL_KEY[is.na(students$THIS_GRADE_SCHOOL_KEY)] = 0
students$NEXT_GRADE_SCHOOL_KEY[is.na(students$NEXT_GRADE_SCHOOL_KEY)] = 0

# For each fall-2011 9th-grade destination, count students by source:
# outside CPS, from their catchment school, or from another CPS school.
tbl2011_0 = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010 & THIS_GRADE_SCHOOL_KEY ==
0), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2011_0$studentsource = 'outside_CPS'
#colnames(tbl2011_0) = c("SCHOOL_KEY", "Num_0")

tbl2011_catchment = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010 & NEXT_GRADE_SCHOOL_KEY == CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2011_catchment$studentsource = 'catchment'
#colnames(tbl2011_catchment) = c("SCHOOL_KEY", "Num_catchment")

tbl2011_other = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010 & THIS_GRADE_SCHOOL_KEY > 0 & NEXT_GRADE_SCHOOL_KEY != CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2011_other$studentsource = 'other'
#colnames(tbl2011_other) = c("SCHOOL_KEY", "Num_other")

tbl2011 = rbind(tbl2011_0, tbl2011_catchment, tbl2011_other)
tbl2011$year = 2011
#tbl2011 = merge(tbl2011_0, tbl2011_catchment, all = TRUE)
#tbl2011 = merge(tbl2011, tbl2011_other, all = TRUE)


# Same breakdown for the 2012 cohort.
tbl2012_0 = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011 & THIS_GRADE_SCHOOL_KEY == 0), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2012_0$studentsource = 'outside_CPS'
#colnames(tbl2012_0) = c("SCHOOL_KEY", "Num_0")
tbl2012_catchment = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011 & NEXT_GRADE_SCHOOL_KEY == CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2012_catchment$studentsource = 'catchment'
#colnames(tbl2012_catchment) = c("SCHOOL_KEY", "Num_catchment")
tbl2012_other = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011 & THIS_GRADE_SCHOOL_KEY > 0 & NEXT_GRADE_SCHOOL_KEY != CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2012_other$studentsource = 'other'
#colnames(tbl2012_other) = c("SCHOOL_KEY", "Num_other")

tbl2012 = rbind(tbl2012_0, tbl2012_catchment, tbl2012_other)
tbl2012$year = 2012
#tbl2012 = merge(tbl2012_0, tbl2012_catchment, all = TRUE)
#tbl2012 = merge(tbl2012, tbl2012_other, all = TRUE)

# Same breakdown for the 2013 cohort.
tbl2013_0 =
as.data.frame(with(subset(students, SCHOOL_YEAR == 2012 & THIS_GRADE_SCHOOL_KEY == 0), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2013_0$studentsource = 'outside_CPS'
#colnames(tbl2013_0) = c("SCHOOL_KEY", "Num_0")
tbl2013_catchment = as.data.frame(with(subset(students, SCHOOL_YEAR == 2012 & NEXT_GRADE_SCHOOL_KEY == CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2013_catchment$studentsource = 'catchment'
#colnames(tbl2013_catchment) = c("SCHOOL_KEY", "Num_catchment")
tbl2013_other = as.data.frame(with(subset(students, SCHOOL_YEAR == 2012 & THIS_GRADE_SCHOOL_KEY > 0 & NEXT_GRADE_SCHOOL_KEY != CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2013_other$studentsource = 'other'
#colnames(tbl2013_other) = c("SCHOOL_KEY", "Num_other")

tbl2013 = rbind(tbl2013_0, tbl2013_catchment, tbl2013_other)
tbl2013$year = 2013
#tbl2013 = merge(tbl2013_0, tbl2013_catchment, all = TRUE)
#tbl2013 = merge(tbl2013, tbl2013_other, all = TRUE)

#tbl2011$Year = 2011
#tbl2012$Year = 2012
#tbl2013$Year = 2013

# Stack all three cohorts and export.
tbl9thgradesources = rbind(tbl2011, tbl2012, tbl2013)
setwd("/Volumes/appdata-3/Student_Data")
write.csv(tbl9thgradesources, file = "tbl9thgradesources_tracy.csv", row.names = FALSE)


# all to-and-from data
# Full catchment -> destination transition counts per cohort.
tbl2011_all = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010), table(CATCHMENT_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY)))
tbl2011_all$year = 2011

tbl2012_all = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011), table(CATCHMENT_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY)))
tbl2012_all$year = 2012

tbl2013_all = as.data.frame(with(subset(students, SCHOOL_YEAR == 2012), table(CATCHMENT_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY)))
tbl2013_all$year = 2013

tblall = rbind(tbl2011_all, tbl2012_all, tbl2013_all)

# Attach readable school names for both sides of the transition.
tblall = merge(tblall, schools[,c("SCHOOL_KEY","SCHOOL_SHORT_NAME")], all.x = TRUE, by.x =
"CATCHMENT_SCHOOL_KEY", by.y = "SCHOOL_KEY") 82 | colnames(tblall)[5] = 'CATCHMENT_SCHOOL_NAME' 83 | 84 | tblall = merge(tblall, schools[,c("SCHOOL_KEY","SCHOOL_SHORT_NAME")], all.x = TRUE, by.x = "NEXT_GRADE_SCHOOL_KEY", by.y = "SCHOOL_KEY") 85 | colnames(tblall)[6] = 'NEXT_GRADE_SCHOOL_NAME' 86 | 87 | setwd("/Volumes/appdata-3/School Reports") 88 | write.csv(tblall, file = "tbl_9thgrade_toandfrom_tracy.csv", row.names = FALSE) 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | setwd("/Volumes/appdata-3/Count Data") 98 | countdata = read.csv("enrollment_byschool_byyear_bygrade.csv") 99 | 100 | # plot by school 101 | countdata_wide = reshape(countdata, v.names = "ENROLLMENT", timevar = "YEAR", idvar = c("SCHOOL_CODE","GRADE"), direction = "wide") 102 | countdata_wide[is.na(countdata_wide)] = 0 103 | 104 | grade9change = subset(countdata_wide, GRADE == 'X9' & ENROLLMENT.2012 > 0 & ENROLLMENT.2013 > 0) 105 | head(grade9change[order(abs(grade9change$ENROLLMENT.2013 - grade9change$ENROLLMENT.2012), decreasing = T),c("SCHOOL_CODE")]) 106 | 107 | plotschool(609707) 108 | 109 | plotschool = function(sc_code){ 110 | key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1] 111 | name = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_NAME)[1,1] 112 | 113 | test = matrix(nrow = 3, ncol = 3) 114 | test = as.data.frame(test) 115 | rownames(test) = c("Catchment","Other","Outside district") 116 | colnames(test) = c("2011", "2012", "2013") 117 | test[1,1] = subset(tbl2011, SCHOOL_KEY == key, select = "Num_catchment") 118 | test[2,1] = subset(tbl2011, SCHOOL_KEY == key, select = "Num_other") 119 | test[3,1] = subset(tbl2011, SCHOOL_KEY == key, select = "Num_0") 120 | 121 | test[1,2] = subset(tbl2012, SCHOOL_KEY == key, select = "Num_catchment") 122 | test[2,2] = subset(tbl2012, SCHOOL_KEY == key, select = "Num_other") 123 | test[3,2] = subset(tbl2012, SCHOOL_KEY == key, select = "Num_0") 124 | 125 | test[1,3] = subset(tbl2013, SCHOOL_KEY == key, select = "Num_catchment") 126 | 
# --- tail of plotschool(): fill in the 2013 column and draw stacked bars ---
test[2,3] = subset(tbl2013, SCHOOL_KEY == key, select = "Num_other")
test[3,3] = subset(tbl2013, SCHOOL_KEY == key, select = "Num_0")
test = as.matrix(test)
test[is.na(test)] = 0  # categories with no students plot as 0, not NA

barplot(test, beside = F, legend.text = T, args.legend = list(x = 'topleft'),
        main = paste(name), ylab = "Grade 9 enrollment", col = c(2, 4, 5))
}

# Transition counts for fall 2013: 8th-grade school (rows) x 9th-grade school (cols).
tbl2013 = with(subset(students, SCHOOL_YEAR == 2012),
               table(THIS_GRADE_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY))

# Pick a high school to investigate (only the last assignment takes effect).
sc_code = 609713
sc_code = 609764
sc_code = 610563
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))

# Elementary schools sending students to this high school in each fall.
el_schools2012 = tbl2012[, colnames(tbl2012) == key]
el_schools2013 = tbl2013[, colnames(tbl2013) == key]

el_schools2012df = data.frame(SCHOOL_KEY = names(el_schools2012), Num2012 = el_schools2012)
el_schools2013df = data.frame(SCHOOL_KEY = names(el_schools2013), Num2013 = el_schools2013)

# BUG FIX: the merge result used to be overwritten on the next line with
# `el_schoolsdf = el_schools2013df`, dropping the Num2012 column that every
# statement below relies on.  Keep the full outer merge.
el_schoolsdf = merge(el_schools2012df, el_schools2013df, all = TRUE)
el_schoolsdf$Num2012[is.na(el_schoolsdf$Num2012)] = 0
el_schoolsdf$Num2013[is.na(el_schoolsdf$Num2013)] = 0
el_schoolsdf = subset(el_schoolsdf, Num2012 > 0 | Num2013 > 0)
el_schoolsdf = merge(el_schoolsdf,
                     subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                     all.x = TRUE)
# Great-circle distance from each feeder school to the chosen school
# (rdist.earth from the fields package).
dist.mat = rdist.earth(subset(el_schoolsdf, , c(lon, lat)), location)
el_schoolsdf = cbind(el_schoolsdf, Distance = dist.mat[,1])
el_schoolsdf$Change = el_schoolsdf$Num2013 - el_schoolsdf$Num2012

# Feeders with the largest year-over-year swings / largest 2013 counts.
head(el_schoolsdf[order(-abs(el_schoolsdf$Num2012 - el_schoolsdf$Num2013)),], 10)
head(el_schoolsdf[order(-abs(el_schoolsdf$Num2013)),], 10)

qplot(Distance, Change, data = el_schoolsdf)

sc_code = 610329
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
# High-school view: where did students at the selected school come from?
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))

high_schools2012 = tbl2012[rownames(tbl2012) == key, ]
high_schools2013 = tbl2013[rownames(tbl2013) == key, ]

high_schools2012df = data.frame(SCHOOL_KEY = names(high_schools2012), Num2012 = high_schools2012)
high_schools2013df = data.frame(SCHOOL_KEY = names(high_schools2013), Num2013 = high_schools2013)

high_schoolsdf = merge(high_schools2012df, high_schools2013df, all = TRUE)
high_schoolsdf$Num2012[is.na(high_schoolsdf$Num2012)] = 0
high_schoolsdf$Num2013[is.na(high_schoolsdf$Num2013)] = 0
high_schoolsdf = subset(high_schoolsdf, Num2012 > 0 | Num2013 > 0)
high_schoolsdf = merge(high_schoolsdf,
                       subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                       all.x = TRUE)
dist.mat = rdist.earth(subset(high_schoolsdf, , c(lon, lat)), location)
high_schoolsdf = cbind(high_schoolsdf, Distance = dist.mat[,1])
high_schoolsdf$Change = high_schoolsdf$Num2013 - high_schoolsdf$Num2012

head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012 - high_schoolsdf$Num2013)),], 10)
tail(high_schoolsdf[order(high_schoolsdf$Num2012 - high_schoolsdf$Num2013),], 10)
head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012)),], 10)

colSums(high_schoolsdf[, 2:3])
# -------------------------------------------------------------------------
# /Data_Analysis/investigate_juarez_change.R
# -------------------------------------------------------------------------
# source('C:/Users/Andrew/github/predicting_student_enrollment/estimation-sandbox/feeder_school_prediction_89.R')
setwd("//admin/appdata/DataWarehouse/DSSG")
load("//admin/appdata/DataWarehouse/DSSG/Student_Data/students_89_snap.RData")
schools = read.csv("//admin/appdata/DataWarehouse/DSSG/School Reports/schools.csv")
library(fields)    # rdist.earth()
library(ggplot2)   # FIX: qplot() is called below but ggplot2 was never loaded

# Drop students in substantially separate special-education settings;
# treat a missing school key as 0 ("outside CPS").
students = subset(students, !(STUDENT_EDUCATIONAL_EXCEPT_TYP %in% c("21 - 60%", "61 - 100%")))
students$THIS_GRADE_SCHOOL_KEY[is.na(students$THIS_GRADE_SCHOOL_KEY)] = 0
students$NEXT_GRADE_SCHOOL_KEY[is.na(students$NEXT_GRADE_SCHOOL_KEY)] = 0

# 8th-grade school (rows) x 9th-grade school (cols) transition counts.
tbl2012 = with(subset(students, SCHOOL_YEAR == 2011), table(THIS_GRADE_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY))
tbl2013 = with(subset(students, SCHOOL_YEAR == 2012), table(THIS_GRADE_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY))

# Pick a school to investigate (only the last assignment takes effect).
sc_code = 609713
sc_code = 609764
sc_code = 610563
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))
el_schools2012 = tbl2012[, colnames(tbl2012) == key]
el_schools2013 = tbl2013[, colnames(tbl2013) == key]

el_schools2012df = data.frame(SCHOOL_KEY = names(el_schools2012), Num2012 = el_schools2012)
el_schools2013df = data.frame(SCHOOL_KEY = names(el_schools2013), Num2013 = el_schools2013)

# BUG FIX: removed `el_schoolsdf = el_schools2013df`, which overwrote the
# outer merge and dropped the Num2012 column used by everything below.
el_schoolsdf = merge(el_schools2012df, el_schools2013df, all = TRUE)
el_schoolsdf$Num2012[is.na(el_schoolsdf$Num2012)] = 0
el_schoolsdf$Num2013[is.na(el_schoolsdf$Num2013)] = 0
el_schoolsdf = subset(el_schoolsdf, Num2012 > 0 | Num2013 > 0)
el_schoolsdf = merge(el_schoolsdf,
                     subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                     all.x = TRUE)
dist.mat = rdist.earth(subset(el_schoolsdf, , c(lon, lat)), location)
el_schoolsdf = cbind(el_schoolsdf, Distance = dist.mat[,1])
el_schoolsdf$Change = el_schoolsdf$Num2013 - el_schoolsdf$Num2012

head(el_schoolsdf[order(-abs(el_schoolsdf$Num2012 - el_schoolsdf$Num2013)),], 10)
head(el_schoolsdf[order(-abs(el_schoolsdf$Num2013)),], 10)

qplot(Distance, Change, data = el_schoolsdf)

sc_code = 610329
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))

high_schools2012 = tbl2012[rownames(tbl2012) == key, ]
high_schools2013 = tbl2013[rownames(tbl2013) == key, ]

high_schools2012df = data.frame(SCHOOL_KEY = names(high_schools2012), Num2012 = high_schools2012)
high_schools2013df = data.frame(SCHOOL_KEY = names(high_schools2013), Num2013 = high_schools2013)

high_schoolsdf = merge(high_schools2012df, high_schools2013df, all = TRUE)
high_schoolsdf$Num2012[is.na(high_schoolsdf$Num2012)] = 0
high_schoolsdf$Num2013[is.na(high_schoolsdf$Num2013)] = 0
high_schoolsdf = subset(high_schoolsdf, Num2012 > 0 | Num2013 > 0)
high_schoolsdf = merge(high_schoolsdf,
                       subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                       all.x = TRUE)
dist.mat = rdist.earth(subset(high_schoolsdf, , c(lon, lat)), location)
high_schoolsdf = cbind(high_schoolsdf, Distance = dist.mat[,1])
high_schoolsdf$Change = high_schoolsdf$Num2013 - high_schoolsdf$Num2012

head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012 - high_schoolsdf$Num2013)),], 10)
tail(high_schoolsdf[order(high_schoolsdf$Num2012 - high_schoolsdf$Num2013),], 10)
head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012)),], 10)

colSums(high_schoolsdf[, 2:3])
# -------------------------------------------------------------------------
# /Data_Analysis/map_neighborhood_catchment.R
# -------------------------------------------------------------------------
setwd("//admin/appdata/DataWarehouse/DSSG/Master_Data/")
library(reshape2)
library(ggplot2)
rm(list=ls())   # NOTE(review): clears the whole workspace; fine for a standalone script

year = 2013
students = read.csv(paste0("student", year, ".csv"))
# Drop students in substantially separate special-education settings.
students = subset(students, !(EducationType %in% c("21 - 60%", "61 - 100%")))
schools = read.csv("schools_static_features.csv")

# students$NextGradeSchoolKey = ifelse(is.na(students$NextGradeSchoolKey),
#                                      0, as.character(students$NextGradeSchoolKey))
students = subset(students, !is.na(students$NextGradeSchoolKey))
students$CatchmentSchoolKey = ifelse(is.na(students$CatchmentSchoolKey),
                                     0, students$CatchmentSchoolKey)

# Counts of catchment school (rows) x school actually attended (cols),
# melted to long form and labeled with the public school IDs.
trans_table = with(students, table(CatchmentSchoolKey, NextGradeSchoolKey))
trans_table_melt <- melt(trans_table)
trans_table_melt = merge(trans_table_melt, subset(schools, , c(SchoolKey, SchoolID)),
                         by.x = "CatchmentSchoolKey", by.y = "SchoolKey", all.x = TRUE)
colnames(trans_table_melt)[4] = "CatchmentSchoolID"
trans_table_melt = merge(trans_table_melt, subset(schools, , c(SchoolKey, SchoolID)),
                         by.x = "NextGradeSchoolKey", by.y = "SchoolKey", all.x = TRUE)
colnames(trans_table_melt)[5] = "NextGradeSchoolID"

# Per catchment area: share of students who enrolled in their catchment school.
library(dplyr)
PercentSame = tbl_df(trans_table_melt) %>%
  mutate(SameSchool = (CatchmentSchoolKey == NextGradeSchoolKey)) %>%
  group_by(CatchmentSchoolID) %>%
  summarise(
    Total = sum(value),
    Catchment = sum(ifelse(SameSchool, value, 0))
  ) %>%
  mutate(PercentCatchment = Catchment/Total)

# Neighborhood boundaries
setwd("//admin/appdata/DataWarehouse/DSSG/Other Datasets/public neighborhood data/CPS High School Attendance Boundaries SY13 14")
# http://geocommons.com/overlays/305198
library(rgdal)
library(maptools)
library(rgeos)

boundaries = readOGR(dsn=".", layer="geo_aeud-d3nt-1")
boundaries@data$id = rownames(boundaries@data)
boundaries = gBuffer(boundaries, width=0, byid=TRUE)   # zero-width buffer repairs invalid rings
boundaries.points = fortify(boundaries, region="id")
boundaries.df = plyr::join(boundaries.points, boundaries@data, by="id")

boundaries.df = merge(boundaries.df, PercentSame,
                      by.x = "SchoolID", by.y = "CatchmentSchoolID", all.x=TRUE)
boundaries.df = boundaries.df[order(boundaries.df$order),]   # fortify order must be restored after merge
boundaries.df = subset(boundaries.df, !is.na(boundaries.df$Catchment))

library(ggmap)
plot_base = qmap(location = c(-87.70831, 41.84566), zoom = 10, maptype = "terrain")

# Optional single-school to/from map (kept for reference).
# FIX: "bourdaries.df" typo corrected to "boundaries.df" so this runs if re-enabled.
# id = 609708 # 609679
# boundaries_school_counts = merge(boundaries.df, subset(trans_table_melt, CatchmentSchoolID == id, c(NextGradeSchoolID, value)),
#                                  by.x = "SchoolID", by.y = "NextGradeSchoolID", all.x = TRUE)
# # boundaries_school_counts$value[is.na(boundaries_school_counts$value)] <- 0
# colnames(boundaries_school_counts)[ncol(boundaries_school_counts)] <- "To"
# boundaries_school_counts = merge(boundaries_school_counts, subset(trans_table_melt, NextGradeSchoolID == id, c(CatchmentSchoolID, value)),
#                                  by.x = "SchoolID", by.y = "CatchmentSchoolID", all.x = TRUE)
# # boundaries_school_counts$value[is.na(boundaries_school_counts$value)] <- 0
# colnames(boundaries_school_counts)[ncol(boundaries_school_counts)] <- "From"
# boundaries_school_counts = boundaries_school_counts[order(boundaries_school_counts$order),]
#
# school = subset(schools, SchoolID == id)
#
# school_mark = geom_point(data=school, aes(x = Longitude, y = Latitude), col = 'red', shape = 4, size = 4)
# school_path = geom_path(data = boundaries_school_counts, aes(long,lat,group=group))
# school_poly = geom_polygon(data = boundaries_school_counts, aes(long,lat,group=group, fill = To), alpha = 1)
# p <- ggplot() + school_poly + school_mark + school_path + ggtitle(paste0("School = ", school$SchoolName)) +
#   scale_fill_continuous(low = "white", high = "steelblue")
# print(p)

# Continuous-scale version of the city-wide map (kept for reference).
# school_path = geom_path(data = boundaries.df, aes(long,lat,group=group), colour = "black", alpha = .65, size = 0.5)
# school_poly = geom_polygon(data = boundaries.df, aes(long,lat,group=group, fill = PercentCatchment*100), alpha = .6)
# p <- plot_base + school_poly + school_path + scale_fill_continuous(low = "white", high = "#000066") +
#   ggtitle("Percent of Students that Go to their Catchment School") + labs(fill=NULL) +
#   theme(legend.justification=c(1,1), legend.position=c(1,1), legend.background = element_rect(fill="transparent"))
# print(p)
# FIX: `school_path` was used below but its definition only existed in the
# commented-out block above; recreate the boundary-outline layer before use.
school_path = geom_path(data = boundaries.df, aes(long, lat, group = group),
                        colour = "black", alpha = .65, size = 0.5)
# Binned (15-point) choropleth of the percent attending the catchment school.
school_poly = geom_polygon(data = boundaries.df,
                           aes(long, lat, group = group,
                               fill = cut(round(PercentCatchment*100), seq(0,75,15))),
                           alpha = .6)
p <- plot_base + school_poly + school_path +
  scale_fill_brewer(palette="Purples",
                    labels = paste0(seq(0,60,15), "% to ", c(seq(15,60,15),65), "%")) +
  labs(fill=NULL) +
  theme(legend.text = element_text(size = 18), legend.justification=c(1,1),
        legend.position=c(1,1), legend.background = element_rect(fill="transparent"))
print(p)
ggsave(plot = p, filename = "//admin/appdata/DataWarehouse/DSSG/Visualizations/R plots/PercentCatchmentSchool.pdf", width=8, height=8)
# ggtitle("Percent of Students that Go to their Catchment School")
# -------------------------------------------------------------------------
# /Data_Analysis/prepare_data_and_shapefile_for_tableau.R
# -------------------------------------------------------------------------
# ------------------------------------------ #
#   create data file to read in to tableau   #
# ------------------------------------------ #

setwd("/Volumes/appdata-3/Other Datasets/public neighborhood data/CPS High School Attendance Boundaries SY13 14")

library("GISTools")
boundaries = readShapePoly(fn = "geo_aeud-d3nt-1")   # load boundaries
boundaries = boundaries[boundaries@data$BoundaryGr %in% c("9, 10", "9, 10, 11, 12"),]   # only 9th grade boundaries
boundaries_polygons = slot(boundaries, "polygons")
boundaries_data = slot(boundaries, "data")

# One row per boundary vertex, tagged with polygon ID and school ID.
# FIX: iterate over the actual polygon count instead of the hard-coded 1:57,
# and bind once at the end instead of growing boundaries_csv with rbind().
polygon_rows = lapply(seq_along(boundaries_polygons), function(i) {
  temp = boundaries_polygons[[i]]                           # pull out polygon i
  tempcoords = slot(slot(temp, "Polygons")[[1]], "coords")  # extract coordinates
  tempid = slot(temp, "ID")                                 # extract polygon ID
  tempname = as.character(boundaries_data[i, "SchoolID"])   # extract school ID

  tempmat = cbind(c(0:(nrow(tempcoords)-1)), tempcoords,
                  rep(tempid, nrow(tempcoords)), rep(tempname, nrow(tempcoords)))
  tempmat = as.data.frame(tempmat)
  colnames(tempmat) = c("PointOrder","Longitude","Latitude","PolygonID","SchoolID")
  tempmat
})
boundaries_csv = do.call(rbind, polygon_rows)

setwd("/Volumes/appdata-3/School Reports")
schoolstable = read.csv("schools.csv")   # schools table from database
# for each catchment school ID, match up the school key and school name
boundaries_csv = merge(boundaries_csv, schoolstable[,c("SchoolKey","SchoolID","SchoolShortName")],
                       all.x = T, by.x = "SchoolID", by.y = "SchoolID")

setwd("/Volumes/appdata-3/School Reports")
data = read.csv("tbl_9thgrade_toandfrom_tracy.csv", stringsAsFactors = F)   # where students come from and go to for 9th grade
data2013 = subset(data, year == 2013)   # only look at 2013

# match up each catchment school key with the school ID
data2013 = merge(data2013, schoolstable[,c("SchoolID","SchoolKey")],
                 all.x = TRUE, by.x = "CATCHMENT_SCHOOL_KEY", by.y = "SchoolKey")

# for each 'next grade' school key, match up the latitude and longitude of that school
data2013 = merge(data2013, schoolstable[,c("SchoolKey","Latitude","Longitude")],
                 all.x = T, by.x = "NEXT_GRADE_SCHOOL_KEY", by.y = "SchoolKey")

data2013 = subset(data2013, Freq > 0)          # keep only pairs with actual students
data2013 = subset(data2013, !is.na(Latitude))  # remove if the location is missing

catchment1314 = unique(boundaries_data$SchoolID)   # school IDs of catchment areas

# only keep catchment areas that have an associated geographic area;
# some students' recorded catchment is still their 8th grade school -- drop those
data2013 = subset(data2013, SchoolID %in% catchment1314)

# prepare data to read in to tableau: create 'type' variable
boundaries_csv$type = 'polygon'
data2013$type = 'school_point'
# Align the polygon rows and the school-point rows on one shared column set,
# then write the Tableau input files.
tableau_cols <- c("type", "SchoolID", "PointOrder", "PolygonID", "Longitude",
                  "Latitude", "CATCHMENT_SCHOOL_NAME", "NEXT_GRADE_SCHOOL_NAME", "Freq")

boundaries_csv$NEXT_GRADE_SCHOOL_NAME <- NA   # polygons have no destination school
boundaries_csv$Freq <- 0
names(boundaries_csv)[7] <- "CATCHMENT_SCHOOL_NAME"   # column 7 is SchoolShortName from the merge
boundaries_csv <- boundaries_csv[, tableau_cols]

data2013$PointOrder <- NA   # points are not part of any polygon outline
data2013$PolygonID <- NA
data2013 <- data2013[, tableau_cols]

setwd("/Volumes/appdata-3/Other Datasets/public neighborhood data")

# Coordinates came through cbind() as text; force them numeric before export.
boundaries_csv$Longitude <- as.numeric(as.character(boundaries_csv$Longitude))
boundaries_csv$Latitude <- as.numeric(as.character(boundaries_csv$Latitude))

data_for_tableau <- rbind(boundaries_csv, data2013)
write.csv(data_for_tableau, file = "data_for_tableau_FINAL.csv", row.names = F)

# for reverse, run above except do not remove 0 Freq's
locations <- unique(data2013[, c("NEXT_GRADE_SCHOOL_NAME", "Latitude", "Longitude")])
locations$type <- 'locations'
locations$PolygonID <- NA
locations$PointOrder <- NA
locations$SchoolID <- NA
locations$Freq <- 0
locations$CATCHMENT_SCHOOL_NAME <- NA

locations <- locations[, names(data2013)]

data_for_tableau <- rbind(boundaries_csv, locations)
data_for_tableau <- data_for_tableau[, names(data_for_tableau) != 'Freq']
write.csv(data_for_tableau, file = "data_for_tableau_FINAL_reverse.csv", row.names = F)
write.csv(data2013, file = "data_for_tableau_FINAL_reverse2.csv", row.names = F)
# -------------------------------------------------------------------------
# /Data_Analysis/topdown_FINAL.R
# -------------------------------------------------------------------------
# ------------------------------------------------- #
#   predicting catchment high schools - fall 2013   #
# ------------------------------------------------- #
# run CPS_gradebygrade_dataprep.R first -- it defines datafinal12, datafinal13
# and schools_with1314boundaries used throughout this script.

### greg's predictions
# CPS's own FY14 projections, restricted to the catchment high schools.
setwd("/Volumes/appdata-3/School Reports")
gregdata <- read.csv("CSVfileforR_excerptof_FY14SBBALLOCATIONS_FINALSUMMARY_asof_032414.csv",
                     stringsAsFactors = FALSE)
gregdata_chs <- subset(gregdata, SchoolID %in% schools_with1314boundaries)

# Accuracy of the CPS projections: error distribution, MAE, share within 50/100.
cps_err <- gregdata_chs$Projected.Total - gregdata_chs$Actual.Total
summary(cps_err)
mean(abs(cps_err))          # 66
mean(abs(cps_err) <= 50)    # 0.51
mean(abs(cps_err) <= 100)   # 0.73

### prepare data
datafinal12_chs <- subset(datafinal12, SCHOOL_CODE %in% schools_with1314boundaries)
datafinal13_chs <- subset(datafinal13, SCHOOL_CODE %in% schools_with1314boundaries)

# Inspect missingness per column, then recode NA as the level 'Unknown'.
apply(datafinal12_chs, 2, function(x) sum(is.na(x)))
apply(datafinal13_chs, 2, function(x) sum(is.na(x)))
datafinal12_chs[is.na(datafinal12_chs)] <- 'Unknown'
datafinal13_chs[is.na(datafinal13_chs)] <- 'Unknown'

### compare to rolling over 2012 or Zhou's cohort survival projection
library("plyr")
baseline_comparison <- ddply(datafinal13_chs, .(SCHOOL_CODE), summarize,
                             cstotal = sum(cs_projection),
                             lastyeartotal = sum(projection),
                             truetotal = sum(enrollment_to_predict))

cs_err <- baseline_comparison$truetotal - baseline_comparison$cstotal
mean(abs(cs_err))           # cohort survival: 77 student mean absolute error
mean(abs(cs_err) <= 50)     # 0.49
mean(abs(cs_err) <= 100)    # 0.75

roll_err <- baseline_comparison$lastyeartotal - baseline_comparison$truetotal
mean(abs(roll_err))         # roll over 2012: 87 student mean absolute error
mean(abs(roll_err) <= 50)   # 0.39
mean(abs(roll_err) <= 100)  # 0.61
### train regression model on 2012 data
# Enrollment regressed on last year's enrollment and the cohort-survival
# projection, each interacted with school Rating, plus a new-schools-nearby term.
m1 <- lm(enrollment_to_predict ~ (projection + cs_projection)*Rating + newnearby,
         data = datafinal12_chs)
summary(m1)   # summarize coefficients
plot(m1)      # residual plots for model diagnostic

### test model on 2013 data
m1_test <- predict(m1, newdata = datafinal13_chs, interval = "prediction", level = 0.95)
m1_test <- as.data.frame(m1_test)
# Back out a per-row variance from the 95% interval width (width ~= 4 sd).
m1_test_var <- ((m1_test$upr - m1_test$lwr)/4)^2

datafinal13_chs$testpredictions <- m1_test$fit
datafinal13_chs$testvar <- m1_test_var

# Roll the predictions up to school totals.
test_summary <- ddply(datafinal13_chs, .(SCHOOL_CODE), summarize,
                      truetotal = sum(enrollment_to_predict),
                      predictedtotal = sum(testpredictions),
                      predictedvar = sum(testvar))
abs_err <- abs(test_summary$truetotal - test_summary$predictedtotal)
mean(abs_err)          # mean absolute error
mean(abs_err <= 50)
mean(abs_err <= 100)   # 0.80

# calculate prediction interval lower and upper limits
test_summary$predictedlower <- test_summary$predictedtotal - 1.96*sqrt(test_summary$predictedvar)
test_summary$predictedupper <- test_summary$predictedtotal + 1.96*sqrt(test_summary$predictedvar)

# how many 95% intervals cover the true school total (should be about 95%)
mean((test_summary$predictedlower <= test_summary$truetotal) &
       (test_summary$truetotal <= test_summary$predictedupper))

# summarize the widths of the confidence intervals
summary(test_summary$predictedupper - test_summary$predictedlower)

setwd("/Volumes/appdata-3/School Reports")
schools <- read.csv("schools.csv", stringsAsFactors = F)

test_summary <- merge(test_summary, schools[,c("SchoolID","SchoolShortName")],
                      all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID")
test_summary <- merge(test_summary, gregdata_chs[,c("SchoolID","Projected.Total")],
                      all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID")

# plot our estimates and CPS's estimates

plot(test_summary$truetotal, test_summary$predictedtotal, pch = 16,
     xlim = c(0,3300), ylim = c(0,3300),
     xlab = "True enrollment 2013", ylab = "Predicted enrollment 2013")
segments(x0 = test_summary$truetotal,
         y0 = test_summary$predictedtotal - 1.96*sqrt(test_summary$predictedvar),
         x1 = test_summary$truetotal,
         y1 = test_summary$predictedtotal + 1.96*sqrt(test_summary$predictedvar))
segments(x0=0, y0=0, x1=4000, y1=4000, col=2)   # y = x reference line
points(x = test_summary$truetotal, y = test_summary$Projected.Total, pch = 2, col = 4)
legend("bottomright", c("DSSG", "CPS"), col = c(1,4), pch = c(16,2),
       bty = "n", pt.cex = 2, title = "Prediction Model:")

# plot school estimates and intervals
library(ggplot2)
test_summary$SCHOOL_CODE <- with(test_summary, reorder(SCHOOL_CODE, -predictedtotal))

ggplot(data=test_summary, aes(x=SCHOOL_CODE, y=predictedtotal)) +
  geom_bar(stat="identity", fill = "#E69F00") +
  geom_errorbar(aes(ymin=predictedtotal - 1.96*sqrt(predictedvar),
                    ymax=predictedtotal + 1.96*sqrt(predictedvar)),
                width=.2,   # Width of the error bars
                position=position_dodge(.9)) +
  xlab("Catchment high schools") + ylab("Predicted enrollment") +
  theme(legend.position = "none", axis.ticks = element_blank(),
        axis.text.x = element_blank())
# -------------------------------------------------------------------------
# /Data_Pipeline/README.md
# -------------------------------------------------------------------------
## Data Pipeline for CPS Project ##

This folder includes scripts for querying, merging, and saving the data we used for most of our analyses.

### Student Data ###

`create_students_table.R` gathers data for student-level modelling. First, it runs the query `student_table_with_ninth_snapshot_vw.sql` to get the students who attended CPS in either 8th or 9th grades in the last few years.
There is one record per student, per school year, and it combines which school the students went to in 8th and/or 9th grade.

Then it runs `student_ISAT_scores_vw.sql` and `student_attendance_vw.sql` to get the students' ISAT scores and historical attendance, respectively. This is done separately to keep the original query fast; these data sets are all merged together afterwards. The merged data is then saved one year at a time.

Finally, we run the script `geocode_addresses.R` to get the latitude and longitude for each student's home address.

### School Data ###

`create_schools_table.R` pulls all of the schools' features from the database. After this, it geocodes the schools, gets their census block information, and saves the results as `schools_static_features.csv`.

This script also runs the query in `school_aggregate_features_vw.sql`, which pulls yearly summaries of the students who attend each of the schools. It captures the percentage of students receiving free or reduced priced lunches, the percentage of ESL students, the percentage of male students, the percentage of homeless students, the average student's GPA, and the students' attendance rate.

The results are saved in `schools_aggregate_features_YEAR.csv`, with YEAR replaced by the appropriate year.

The code in `create_features_table.R` creates the dataframe of school features that change every year. This file combines information from the school report cards downloaded from the [Chicago data portal](https://data.cityofchicago.org/), mobility and race data from [CPS's website](http://cps.edu/SchoolData/Pages/SchoolData.aspx), and [AUSL data](http://auslchicago.org/). Since the inputs are school progress reports, race files, and AUSL files, they are all Excel sheets. Part of the work is done by hand and part of the work is automated. In the future, all files will change but will be publicly available data.
### Note ###

All the SQL queries pull data from views created for us by CPS. This is done to keep CPS's data structure private.

# -------------------------------------------------------------------------
# /Data_Pipeline/create_features_table.R
# -------------------------------------------------------------------------
#!/usr/bin/Rscript

#######################################
####Create School Changing Features####
#######################################

####CPS Team####
####08/19/2014####

rm(list=ls())
setwd("/Volumes/DSSG/Zhou_Classification/")
elem2013 <- read.csv("CPS_Elementary_Report_20122013.csv", stringsAsFactors=F)
high2013 <- read.csv("CPS_HighSchool_Report_20122013.csv", stringsAsFactors=F)
elem2014 <- read.csv("CPS_Elementary_Report_20132014.csv", stringsAsFactors=F)
high2014 <- read.csv("CPS_HighSchool_Report_20132014.csv", stringsAsFactors=F)
prog2012 <- read.csv("Progress_2011_2012.csv", stringsAsFactors=F)
name <- c("SchoolID", "SchoolName", "SchoolType", "ZipCode", "Rating", "Probation",
          "Health", "Safety", "Family", "Environment", "Instruction", "Leader", "Teacher")

# The three report-card vintages use different "missing" tokens but share the
# same normalization shape; the helpers below replace the three near-identical
# copy-pasted within() blocks.
icon_cols <- c("Safety", "Family", "Environment", "Instruction", "Leader", "Teacher")

# Replace every value of `col` that appears in `tokens` with NA.
na_out <- function(df, col, tokens) {
  df[[col]][df[[col]] %in% tokens] <- NA
  df
}

# Shared tail of every cleanup: NA-out the missing tokens on the icon columns,
# map the "neutral" token (when given) to "Average", then lower-case all
# rating-type columns.
finish_progress <- function(df, na_tokens, neutral = NULL) {
  for (col in icon_cols) {
    df <- na_out(df, col, na_tokens)
    if (!is.null(neutral)) {
      hit <- !is.na(df[[col]]) & df[[col]] == neutral
      df[[col]][hit] <- "Average"
    }
  }
  for (col in c("Rating", "Probation", "Health", icon_cols)) {
    df[[col]] <- tolower(df[[col]])
  }
  df
}

####2011 - 2012###
progress_2011_2012 <- with(prog2012, {
  data.frame(School.ID, Name.of.School, Elementary..Middle..or.High.School, ZIP.Code,
             CPS.Performance.Policy.Level, CPS.Performance.Policy.Status,
             Healthy.Schools.Certified., Safety.Icon, Family.Involvement.Icon,
             Environment.Icon, Instruction.Icon, Leaders.Icon, Teachers.Icon,
             stringsAsFactors=F)
})
colnames(progress_2011_2012) <- name
# 2011-2012 uses "NDA" as its missing token and fold middle schools into "ES".
progress_2011_2012$SchoolType[progress_2011_2012$SchoolType == "MS"] <- "ES"
progress_2011_2012 <- na_out(progress_2011_2012, "Rating", c("Not Enough Data", "NDA"))
progress_2011_2012$Probation[progress_2011_2012$Probation == "Not on Probation"] <- "No"
progress_2011_2012$Probation[progress_2011_2012$Probation == "Probation"] <- "Yes"
progress_2011_2012 <- na_out(progress_2011_2012, "Probation", c("Not Applicable", "NDA"))
progress_2011_2012 <- finish_progress(progress_2011_2012, na_tokens = "NDA")
write.csv(x=progress_2011_2012, file="/Users/zhouye/Dropbox/DSSG/Edit_Prediction/progress_2011_2012.csv", row.names=F)

###2012 - 2013###
# 2012-2013 uses "Not Enough Data"/"Neutral"; Probation and Health need no recode.
clean_2012_2013 <- function(df) {
  df <- na_out(df, "Rating", "Not Enough Data")
  finish_progress(df, na_tokens = "Not Enough Data", neutral = "Neutral")
}

progress_2012_2013_es <- with(elem2013, {
  data.frame(School.ID, School.Name, "ES", ZIP, Overall.Rating,
             On.Probation, Healthy.School.Certified, Safety,
             Involved.Families, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2012_2013_es) <- name
progress_2012_2013_es <- clean_2012_2013(progress_2012_2013_es)

progress_2012_2013_hs <- with(high2013, {
  data.frame(School.ID, School.Name, "HS", ZIP, Overall.Rating,
             On.Probation, Healthy.School.Certified, Safety,
             Involved.Families, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2012_2013_hs) <- name
progress_2012_2013_hs <- clean_2012_2013(progress_2012_2013_hs)

progress_2012_2013 <- rbind(progress_2012_2013_es, progress_2012_2013_hs)
write.csv(x=progress_2012_2013, file="/Users/zhouye/Dropbox/DSSG/Edit_Prediction/progress_2012_2013.csv", row.names=F)

###2013 - 2014###
# 2013-2014 uses UPPER-CASE tokens and empty strings; Probation and Health are
# recoded to Yes/No.  FIX: the Teacher column previously omitted the
# empty-string check that every sibling column has.
clean_2013_2014 <- function(df) {
  df <- na_out(df, "Rating", c("NOT ENOUGH DATA", ""))
  df <- na_out(df, "Probation", c("NOT APPLICABLE", ""))
  df$Probation[df$Probation %in% "ON PROBATION"] <- "Yes"
  df$Probation[df$Probation %in% "NOT ON PROBATION"] <- "No"
  df <- na_out(df, "Health", c("PENDING CERTIFICATION", ""))
  df$Health[df$Health %in% "HEALTHY SCHOOLS CERTIFIED"] <- "Yes"
  df$Health[df$Health %in% "NOT CERTIFIED"] <- "No"
  finish_progress(df, na_tokens = c("NOT ENOUGH DATA", ""), neutral = "NEUTRAL")
}

progress_2013_2014_es <- with(elem2014, {
  data.frame(School.ID, Name.of.School, "ES", ZIP.Code, CPS.Performance.Policy.Level,
             CPS.Performance.Policy.Status, Healthy.Schools.Certification, Safe,
             Involved.Family, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2013_2014_es) <- name
progress_2013_2014_es <- clean_2013_2014(progress_2013_2014_es)

# FIX: this frame is built from the HIGH SCHOOL report but was labeled "ES";
# use "HS" so the combined table's SchoolType is correct.
progress_2013_2014_hs <- with(high2014, {
  data.frame(School.ID, Name.of.School, "HS", ZIP.Code, CPS.Performance.Policy.Level,
             CPS.Performance.Policy.Status, Healthy.Schools.Certification, Safe,
             Involved.Family, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2013_2014_hs) <- name 162 | progress_2013_2014_hs <- within(progress_2013_2014_hs, { 163 | Rating[Rating=="NOT ENOUGH DATA" | Rating==""] <- NA; 164 | Probation[Probation=="NOT APPLICABLE" | Probation==""] <- NA 165 | Probation[Probation=="ON PROBATION"] <- "Yes" 166 | Probation[Probation=="NOT ON PROBATION"] <- "No" 167 | Health[Health=="PENDING CERTIFICATION" | Health==""] <- NA 168 | Health[Health=="HEALTHY SCHOOLS CERTIFIED"] <- "Yes" 169 | Health[Health=="NOT CERTIFIED"] <- "No" 170 | Safety[Safety=="NOT ENOUGH DATA" | Safety==""] <- NA; 171 | Safety[Safety=="NEUTRAL"] <- "Average"; 172 | Family[Family=="NOT ENOUGH DATA" | Family==""] <- NA; 173 | Family[Family=="NEUTRAL"] <- "Average"; 174 | Environment[Environment=="NOT ENOUGH DATA" | Environment==""] <- NA; 175 | Environment[Environment=="NEUTRAL"] <- "Average"; 176 | Instruction[Instruction=="NOT ENOUGH DATA" | Instruction==""] <- NA; 177 | Instruction[Instruction=="NEUTRAL"] <- "Average"; 178 | Leader[Leader=="NOT ENOUGH DATA" | Leader==""] <- NA; 179 | Leader[Leader=="NEUTRAL"] <- "Average"; 180 | Teacher[Teacher=="NOT ENOUGH DATA"] <- NA; 181 | Teacher[Teacher=="NEUTRAL"] <- "Average"; 182 | Rating <- tolower(Rating); 183 | Probation <- tolower(Probation); 184 | Health <- tolower(Health); 185 | Safety <- tolower(Safety); 186 | Family <- tolower(Family); 187 | Environment <- tolower(Environment); 188 | Instruction <- tolower(Instruction); 189 | Leader <- tolower(Leader); 190 | Teacher <- tolower(Teacher); 191 | }) 192 | progress_2013_2014 <- rbind(progress_2013_2014_es, progress_2013_2014_hs) 193 | write.csv(x=progress_2013_2014, file="/Users/zhouye/Dropbox/DSSG/Edit_Prediction/progress_2013_2014.csv", row.names=F) 194 | 195 | setwd("/Volumes/appdata-3/Volan data/01 - FY 06 to FY 14 Enrollment History/csvfiles") 196 | 197 | # csv files of Greg's 20th day excel files 198 | enrollment2013mat = read.csv("FY2014_20th_DAY_ENROLLMENT_allgrades.csv") 199 | enrollment2012mat = 
read.csv("FY2013_20th_DAY_ENROLLMENT_allgrades.csv") 200 | enrollment2011mat = read.csv("FY2012_20th_DAY_ENROLLMENT_allgrades.csv") 201 | enrollment2010mat = read.csv("FY2011_20th_DAY_ENROLLMENT_allgrades.csv") 202 | enrollment2009mat = read.csv("FY2010_20th_DAY_ENROLLMENT_allgrades.csv") 203 | 204 | # sum up SPED and LRE 205 | enrollment2013mat$LRE = rowSums(enrollment2013mat[,15:20], na.rm = TRUE) 206 | enrollment2012mat$LRE = rowSums(enrollment2012mat[,15:16], na.rm = TRUE) 207 | enrollment2011mat$LRE = rowSums(enrollment2011mat[,15:17], na.rm = TRUE) 208 | enrollment2010mat$LRE = rowSums(enrollment2010mat[,15:17], na.rm = TRUE) 209 | enrollment2009mat$LRE = rowSums(enrollment2009mat[,15:17], na.rm = TRUE) 210 | 211 | # columns to include: school ID, K - 12, LRE total 212 | enrollment2013mat = enrollment2013mat[,c(1:14,21)] 213 | enrollment2012mat = enrollment2012mat[,c(1:14,17)] 214 | enrollment2011mat = enrollment2011mat[,c(1:14,18)] 215 | enrollment2010mat = enrollment2010mat[,c(1:14,18)] 216 | enrollment2009mat = enrollment2009mat[,c(1:14,18)] 217 | 218 | # column names 219 | colnames(enrollment2009mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 220 | colnames(enrollment2010mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 221 | colnames(enrollment2011mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 222 | colnames(enrollment2012mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 223 | colnames(enrollment2013mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 224 | 225 | # remove NA school codes 226 | enrollment2013mat = enrollment2013mat[!is.na(enrollment2013mat$SCHOOL_CODE),] 227 | enrollment2012mat = enrollment2012mat[!is.na(enrollment2012mat$SCHOOL_CODE),] 228 | enrollment2011mat = 
enrollment2011mat[!is.na(enrollment2011mat$SCHOOL_CODE),] 229 | enrollment2010mat = enrollment2010mat[!is.na(enrollment2010mat$SCHOOL_CODE),] 230 | enrollment2009mat = enrollment2009mat[!is.na(enrollment2009mat$SCHOOL_CODE),] 231 | 232 | # if enrollment is NA, fill in 0 233 | enrollment2013mat[(is.na(enrollment2013mat))] = 0 234 | enrollment2012mat[(is.na(enrollment2012mat))] = 0 235 | enrollment2011mat[(is.na(enrollment2011mat))] = 0 236 | enrollment2010mat[(is.na(enrollment2010mat))] = 0 237 | enrollment2009mat[(is.na(enrollment2009mat))] = 0 238 | 239 | # enrollment matrix 240 | temp2009 = reshape(enrollment2009mat, idvar = "SCHOOL_CODE", times = names(enrollment2009mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2009mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 241 | temp2009$YEAR = 2009 242 | 243 | temp2010 = reshape(enrollment2010mat, idvar = "SCHOOL_CODE", times = names(enrollment2010mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2010mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 244 | temp2010$YEAR = 2010 245 | 246 | temp2011 = reshape(enrollment2011mat, idvar = "SCHOOL_CODE", times = names(enrollment2011mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2011mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 247 | temp2011$YEAR = 2011 248 | 249 | temp2012 = reshape(enrollment2012mat, idvar = "SCHOOL_CODE", times = names(enrollment2012mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2012mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 250 | temp2012$YEAR = 2012 251 | 252 | temp2013 = reshape(enrollment2013mat, idvar = "SCHOOL_CODE", times = names(enrollment2013mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2013mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 253 | temp2013$YEAR = 2013 254 | 255 | enrollmentmatrix = rbind(temp2009, temp2010, temp2011, temp2012, temp2013) 256 | 257 | # write to csv 258 | setwd("/Volumes/appdata-3/Count Data") 259 | 
write.csv(enrollmentmatrix, file = "enrollment_byschool_byyear_bygrade.csv", row.names = FALSE) 260 | 261 | # DONE 262 | 263 | 264 | # ========================================================== # 265 | # creating school matrix of features that don't change # 266 | # ========================================================== # 267 | 268 | # list of schools for each year 269 | schools2009 = unique(enrollment2009mat$SCHOOL_CODE) 270 | schools2010 = unique(enrollment2010mat$SCHOOL_CODE) 271 | schools2011 = unique(enrollment2011mat$SCHOOL_CODE) 272 | schools2012 = unique(enrollment2012mat$SCHOOL_CODE) 273 | schools2013 = unique(enrollment2013mat$SCHOOL_CODE) 274 | 275 | # all schools from fall 2009 - 2013 276 | allschools = union(schools2009, union(schools2010, union(schools2011, union(schools2012, schools2013)))) 277 | allschools = allschools[!is.na(allschools)] 278 | 279 | # start making all schools matrix 280 | allschoolsmat = as.data.frame(allschools) 281 | colnames(allschoolsmat) = "SCHOOL_CODE" 282 | 283 | # all schools in database 284 | setwd("/Volumes/appdata-3/School Reports") 285 | schools_table = read.csv("schools.csv", stringsAsFactors = FALSE) 286 | 287 | allschoolsmat = merge(allschoolsmat, schools_table[,c("SCHOOL_CODE","SCHOOL_NAME","SCHOOL_TYPE","SCHOOL_GRADES_GROUP","lon","lat")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 288 | 289 | # use school report from 2011 to try to fill in missing lat/lon 290 | schools_report2011 = read.csv("Chicago_Public_Schools_-_Progress_Report_Cards__2011-2012_.csv", stringsAsFactors = FALSE) 291 | 292 | for (i in 1:nrow(allschoolsmat)){ 293 | if (is.na(allschoolsmat[i,"lon"]) & nrow(schools_report2011[schools_report2011$School.ID == allschoolsmat[i,"SCHOOL_CODE"],]) > 0) 294 | {allschoolsmat[i,"lon"] = schools_report2011[schools_report2011$School.ID == allschoolsmat[i,"SCHOOL_CODE"],"Longitude"] ; 295 | allschoolsmat[i,"lat"] = schools_report2011[schools_report2011$School.ID == 
allschoolsmat[i,"SCHOOL_CODE"],"Latitude"]} 296 | } 297 | 298 | library(ggmap) 299 | 300 | garfieldparklocation = geocode('3250 W Monroe St, Chicago, IL 60612') 301 | allschoolsmat[(allschoolsmat$SCHOOL_CODE == 400095),"lon"] = garfieldparklocation$lon 302 | allschoolsmat[(allschoolsmat$SCHOOL_CODE == 400095),"lat"] = garfieldparklocation$lat 303 | 304 | # crane achievement has same address as crane tech 305 | allschoolsmat[allschoolsmat$SCHOOL_CODE == 610378,"lon"] = allschoolsmat[allschoolsmat$SCHOOL_CODE == 609702,"lon"] 306 | allschoolsmat[allschoolsmat$SCHOOL_CODE == 610378,"lat"] = allschoolsmat[allschoolsmat$SCHOOL_CODE == 609702,"lat"] 307 | 308 | # end of filling in lat/lon 309 | 310 | # indicator for years when school is active 311 | allschoolsmat$active2009 = NA 312 | allschoolsmat$active2010 = NA 313 | allschoolsmat$active2011 = NA 314 | allschoolsmat$active2012 = NA 315 | allschoolsmat$active2013 = NA 316 | 317 | for (i in 1:nrow(allschoolsmat)){ 318 | allschoolsmat[i,"active2009"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2009)) 319 | allschoolsmat[i,"active2010"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2010)) 320 | allschoolsmat[i,"active2011"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2011)) 321 | allschoolsmat[i,"active2012"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2012)) 322 | allschoolsmat[i,"active2013"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2013)) 323 | } 324 | 325 | allschoolsmat = allschoolsmat[(allschoolsmat$SCHOOL_CODE != 400152),] # this school doesn't have any enrollment 326 | 327 | sum(is.na(allschoolsmat)) # 0 328 | 329 | setwd("/Volumes/appdata-3/School Reports") 330 | write.csv(allschoolsmat, file = "schools_staticfeatures.csv", row.names = FALSE) 331 | 332 | # DONE 333 | 334 | 335 | # =============================================== # 336 | # school features that do change year to year # 337 | # =============================================== # 338 
| 339 | # list of schools 340 | setwd("/Volumes/appdata-3/Count Data") 341 | countdata = read.csv("enrollment_byschool_byyear_bygrade.csv") 342 | 343 | # make one data frame for each year to start 344 | 345 | schools_whatyouknowforfall11 = as.data.frame(unique(countdata[(countdata$YEAR == 2011),"SCHOOL_CODE"])) 346 | colnames(schools_whatyouknowforfall11) = "SCHOOL_CODE" 347 | schools_whatyouknowforfall11$year_topredict = 2011 348 | 349 | schools_whatyouknowforfall12 = as.data.frame(unique(countdata[(countdata$YEAR == 2012),"SCHOOL_CODE"])) 350 | colnames(schools_whatyouknowforfall12) = "SCHOOL_CODE" 351 | schools_whatyouknowforfall12$year_topredict = 2012 352 | 353 | schools_whatyouknowforfall13 = as.data.frame(unique(countdata[(countdata$YEAR == 2013),"SCHOOL_CODE"])) 354 | colnames(schools_whatyouknowforfall13) = "SCHOOL_CODE" 355 | schools_whatyouknowforfall13$year_topredict = 2013 356 | 357 | 358 | # race 359 | setwd("/Volumes/appdata-3/School Reports/Ethnicity") 360 | race1011 = read.csv("FY2011_race.csv") 361 | race1112 = read.csv("FY2012_race.csv") 362 | race1213 = read.csv("FY2013_race.csv") 363 | 364 | schools_whatyouknowforfall11 = merge(schools_whatyouknowforfall11, race1011, all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 365 | schools_whatyouknowforfall12 = merge(schools_whatyouknowforfall12, race1112, all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 366 | schools_whatyouknowforfall13 = merge(schools_whatyouknowforfall13, race1213, all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 367 | 368 | 369 | # AUSL turnarounds (source = AUSL site) 370 | 371 | # starting fall 2010 or before: bradwell, chicago academy, chicago academy HS, 372 | # collins, curtis, deneen, dodge, dulles, harvard, howe, johnson, morton, 373 | # national teachers academy, orr, phillips, sherman, solorio, tarkington 374 | AUSL_priorfall2010 = c(609806, 610248, 610340, 610499, 609900, 609883, 609888, 375 | 610263, 609971, 610000, 610274, 610257, 610231, 610389, 376 
| 609727, 610172, 610543, 610396) 377 | # starting fall 2011: none 378 | 379 | schools_whatyouknowforfall11$AUSL = (lapply(schools_whatyouknowforfall11$SCHOOL_CODE, function(x) (x %in% AUSL_priorfall2010)) == TRUE) 380 | 381 | 382 | # starting fall 2012: casals, fuller, herzl, marquette, piccolo, stagg 383 | AUSL_fall2012 = c(610021, 609928, 609991, 610053, 610106, 610339) 384 | schools_whatyouknowforfall12$AUSL = (lapply(schools_whatyouknowforfall12$SCHOOL_CODE, function(x) (x %in% AUSL_fall2012)) == TRUE) 385 | 386 | # starting fall 2013: carter, chalmers, dewey, lewis, o'keeffe 387 | AUSL_fall2013 = c(609844, 609851, 609885, 610036, 610103) 388 | schools_whatyouknowforfall13$AUSL = (lapply(schools_whatyouknowforfall13$SCHOOL_CODE, function(x) (x %in% AUSL_fall2013)) == TRUE) 389 | 390 | # starting fall 2014: dvorak, gresham, mcnair 391 | #AUSL_fall2014 = c(610254, 609955, 610282) 392 | 393 | # other turnarounds (by OSI): marshall HS, vocational career academy, tilden career HS, wendell smith elem, woodson south elem 394 | 395 | 396 | 397 | # mobility rate 398 | setwd("/Volumes/appdata-3/School Reports") 399 | mobilitydata = read.csv("mobilityrates.csv", na.strings = c(" ","")) 400 | # I think the year is the year the school year ended 401 | # (i.e. 
the rate for 2009 is for the 2008-09 school year) 402 | 403 | currentnames = names(schools_whatyouknowforfall12) 404 | 405 | schools_whatyouknowforfall11 = merge(schools_whatyouknowforfall11, mobilitydata[,c("School.ID","X2011")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 406 | colnames(schools_whatyouknowforfall11) = c(currentnames, "mobility_rate") 407 | 408 | schools_whatyouknowforfall12 = merge(schools_whatyouknowforfall12, mobilitydata[,c("School.ID","X2012")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 409 | colnames(schools_whatyouknowforfall12) = c(currentnames, "mobility_rate") 410 | 411 | schools_whatyouknowforfall13 = merge(schools_whatyouknowforfall13, mobilitydata[,c("School.ID","X2013")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 412 | colnames(schools_whatyouknowforfall13) = c(currentnames, "mobility_rate") 413 | 414 | 415 | # report data 416 | setwd("/Volumes/appdata-3/Master_Data") 417 | progress1112 = read.csv("progress_2011_2012.csv", stringsAsFactors = FALSE) 418 | progress1213 = read.csv("progress_2012_2013.csv", stringsAsFactors = FALSE) 419 | progress1314 = read.csv("progress_2013_2014.csv", stringsAsFactors = FALSE) 420 | 421 | schools_whatyouknowforfall11 = merge(schools_whatyouknowforfall11, progress1112[,c(1,5:13)], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SchoolID") 422 | schools_whatyouknowforfall12 = merge(schools_whatyouknowforfall12, progress1213[,c(1,5:13)], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SchoolID") 423 | schools_whatyouknowforfall13 = merge(schools_whatyouknowforfall13, progress1314[,c(1,5:13)], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SchoolID") 424 | 425 | schools_whatyouknowforfallxx = rbind(schools_whatyouknowforfall11, schools_whatyouknowforfall12, schools_whatyouknowforfall13) 426 | 427 | setwd("/Volumes/appdata-3/School Reports") 428 | write.csv(schools_whatyouknowforfallxx, file = "schools_changingfeatures.csv", row.names = FALSE) 429 | 430 | rm(list=ls()) 431 | 
setwd("/Volumes/DSSG/School Reports") 432 | data <- read.csv("schools_changingfeatures.csv") 433 | colnames(data) <- c("SchoolID", "YearToPredict", "White", "Black", "Hispanic", "AUSL", "Mobility", "Rating", "Probation", "Health", "Safety", "Family", "Environment", "Instruction", "Leader", "Teacher") 434 | 435 | data2011 <- subset(data, data$YearToPredict==2011) 436 | data2011$YearToPredict <- NULL 437 | write.csv(x=data2011, file="schools_changing_features_2011.csv", row.names=F) 438 | 439 | data2012 <- subset(data, data$YearToPredict==2012) 440 | data2012$YearToPredict <- NULL 441 | write.csv(x=data2012, file="schools_changing_features_2012.csv", row.names=F) 442 | 443 | data2013 <- subset(data, data$YearToPredict==2013) 444 | data2013$YearToPredict <- NULL 445 | write.csv(x=data2013, file="schools_changing_features_2013.csv", row.names=F) 446 | 447 | rm(list=ls()) 448 | setwd("/Volumes/DSSG/Master_Data/") 449 | enroll <- read.csv("enrollment.csv", stringsAsFactors=T) 450 | all_year <- sort(unique(enroll$YEAR)) 451 | name <- c("SCHOOL_CODE", "Year", "K", "G1", "G2", "G3", "G4", "G5", "G6", 452 | "G7", "G8", "G9", "G10", "G11", "G12") 453 | old_name <- c("K", "X1", "X2", "X3", "X4", "X5", "X6", 454 | "X7", "X8", "X9", "X10", "X11", "X12") 455 | 456 | active_grade <- function(year) { 457 | temp <- subset(enroll, enroll$YEAR==year) 458 | result <- data.frame(matrix(0, nrow(temp), length(name))) 459 | colnames(result) <- name 460 | result$SCHOOL_CODE <- temp$SCHOOL_CODE 461 | result$Year <- year 462 | for (i in 1:nrow(result)) { 463 | for (j in 1:length(old_name)) { 464 | flag <- subset(temp, temp$SCHOOL_CODE==result[i,]$SCHOOL_CODE & temp$GRADE==old_name[j])$ENROLLMENT 465 | result[i,j+2] <- ifelse(flag>0, 1, 0) 466 | } 467 | } 468 | return(result) 469 | } 470 | 471 | data <- list() 472 | for (i in 1:length(all_year)) { 473 | print(i) 474 | data[[i]] <- active_grade(all_year[i]) 475 | } 476 | final_data <- do.call("rbind", data) 477 | write.csv(x=final_data, 
file="/Users/zhouye/Dropbox/DSSG/avtive_grade.csv", row.names=F) 478 | 479 | rm(list=ls()) 480 | setwd("/Volumes/DSSG/Master_Data/") 481 | tract <- read.csv("../Other Datasets/percent_race_by_tract.csv") 482 | static <- read.csv("schools_static_features.csv") 483 | change2011 <- read.csv("schools_changing_features_2011.csv") 484 | change2012 <- read.csv("schools_changing_features_2012.csv") 485 | change2013 <- read.csv("schools_changing_features_2013.csv") 486 | fill_miss <- function(data) { 487 | index <- which(is.na(rowSums(data[,c("White", "Black", "Hispanic")]))) 488 | for (i in index) { 489 | print(i) 490 | id <- data[i,]$SchoolID 491 | j <- which(static$SchoolID==id) 492 | fips <- static[j,]$FIPSCode 493 | code <- as.numeric(substring(fips, 3, 13)) 494 | k <- which(tract$Tract==code) 495 | if (length(k)!=0) { 496 | data[i,c("White", "Black", "Hispanic")] <- tract[k,c("PercentWhite", "PercentBlack", "PercentHispanic")] 497 | } 498 | } 499 | return(data) 500 | } 501 | new2011 <- fill_miss(change2011) 502 | new2012 <- fill_miss(change2012) 503 | new2013 <- fill_miss(change2013) 504 | write.csv(x=new2011, file="schools_changing_features_2011.csv", row.names=F) 505 | write.csv(x=new2012, file="schools_changing_features_2012.csv", row.names=F) 506 | write.csv(x=new2013, file="schools_changing_features_2013.csv", row.names=F) 507 | 508 | 509 | 510 | -------------------------------------------------------------------------------- /Data_Pipeline/create_schools_table.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ##################################### 4 | ####Create School Static Features#### 5 | ##################################### 6 | 7 | ####CPS Team#### 8 | ####08/20/2014#### 9 | 10 | rm(list = ls()) 11 | setwd("//admin/appdata/DataWarehouse/DSSG/School Reports/") 12 | 13 | # Pull in data from CPS data warehouse #### 14 | library(RODBC) 15 | # an ODBC called "DW_QA" which accesses the K12intel_qa 
database 16 | channel <- odbcConnect("DW_QA") 17 | 18 | school_query <- "SELECT * 19 | FROM K12INTEL_DW.dbo.DSSG_School_Query 20 | WHERE SchoolKey>1" 21 | odbcClose(channel) 22 | 23 | schools = sqlQuery(channel, school_query, stringsAsFactors = FALSE) 24 | 25 | # Geocode school addresses #### 26 | library("ggmap") 27 | addresses = with(schools, paste(Address, City, State, ZipCode, sep = ", ")) 28 | good_addresses = !is.na(schools$Address) 29 | geocodeschools = geocode(addresses[good_addresses]) 30 | 31 | schools$Longitude[good_addresses] = geocodeschools$lon 32 | schools$Latitude[good_addresses] = geocodeschools$lat 33 | 34 | # plot(geocodeschools) # should look like chicago 35 | 36 | # Get census block group and tract information based on the geocode #### 37 | get_FIPS <- function(latlons) { 38 | library(RJSONIO) 39 | # latlons is a n X 2 matrix with latitude in first column and longitude in second column 40 | # returns an n-dimensional character vector of FIPS codes 41 | urls = paste0("http://data.fcc.gov/api/block/2010/find?format=json&latitude=", 42 | latlons[,1], "&longitude=", latlons[,2]) 43 | 44 | FIPSs = rep(NA, nrow(latlons)) 45 | index = which(!apply(is.na(latlons), 1, any)) 46 | for (i in index) { 47 | bk = fromJSON(urls[i]) 48 | 49 | if (bk$status == "OK") { 50 | FIPSs[i] = bk$Block 51 | } 52 | } 53 | 54 | FIPSs[index] = paste0("US", FIPSs[index]) 55 | return(FIPSs) 56 | } 57 | 58 | FIPS = get_FIPS(subset(schools, select = c("Latitude", "Longitude"))) 59 | schools$FIPSCode = FIPS 60 | 61 | write.table(schools, "//admin/appdata/DataWarehouse/DSSG/Master_Data/schools_static_features.csv", row.names = FALSE, sep = ",") 62 | 63 | 64 | ## School Aggregate Features #### 65 | setwd("//admin/appdata/DataWarehouse/DSSG/School Reports/") 66 | 67 | fileName1 <- '//admin/appdata/DataWarehouse/DSSG/School Reports/school_aggregate_features_vw.sql' 68 | aggr_query <- readChar(fileName1, file.info(fileName1)$size) 69 | 70 | library(RODBC) 71 | channel <- 
odbcConnect("DW_QA") 72 | school_aggr = sqlQuery(channel, aggr_query, stringsAsFactors = FALSE) 73 | odbcClose(channel) 74 | 75 | for (year in 2011:2013) { 76 | school_aggr_year = subset(school_aggr, SchoolYear == year) 77 | school_aggr_year = school_aggr_year[,-which(colnames(school_aggr)=="SchoolYear")] 78 | write.table(school_aggr_year, paste0("//admin/appdata/DataWarehouse/DSSG/Master_Data/schools_aggregate_features_",year,".csv"), row.names = FALSE, sep = ",") 79 | } 80 | 81 | -------------------------------------------------------------------------------- /Data_Pipeline/create_students_table.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ########################### 4 | ####Create Student data#### 5 | ########################### 6 | 7 | ####CPS Team#### 8 | ####08/20/2014#### 9 | 10 | setwd("//admin/appdata/DataWarehouse/DSSG/Student_Data") 11 | 12 | library(RODBC) 13 | 14 | fileName1 <- '//admin/appdata/DataWarehouse/DSSG/Student_Data/student_table_with_ninth_snapshot_vw.sql' 15 | student_query <- readChar(fileName1, file.info(fileName1)$size) 16 | fileName2 <- '//admin/appdata/DataWarehouse/DSSG/Student_Data/student_ISAT_scores_vw.sql' 17 | tests_query <- readChar(fileName2, file.info(fileName2)$size) 18 | fileName3 <- '//admin/appdata/DataWarehouse/DSSG/Student_Data/student_attendance_vw.sql' 19 | attendance_query <- readChar(fileName3, file.info(fileName3)$size) 20 | 21 | # an ODBC called "DW_QA" which accesses the K12intel_qa database 22 | channel <- odbcConnect("DW_QA") 23 | students = sqlQuery(channel, student_query, stringsAsFactors = TRUE) 24 | tests = sqlQuery(channel, tests_query) 25 | attendance = sqlQuery(channel, attendance_query) 26 | odbcCloseAll() 27 | 28 | # indicators of whether the student left the system after 8th 29 | # or entered the system in 9th grade 30 | students$Left <- as.numeric(is.na(students$NextGradeSchoolKey)) 31 | students$Entered <- 
as.numeric(is.na(students$ThisGradeSchoolKey))
student_addresses$longitude <- newset[,5] 39 | 40 | #write to csv 41 | write.csv(student_addresses, file="student_address_table.csv", row.names=F) 42 | -------------------------------------------------------------------------------- /Data_Pipeline/school_aggregate_features_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.DSSG_SCHOOL_AGGREGATE_FEATURES_VW -------------------------------------------------------------------------------- /Data_Pipeline/student_ISAT_scores_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.dbo.DSSG_STUDENT_ISTAT_SCORES_VW -------------------------------------------------------------------------------- /Data_Pipeline/student_attendance_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.dbo.DSSG_STUDENT_ATTENDANCE_VW -------------------------------------------------------------------------------- /Data_Pipeline/student_table_with_ninth_snapshot_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.dbo.DSSG_STUDENT_TABLE_WITH_NINTH_SNAPSHOT_VW -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ##Predicting School-Level Enrollment for Chicago Public Schools## 2 | 3 | 4 | Statistical models and analysis of student enrollment to better allocate school budgets in Chicago. 5 | This project is a 2014 Data Science for Social Good project in partnership with Chicago Public Schools. 6 | 7 | For a more comprehensive overview, please consult our [blog post](http://dssg.io/2014/07/23/cps-enrollment-prediction.html). 
To download the code we used for the project, you will need to clone the repository using Git. [Git](http://git-scm.com/) is the version control system we used to organize our code. We hosted the code on [Github](http://github.com/). You can download Git [here](http://git-scm.com/downloads). Information on getting started with Git is [here](http://git-scm.com/book/en/Getting-Started-Git-Basics). Additionally, you will need to create a Github account.

Once you have installed Git, you will need to navigate in the command line to the folder in which you want to download the code. Then you will need to clone the repository with the following commands.
32 | 33 | ``` 34 | git clone https://github.com/dssg/predicting_student_enrollment.git 35 | cd predicting_student_enrollment/ 36 | ``` 37 | 38 | #### R 39 | 40 | We did most of our analyses in R. R is available for free via the [R Project website](http://www.r-project.org) or, with a slightly friendlier user interface, as [RStudio](http://www.rstudio.com). See [these resources](https://github.com/dssg/nfp#using-r) for a more detailed overview of R. 41 | 42 | Much of R's functionality comes not from the base installation but from additional packages. Details about how to install packages are available [here](http://www.r-bloggers.com/installing-r-packages). Our code makes use of the following packages: 43 | 44 | * ggplot2 45 | * plyr 46 | * survival 47 | * clogitL1 48 | * randomForest 49 | 50 | ### Team 51 | ![CPS Team](http://dssg.io/img/posts/cps-team.png) 52 | 53 | * [Vanessa Ko](https://github.com/vanessako), Political Science, McGill University 54 | * [Andrew Landgraf](https://github.com/andland), Statistics, The Ohio State University 55 | * [Tracy Schifeling](https://github.com/tracyschifeling), Statistics, Duke University 56 | * [Zhou Ye](https://github.com/ZhouYeJava), Computer Science, Johns Hopkins University 57 | * [Joe Walsh](https://github.com/jtwalsh0), Mentor 58 | 59 | ### License 60 | Copyright (C) 2014 Data Science for Social Good Fellowship at the University of Chicago 61 | 62 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 63 | 64 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
65 | 66 | THE SOFTWARE IS PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 67 | -------------------------------------------------------------------------------- /Tool_Box/README.md: -------------------------------------------------------------------------------- 1 | ##CPS Tool Box## 2 | 3 | ###Description### 4 | Models For CPS: 5 | 6 | 1. Cohort Survival Model: **cohort_survival.R**. 7 | 8 | 2. Decision Tree: **model_diagnostics.R**. 9 | 10 | 3. Conditional Logistic Regression: **grade9_prediction.R**. 11 | 12 | The model code is designed to run on **RStudio**. We recommend RStudio since it is the best way to edit and run R for people without much experience before. 13 | 14 | In order to run three models, you should use the script *main.R*. There are three sections in the code that correspond to the three models. 15 | 16 | For all parts, you need to source the function scripts, set the parameters and run the function to obtain the results. The details of three models are introduced below. 17 | 18 | **We expect CPS to use the above four scripts.** There are also evaluation scripts which are used to evaluate the model: *eval_grade9_prediction.R* and *eval_new_grade9_prediction.R*. The first one is to evaluate on all schools and the second one is on new schools. These codes are used by ourselves to see how the model will perform. 19 | 20 | 21 | ###Cohort Survival Model### 22 | This is essentially the replication of CPS's work on predicting the next year enrollment for each grade of each school given five-year history. 
For example, if we want to predict the enrollment for 2013, we need to use the enrollment data from 2008 to 2012. Our replication improves on the original VBA code: the replication needs 3 seconds to run while the original VBA requires 1 hour and 25 minutes. 23 | 24 | This is the code to run the script: 25 | 26 | ``` 27 | source("cohort_survival.R") 28 | location <- "/mnt/data/cps/Data/" 29 | year_to_predict <- 2013 30 | result <- cohort_survival(location, year_to_predict) 31 | ``` 32 | 33 | The variable "location" is the directory that stores all data files. The variable "year_to_predict" is the year you want to make the prediction for. For example, if you want to make predictions for 2013, then type 2013. The code will automatically find the previous five-year enrollment data for you. 34 | 35 | The input data for the cohort survival model is one active grade file and five enrollment history files. We stress the usage of the active grade file. The column names of this file are SchoolID and all grades. Before making predictions, we suggest knowing, for each school and each grade, whether a prediction is needed. If a grade at a certain school needs a prediction, then put 1 there; otherwise 0. As a default, set all entries to 1. 36 | 37 | The output is a csv file called "csm_prediction_year.csv". The year corresponds to the year you input into the model. 38 | 39 | 40 | ###Decision Tree### 41 | This part is to help CPS diagnose their predictions. Specifically, it will help CPS find the set of schools with the highest likelihood of being mis-projected given the threshold. 
42 | 43 | This is the code to run the script: 44 | 45 | ``` 46 | source("model_diagnostics.R") 47 | location <- "/mnt/data/cps/Data/" 48 | year_to_train <- 2012 49 | threshold <- 50 50 | grade <- "Grade9" 51 | model <- diagnostic(location, year_to_train, threshold, grade) 52 | year_to_predict <- 2013 53 | result <- prediction(location, year_to_predict, threshold, grade, model) 54 | ``` 55 | 56 | The variable "location" is the directory to store all data files. The variable "year_to_train" is the year you want to analyze the misprojection for. The variable "threshold" shows the tolerance for the misprojection: if the misprojection is larger than the threshold, it is bad; otherwise, good. The variable "grade" is the grade you want to make diagnostics on. 57 | 58 | In the code above, we will make diagnostics on the year 2012 and grade 9 with a threshold of 50. The output of the "diagnostic" function is the R object for the decision tree and a visualization of the decision tree. You will find the probability of making a bad projection in the leaves of the tree to see which types of schools are likely to be misprojected. 59 | 60 | The decision tree is very sensitive to the threshold. If you set the threshold to 50 or 25, the two trees might be very different. Thus you need to know what threshold you are interested in before using this script. 61 | 62 | Another function is "prediction". This function is used to predict the probability of making a bad projection in the future. The hope here is to help CPS adjust predictions more accurately. We suggest CPS use this model when you do not have other solid information when adjusting predictions. The input arguments are similar to those of "diagnostic", but it also requires the model trained by "diagnostic" as an input. 63 | 64 | If you want to make a prediction, the output will be "misprojection_year.csv". The year depends on your inputs. 
65 | 66 | 67 | ###Conditional Logistic Regression### 68 | This is our main work of this summer -- a student-level model to make predictions for the 8th to 9th grade. The model we use is a conditional logistic regression. It can provide the probability of student i going to school j for all students and all schools. You can also obtain the predictive enrollment from these probabilities. 69 | 70 | 71 | This is the code to run the script: 72 | 73 | ``` 74 | source("grade9_prediction.R") 75 | location <- "/mnt/data/cps/Data/" 76 | year_to_predict <- 2012 77 | result <- grade9_prediction(location, year_to_predict) 78 | ``` 79 | 80 | The input is exactly the same as the cohort survival model. 81 | 82 | For output, there are two RData files. They are the data frames for training and testing the conditional logit model. Since the data has over 4,000,000 rows and 80 columns, they are saved as RData files. Use the "load" function in R to read the data. The variable names are "data_last_year" and "data_this_year". The code to generate the RData files is (variable "year" is the year to make the prediction on): 83 | 84 | ``` 85 | save(data_last_year, file=paste0("clogit_", year, "_data_", year-1, ".RData")) #last year data 86 | save(data_this_year, file=paste0("clogit_", year, "_data_", year, ".RData")) #this year data 87 | ``` 88 | 89 | There are also four csv files associated with the probability of student i going to school j. 
The code to generate the probability files is: 90 | 91 | ``` 92 | write.csv(x=prob_d, file=paste0("clogit_", year, "_prob_d.csv"), row.names=F) #distance 93 | write.csv(x=prob_dc, file=paste0("clogit_", year, "_prob_dc.csv"), row.names=F) #distance+catchment 94 | write.csv(x=prob_dcr, file=paste0("clogit_", year, "_prob_dcr.csv"), row.names=F) #distance+catchment+rating 95 | write.csv(x=prob_all, file=paste0("clogit_", year, "_prob_all.csv"), row.names=F) #all features 96 | ``` 97 | 98 | There are four models which are associated with what features we want to put in: 99 | 100 | 1. d: we only use distance from home to high school as a feature. 101 | 102 | 2. dc: we use distance and whether the school is a catchment school as features. 103 | 104 | 3. dcr: we use distance, catchment school and school rating as features. 105 | 106 | 4. all: we use all the features, including the above and also previous enrollment, school attendance rate, etc. 107 | 108 | Another four csv files, associated with the four models above, are the enrollment prediction files. The code to generate the enrollment files is: 109 | 110 | ``` 111 | write.csv(x=enrollment_d, file=paste0("clogit_", year, "_prediction_d.csv"), row.names=F) #distance 112 | write.csv(x=enrollment_dc, file=paste0("clogit_", year, "_prediction_dc.csv"), row.names=F) #distance+catchment 113 | write.csv(x=enrollment_dcr, file=paste0("clogit_", year, "_prediction_dcr.csv"), row.names=F) #distance+catchment+rating 114 | write.csv(x=enrollment_all, file=paste0("clogit_", year, "_prediction_all.csv"), row.names=F) #all features 115 | ``` 116 | 117 | The predictive enrollment is obtained by summing up the probability of all students for a specific school. 118 | 119 | Finally, another two files are the baseline and our final ensemble enrollment prediction. CPS should use the ensemble file as their prediction of enrollment of 9th grade in practice. 
The code to generate the two enrollment files is: 120 | 121 | ``` 122 | write.csv(x=baseline, file=paste0("baseline_", year, "_prediction_all.csv"), row.names=F) #baseline 123 | write.csv(x=ensemble, file=paste0("ensemble_", year, "_prediction_all.csv"), row.names=F) #final model 124 | ``` 125 | 126 | The baseline model copies last year's enrollment for old schools; for the new schools, we use (total students - total students predicted in old schools)/(number of new schools). This will guarantee the total students match. 127 | 128 | We recommend that CPS use **ensemble_year_prediction_all.csv** for the final prediction. The year depends on the year you want to make the prediction for. It uses clogit to predict new schools and proportionally scales the students at old schools based on last year's enrollment so that the total number of students is correct. 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /Tool_Box/cohort_survival.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ############################# 4 | ####Cohort Survival Model#### 5 | ############################# 6 | 7 | ####CPS Team#### 8 | ####08/19/2014#### 9 | 10 | ####Moving Average#### 11 | #data: history 12 | #num: years of moving average 13 | ma <- function(data, num) { 14 | pred <- vector(length=length(data)-num) 15 | for (i in 1:(length(data)-num)) { 16 | pred[i] <- mean(data[i:(i+1)]) 17 | } 18 | return(pred) 19 | } 20 | 21 | ####Exponential Smoothing#### 22 | #data: history 23 | #num: percentage of exponential smoothing 24 | es <- function(data, num) { 25 | pred <- vector(length=length(data)-1) 26 | for (i in 1:(length(data)-1)) { 27 | if (i==1) { 28 | pred[i] <- data[i] 29 | } 30 | else { 31 | pred[i] <- num*data[i]+(1-num)*pred[i-1] 32 | } 33 | } 34 | return(pred) 35 | } 36 | 37 | ####All Models#### 38 | #enroll_vector: historical enrollment 39 | #this will 
provide the best prediction based on six models 40 | model <- function(enroll_vector) { 41 | data <- enroll_vector[enroll_vector!=0] 42 | if (length(data)==1) { 43 | cat(paste0("Single"), "\n") 44 | return(data) 45 | } 46 | else if (length(data)==2) { 47 | cat(paste0("Double"), "\n") 48 | return(mean(data)) 49 | } 50 | else { 51 | candidate <- vector(length=6) 52 | value <- vector(length=6) 53 | #MA1 54 | pred <- ma(data, 1) 55 | real <- data[-1] 56 | candidate[1] <- mean((real-pred)^2) 57 | value[1] <- pred[length(pred)] 58 | #MA2 59 | pred <- ma(data, 2) 60 | real <- data[-c(1,2)] 61 | candidate[2] <- mean((real-pred)^2) 62 | value[2] <- pred[length(pred)] 63 | #ES2 64 | pred <- es(data, 0.2) 65 | real <- data[-1] 66 | candidate[3] <- mean((real-pred)^2) 67 | value[3] <- pred[length(pred)] 68 | #ES3 69 | pred <- es(data, 0.3) 70 | real <- data[-1] 71 | candidate[4] <- mean((real-pred)^2) 72 | value[4] <- pred[length(pred)] 73 | #ES4 74 | pred <- es(data, 0.4) 75 | real <- data[-1] 76 | candidate[5] <- mean((real-pred)^2) 77 | value[5] <- pred[length(pred)] 78 | #ES5 79 | pred <- es(data, 0.5) 80 | real <- data[-1] 81 | candidate[6] <- mean((real-pred)^2) 82 | value[6] <- pred[length(pred)] 83 | all_model <- c("MA1", "MA2", "ES2", "ES3", "ES4", "ES5") 84 | cat(paste0(all_model[which.min(candidate)], "\n")) 85 | return(value[which.min(candidate)]) 86 | } 87 | } 88 | 89 | ####Cohort Survival Model#### 90 | #last_grade: name of last grade 91 | #this_grade: name of this grade 92 | #city_csr: city-level cohort survival rate 93 | csm <- function(last_grade, this_grade, city_csr) { 94 | ratio <- unlist(this_grade)[-1]/unlist(last_grade)[-length(unlist(last_grade))] 95 | ratio[is.nan(ratio)] <- 0 96 | ratio[is.infinite(ratio)] <- 0 97 | if (sum(ratio)!=0) { 98 | temp <- model(ratio) 99 | if (temp!=0) { 100 | return(round(unlist(last_grade)[length(unlist(last_grade))]*temp)) 101 | } 102 | else { 103 | 
return(round(unlist(last_grade)[length(unlist(last_grade))]*mean(ratio[ratio!=0]))) 104 | } 105 | } 106 | else { 107 | return(round(unlist(last_grade)[length(unlist(last_grade))]*city_csr[,names(this_grade)])) 108 | } 109 | } 110 | 111 | ####Entry Level Model#### 112 | #this_grade: name of this grade 113 | #city_el: city-level enrollment 114 | el <- function(this_grade, city_el) { 115 | if (sum(this_grade)!=0) { 116 | temp <- model(unlist(this_grade)) 117 | if (temp!=0) { 118 | return(round(temp)) 119 | } 120 | else { 121 | return(round(city_el[,names(this_grade)])) 122 | } 123 | } 124 | else { 125 | return(round(city_el[,names(this_grade)])) 126 | } 127 | } 128 | 129 | ####Predict Next Year Enrollment#### 130 | #history: enrollment history 131 | #grade: the grade we want to make prediction 132 | #city_csr: city-level cohort survival rate 133 | #city_el: city-level enrollment 134 | pred_enroll <- function(history, grade, city_csr, city_el) { 135 | pred <- data.frame(matrix(NA, 1, ncol(history))) 136 | name <- colnames(history) 137 | colnames(pred) <- name 138 | pred$SchoolID <- unique(history$SchoolID) 139 | for (i in 2:(ncol(grade)-1)) { 140 | if (grade[i]!=0) { #active grade 141 | cat(paste0(name[i], "\n")) 142 | if (name[i]=="K" | name[i]=="Grade1" | name[i]=="Grade9" | name[i]=="LRE") { 143 | pred[i] <- el(history[i], city_el) #entry level model 144 | } 145 | else { 146 | if (history[nrow(history),i-1]!=0) { #check if last grade last year has enrollment 147 | pred[i] <- csm(history[i-1], history[i], city_csr) #cohort survival model 148 | } 149 | else { 150 | pred[i] <- el(history[i], city_el) #entry level model 151 | } 152 | } 153 | } 154 | else { 155 | pred[i] <- 0 156 | } 157 | } 158 | pred$ATOT <- sum(pred[2:(length(pred)-1)]) 159 | return(pred) 160 | } 161 | 162 | ####Main Function For Cohort Survival Model#### 163 | #location: data source 164 | #year: year to predict 165 | cohort_survival <- function(location, year) { 166 | #Read Data 167 | active <- 
read.csv(paste0(location, "active", year, ".csv")) #active schools and grades 168 | school1 <- read.csv(paste0(location, "true", year-1, ".csv")) #enrollment -1 year 169 | school2 <- read.csv(paste0(location, "true", year-2, ".csv")) #enrollment -2 year 170 | school3 <- read.csv(paste0(location, "true", year-3, ".csv")) #enrollment -3 year 171 | school4 <- read.csv(paste0(location, "true", year-4, ".csv")) #enrollment -4 year 172 | school5 <- read.csv(paste0(location, "true", year-5, ".csv")) #enrollment -5 year 173 | school_list <- list(school5, school4, school3, school2, school1) 174 | #Set Up 175 | all_id <- unique(active$SchoolID) 176 | all_school <- do.call("rbind", school_list) 177 | all_school$SchoolID <- NULL 178 | all_school$ATOT <- NULL 179 | #City Level Cohort Survival Rate 180 | city_csr <- data.frame(Grade2=0.98, Grade3=1.03, Grade4=0.93, Grade5=0.98, Grade6=1, Grade7=0.97, Grade8=0.98, Grade10=1.04, Grade11=0.84, Grade12=0.91) 181 | city_el <- data.frame(matrix(apply(all_school, 2, function(x) {mean(x[x!=0])}), 1, ncol(all_school))) 182 | colnames(city_el) <- colnames(all_school) 183 | #Make Prediction 184 | name <- c("SchoolID", "K", "Grade1", "Grade2", "Grade3", "Grade4", "Grade5", "Grade6", "Grade7", "Grade8", "Grade9", "Grade10", "Grade11", "Grade12", "LRE", "ATOT") 185 | result <- data.frame(matrix(NA, length(all_id), length(name))) 186 | colnames(result) <- name 187 | for (i in 1:length(all_id)) { 188 | id = all_id[i] 189 | cat(paste0(id), "\n") 190 | active_grade <- subset(active, active$SchoolID==id) 191 | history <- data.frame(matrix(NA, length(school_list), length(name))) 192 | colnames(history) <- name 193 | for (j in 1:length(school_list)) { 194 | school <- school_list[[j]] 195 | index <- which(school$SchoolID==id) 196 | if (length(index)!=0) { 197 | history[j,] <- school[index,] 198 | } 199 | else { 200 | history[j,] <- c(id, rep(0, length(name)-1)) 201 | } 202 | } 203 | result[i,] <- pred_enroll(history, active_grade, city_csr, city_el) 
204 | } 205 | write.csv(x=result, file=paste0("csm_prediction_", year, ".csv"), row.names=F) 206 | return(result) 207 | } -------------------------------------------------------------------------------- /Tool_Box/eval_grade9_prediction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ################################################ 4 | ####Evaluate Conditional Logistic Regression#### 5 | ################################################ 6 | 7 | ####CPS Team#### 8 | ####08/18/2014#### 9 | 10 | rm(list=ls()) 11 | setwd("/home/zye/Model/") 12 | year <- 2013 13 | 14 | ####Packages#### 15 | suppressMessages(library(stringr)) 16 | suppressMessages(library(reshape2)) 17 | suppressMessages(library(ggplot2)) 18 | 19 | ####Accuracy#### 20 | accuracy <- function(prob, true_school, len) { 21 | acc <- vector(length=len) 22 | temp <- t(apply(-prob[,-1], 1, rank, ties.method="random")) 23 | for (k in 1:len) { 24 | top_k <- (temp<=k)*1 25 | acc[k] <- sum(true_school*top_k)/nrow(top_k) 26 | } 27 | return(acc) 28 | } 29 | 30 | ####Evaluation#### 31 | #True Enrollment This Year 32 | load(paste0("clogit_", year, "_data_", year, ".RData")) 33 | true_school <- dcast(data_this_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected")[,-1] 34 | true_enrollment <- data.frame(SchoolKey=as.numeric(names(true_school)), Enrollment=round(colSums(true_school))) 35 | #Predictive Probability 36 | d <- read.csv(paste0("clogit_", year, "_prob_d.csv")) #distance 37 | dc <- read.csv(paste0("clogit_", year, "_prob_dc.csv")) #distance+catchment 38 | dcr <- read.csv(paste0("clogit_", year, "_prob_dcr.csv")) #distance+catchment+rating 39 | all <- read.csv(paste0("clogit_", year, "_prob_all.csv")) #all features 40 | #Predictive Enrollment 41 | d_prediction <- read.csv(paste0("clogit_", year, "_prediction_d.csv")) 42 | dc_prediction <- read.csv(paste0("clogit_", year, "_prediction_dc.csv")) 43 | dcr_prediction <- read.csv(paste0("clogit_", 
year, "_prediction_dcr.csv")) 44 | all_prediction <- read.csv(paste0("clogit_", year, "_prediction_all.csv")) 45 | baseline_prediction <- read.csv(paste0("baseline_", year, "_prediction_all.csv")) 46 | ensemble_prediction <- read.csv(paste0("ensemble_", year, "_prediction_all.csv")) 47 | #Compute Accuracy From Probability 48 | num <- 10 49 | d_acc <- data.frame(Model=rep("D", num), Accuracy=accuracy(d, true_school, num)) 50 | dc_acc <- data.frame(Model=rep("D+C", num), Accuracy=accuracy(dc, true_school, num)) 51 | dcr_acc <- data.frame(Model=rep("D+C+R", num), Accuracy=accuracy(dcr, true_school, num)) 52 | all_acc <- data.frame(Model=rep("ALL", num), Accuracy=accuracy(all, true_school, num)) 53 | acc_graph <- do.call("rbind", list(d_acc, dc_acc, dcr_acc, all_acc)) 54 | acc_graph$Rank <- factor(1:num, levels=1:num) 55 | ggplot(acc_graph, aes(x=Rank, y=Accuracy, group=Model, colour=Model))+geom_line() 56 | #Compare Enrollment 57 | d_compare <- merge(d_prediction, true_enrollment, by="SchoolKey") 58 | dc_compare <- merge(dc_prediction, true_enrollment, by="SchoolKey") 59 | dcr_compare <- merge(dcr_prediction, true_enrollment, by="SchoolKey") 60 | all_compare <- merge(all_prediction, true_enrollment, by="SchoolKey") 61 | baseline_compare <- merge(baseline_prediction, true_enrollment, by="SchoolKey") 62 | ensemble_compare <- merge(ensemble_prediction, true_enrollment, by="SchoolKey") 63 | d_mae <- mean(abs(d_compare$Enrollment.x-d_compare$Enrollment.y)) 64 | dc_mae <- mean(abs(dc_compare$Enrollment.x-dc_compare$Enrollment.y)) 65 | dcr_mae <- mean(abs(dcr_compare$Enrollment.x-dcr_compare$Enrollment.y)) 66 | all_mae <- mean(abs(all_compare$Enrollment.x-all_compare$Enrollment.y)) 67 | baseline_mae <- mean(abs(baseline_compare$Enrollment.x-baseline_compare$Enrollment.y)) 68 | ensemble_mae <- mean(abs(ensemble_compare$Enrollment.x-ensemble_compare$Enrollment.y)) 69 | mae_graph <- data.frame(Model=factor(c("clogit", "baseline", "ensemble"), levels=c("clogit", "baseline", 
"ensemble")), MAE=c(all_mae, baseline_mae, ensemble_mae)) 70 | ggplot(mae_graph, aes(x=Model, y=MAE))+geom_bar(stat="identity", fill="midnightblue") 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /Tool_Box/eval_new_grade9_prediction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ########################### 4 | ###Evaluation New School### 5 | ########################### 6 | 7 | ###Zhou Ye### 8 | ###08/06/2014### 9 | 10 | rm(list=ls()) 11 | setwd("/home/zye/Model") 12 | year <- 2013 13 | suppressMessages(library(stringr)) 14 | suppressMessages(library(reshape2)) 15 | suppressMessages(library(ggplot2)) 16 | result <- read.csv(paste0("clogit_", year, "_prob_all.csv")) 17 | load(paste0("clogit_", year, "_data_", year-1, ".RData")) 18 | load(paste0("clogit_", year, "_data_", year, ".RData")) 19 | new_school <- setdiff(unique(data_this_year$SchoolKey), unique(data_last_year$SchoolKey)) 20 | 21 | ###Reshape### 22 | reshape_enroll <- function(data) { 23 | school_key <- as.numeric(str_replace(names(data), "X", "")) 24 | enroll <- as.numeric(data) 25 | school_enroll <- data.frame(SchoolKey=school_key, Enrollment=enroll) 26 | return(school_enroll) 27 | } 28 | 29 | ###Distance### 30 | rdist_earth_vec <- function(x1, x2, miles=T, R=NULL) { 31 | #modified from fields::rdist.earth 32 | if (is.null(R)) { 33 | if (miles) { 34 | R <- 3963.34 35 | } 36 | else { 37 | R <- 6378.388 38 | } 39 | } 40 | coslat1 <- cos((x1[,2]*pi)/180) 41 | sinlat1 <- sin((x1[,2]*pi)/180) 42 | coslon1 <- cos((x1[,1]*pi)/180) 43 | sinlon1 <- sin((x1[,1]*pi)/180) 44 | coslat2 <- cos((x2[,2]*pi)/180) 45 | sinlat2 <- sin((x2[,2]*pi)/180) 46 | coslon2 <- cos((x2[,1]*pi)/180) 47 | sinlon2 <- sin((x2[,1]*pi)/180) 48 | pp <- cbind(coslat1*coslon1, coslat1*sinlon1, sinlat1)*cbind(coslat2*coslon2, coslat2*sinlon2, sinlat2) 49 | pp <- rowSums(pp) 50 | return(R*acos(ifelse(abs(pp)>1, 
1*sign(pp), pp))) 51 | } 52 | 53 | ###Prediction### 54 | prediction <- round(colSums(result[,-1])) 55 | pred_enroll <- reshape_enroll(prediction) 56 | 57 | ###True Enrollment### 58 | real <- dcast(data_this_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected") 59 | true_enroll <- reshape_enroll(colSums(real[,-1])) 60 | 61 | ###School Location### 62 | school_location <- unique(with(data_this_year, data.frame(SchoolKey, LongitudeHS, LatitudeHS))) 63 | old_school_location <- subset(school_location, !SchoolKey%in%new_school) 64 | old_school_location$Distance <- NA 65 | 66 | ###Baseline### 67 | last_year <- dcast(data_last_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected") 68 | last_enroll <- reshape_enroll(colSums(last_year[,-1])) 69 | #Last Year Mean 70 | baseline1 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 71 | for (i in 1:nrow(baseline1)) { 72 | index <- which(last_enroll$SchoolKey==baseline1[i,"SchoolKey"]) 73 | if (length(index)>0) { 74 | baseline1[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 75 | } 76 | } 77 | baseline1[is.na(baseline1$Enrollment),"Enrollment"] <- round(mean(baseline1$Enrollment, na.rm=T)) 78 | #Last Year Median 79 | baseline2 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 80 | for (i in 1:nrow(baseline1)) { 81 | index <- which(last_enroll$SchoolKey==baseline2[i,"SchoolKey"]) 82 | if (length(index)>0) { 83 | baseline2[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 84 | } 85 | } 86 | baseline2[is.na(baseline2$Enrollment),"Enrollment"] <- round(median(baseline2$Enrollment, na.rm=T)) 87 | #KNN Mean 88 | k <- 5 89 | baseline3 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 90 | for (i in 1:nrow(baseline3)) { 91 | index <- which(last_enroll$SchoolKey==baseline3[i,"SchoolKey"]) 92 | if (length(index)>0) { 93 | baseline3[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 94 | } 95 | else { 96 | self_location <- subset(school_location, SchoolKey==baseline3[i,"SchoolKey"]) 97 | 
self_coordinate <- with(self_location, t(replicate(nrow(old_school_location), c(LongitudeHS, LatitudeHS)))) 98 | old_school_location <- within(old_school_location, { 99 | Distance <- rdist_earth_vec(cbind(LongitudeHS, LatitudeHS), self_coordinate) 100 | }) 101 | close_school_location <- old_school_location[order(old_school_location$Distance),][1:k,] 102 | baseline3[i,"Enrollment"] <- round(mean(subset(last_enroll, SchoolKey%in%close_school_location$SchoolKey)$Enrollment)) 103 | } 104 | } 105 | #KNN Median 106 | k <- 5 107 | baseline4 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 108 | for (i in 1:nrow(baseline4)) { 109 | index <- which(last_enroll$SchoolKey==baseline4[i,"SchoolKey"]) 110 | if (length(index)>0) { 111 | baseline4[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 112 | } 113 | else { 114 | self_location <- subset(school_location, SchoolKey==baseline3[i,"SchoolKey"]) 115 | self_coordinate <- with(self_location, t(replicate(nrow(old_school_location), c(LongitudeHS, LatitudeHS)))) 116 | old_school_location <- within(old_school_location, { 117 | Distance <- rdist_earth_vec(cbind(LongitudeHS, LatitudeHS), self_coordinate) 118 | }) 119 | close_school_location <- old_school_location[order(old_school_location$Distance),][1:k,] 120 | baseline4[i,"Enrollment"] <- round(median(subset(last_enroll, SchoolKey%in%close_school_location$SchoolKey)$Enrollment)) 121 | } 122 | } 123 | 124 | ###Evaluation### 125 | pred <- subset(merge(pred_enroll, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 126 | base1 <- subset(merge(baseline1, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 127 | base2 <- subset(merge(baseline2, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 128 | base3 <- subset(merge(baseline3, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 129 | base4 <- subset(merge(baseline4, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 130 | pred_mae <- mean(abs(pred$Enrollment.x-pred$Enrollment.y)) 131 | base1_mae <- 
mean(abs(base1$Enrollment.x-base1$Enrollment.y)) 132 | base2_mae <- mean(abs(base2$Enrollment.x-base2$Enrollment.y)) 133 | base3_mae <- mean(abs(base3$Enrollment.x-base3$Enrollment.y)) 134 | base4_mae <- mean(abs(base4$Enrollment.x-base4$Enrollment.y)) 135 | mae_graph <- data.frame(Model=factor(c("baseline1", "baseline2", "baseline3", "baseline4", "clogit"), levels=c("baseline1", "baseline2", "baseline3", "baseline4", "clogit")), MAE=c(base1_mae, base2_mae, base3_mae, base4_mae, pred_mae)) 136 | ggplot(mae_graph, aes(x=Model, y=MAE))+geom_bar(stat="identity", fill="midnightblue") 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /Tool_Box/grade9_prediction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ########################## 4 | ####Grade 9 Prediction#### 5 | ########################## 6 | 7 | ####CPS Team#### 8 | ####08/17/2014#### 9 | 10 | ####Packages#### 11 | suppressMessages(library(dplyr)) 12 | suppressMessages(library(fields)) 13 | suppressMessages(library(reshape2)) 14 | suppressMessages(library(clogitL1)) 15 | suppressMessages(library(survival)) 16 | suppressMessages(library(stringr)) 17 | 18 | ####Compute Distance on Map into a Vector#### 19 | #modified from fields::rdist.earth (see rdist.earth for more details) 20 | #now x1 and x2 can be vectors 21 | rdist_earth_vec <- function(x1, x2, miles=T, R=NULL) { 22 | if (is.null(R)) { 23 | if (miles) { 24 | R <- 3963.34 25 | } 26 | else { 27 | R <- 6378.388 28 | } 29 | } 30 | coslat1 <- cos((x1[,2]*pi)/180) 31 | sinlat1 <- sin((x1[,2]*pi)/180) 32 | coslon1 <- cos((x1[,1]*pi)/180) 33 | sinlon1 <- sin((x1[,1]*pi)/180) 34 | coslat2 <- cos((x2[,2]*pi)/180) 35 | sinlat2 <- sin((x2[,2]*pi)/180) 36 | coslon2 <- cos((x2[,1]*pi)/180) 37 | sinlon2 <- sin((x2[,1]*pi)/180) 38 | pp <- cbind(coslat1*coslon1, coslat1*sinlon1, sinlat1)*cbind(coslat2*coslon2, coslat2*sinlon2, sinlat2) 39 | pp <- 
rowSums(pp) 40 | return(R*acos(ifelse(abs(pp)>1, 1*sign(pp), pp))) 41 | } 42 | 43 | ###Construct Data Set for Conditional Logistic Regression### 44 | #location: the path to the data 45 | #year: the year we want make prediction 46 | #num: Inf for all schools; otherwise only "num" schools are used 47 | construct_data <- function(location, year, num) { 48 | #Process Data 49 | students <- read.csv(paste0(location, "students", year, ".csv"), stringsAsFactors=F) #student information 50 | students <- subset(students, !(EducationType%in%c("21 - 60%", "61 - 100%"))) #exclude LRE2/LRE3 students 51 | names(students)[names(students)=="Longitude"] <- "LongitudeHome" 52 | names(students)[names(students)=="Latitude"] <- "LatitudeHome" 53 | schools_static <- read.csv(paste0(location, "schools_static_features.csv"), stringsAsFactors=F) #school static information 54 | schools_progress <- read.csv(paste0(location, "schools_changing_features_", year, ".csv"), stringsAsFactors=F) #school progress feature 55 | schools_aggregate <- read.csv(paste0(location, "schools_aggregate_features_", year, ".csv"), stringsAsFactors=F) #school aggregate feature 56 | schools_change <- merge(schools_progress, schools_aggregate, by="SchoolID", all.x=T) #school changing feature 57 | schools_change$SchoolKey <- NULL 58 | schools_change[is.na(schools_change$White),]$White <- median(schools_change$White, na.rm=T) #impute race 59 | schools_change[is.na(schools_change$Black),]$Black <- median(schools_change$Black, na.rm=T) #impute race 60 | schools_change[is.na(schools_change$Hispanic),]$Hispanic <- median(schools_change$Hispanic, na.rm=T) #impute racex 61 | schools_change$Other <- pmax(0, 100-rowSums(schools_change[,c("White", "Black", "Hispanic")])) #other race 62 | enroll <- read.csv(paste0(location, "true", year-1, ".csv"), stringsAsFactors=F) 63 | enroll9 <- subset(enroll, select=c("SchoolID", "Grade9")) 64 | colnames(enroll9) <- c("SchoolID", "PreviousEnrollment") 65 | cat("Finish Reading Data...\n") 66 
#Inner Join to Obtain Final Student and School Data
schools <- subset(schools_static, select=c(SchoolKey, SchoolID, SchoolType, SchoolSubType, SchoolStyle, Longitude, Latitude))
colnames(schools) <- c("SchoolKey", "SchoolIDES", "SchoolTypeES", "SchoolSubTypeES", "SchoolStyleES", "LongitudeES", "LatitudeES") #elementary schools
students_schools <- merge(students, schools, all.x=TRUE, by.x="ThisGradeSchoolKey", by.y="SchoolKey")
students_final <- students_schools #final students data
colnames(schools) <- c("SchoolKey", "SchoolIDHS", "SchoolTypeHS", "SchoolSubTypeHS", "SchoolStyleHS", "LongitudeHS", "LatitudeHS") #high schools
students_schools <- merge(students_schools, schools, all.x=TRUE, by.x="NextGradeSchoolKey", by.y="SchoolKey")
students_schools$DistanceHomeHS <- with(students_schools, rdist_earth_vec(cbind(LongitudeHome, LatitudeHome), cbind(LongitudeHS, LatitudeHS)))
#candidate high schools: keep only schools with at least two students at a known distance
possible_schools <- tbl_df(students_schools) %>%
  group_by(NextGradeSchoolKey) %>%
  summarize(n=n(), AvgDist=mean(DistanceHomeHS, na.rm=TRUE), n_dist=sum(!is.na(DistanceHomeHS))) %>%
  filter(n_dist>1) %>%
  arrange(desc(n_dist)) #remove NA distance schools
if (is.finite(num)) {
  possible_schools <- possible_schools[1:num,] #sub-sampling
}
students_final <- subset(students_final, NextGradeSchoolKey%in%possible_schools$NextGradeSchoolKey)
schools_final <- subset(schools, SchoolKey%in%possible_schools$NextGradeSchoolKey)
schools_final <- merge(schools_final, enroll9, by.x="SchoolIDHS", by.y="SchoolID", all.x=TRUE)
schools_final <- merge(schools_final, schools_change, by.x="SchoolIDHS", by.y="SchoolID", all.x=TRUE)
schools_final <- within(schools_final, {
  PreviousEnrollment[is.na(PreviousEnrollment)] <- 0
  NewSchool <- as.numeric(PreviousEnrollment == 0) #schools with no prior enrollment on record
})
#Outer Join (cross join, by=NULL) to Obtain Data for Conditional Logistic Regression:
#one row per (student, candidate school) pair
students_schools_final <- merge(unique(students_final), unique(schools_final), by=NULL, all=TRUE)
cat("Finish Constructing Table...\n")
#Feature Computation and Missing Value
#1. Distance (median-imputed). BUG FIX: the original `df[is.na(...),]$col <- v`
#form errors with "replacement has 1 row, data has 0" when no rows are NA;
#index the column vector directly instead.
students_schools_final$DistanceHomeHS <- with(students_schools_final, rdist_earth_vec(cbind(LongitudeHome, LatitudeHome), cbind(LongitudeHS, LatitudeHS)))
median_distance <- median(students_schools_final$DistanceHomeHS, na.rm=TRUE)
students_schools_final$DistanceHomeHS[is.na(students_schools_final$DistanceHomeHS)] <- median_distance
students_schools_final$DistanceESHS <- with(students_schools_final, rdist_earth_vec(cbind(LongitudeES, LatitudeES), cbind(LongitudeHS, LatitudeHS)))
median_distance <- median(students_schools_final$DistanceESHS, na.rm=TRUE)
students_schools_final$DistanceESHS[is.na(students_schools_final$DistanceESHS)] <- median_distance
#2. School Feature
students_schools_final <- within(students_schools_final, {
  SchoolTypeES[is.na(SchoolTypeES)] <- "Unknown"
  SchoolSubTypeES[is.na(SchoolSubTypeES)] <- "Unknown"
  SchoolStyleES[is.na(SchoolStyleES)] <- "Unknown"
  SchoolTypeHS[is.na(SchoolTypeHS)] <- "Unknown"
  SchoolSubTypeHS[is.na(SchoolSubTypeHS)] <- "Unknown"
  SchoolStyleHS[is.na(SchoolStyleHS)] <- "Unknown"
  SchoolTypeMatch <- as.numeric(SchoolTypeES==SchoolTypeHS)
  SchoolSubTypeMatch <- as.numeric(SchoolSubTypeES==SchoolSubTypeHS)
  SchoolStyleMatch <- as.numeric(SchoolStyleES==SchoolStyleHS)
  SchoolTypeHS <- factor(SchoolTypeHS)
  SchoolSubTypeHS <- factor(SchoolSubTypeHS)
  SchoolStyleHS <- factor(SchoolStyleHS)
  SelectiveEnrollment <- as.numeric(SchoolStyleHS=="Selective Enrollment")
  #lump rare categories into "Other" before converting to factors
  Race[Race %in% c("American Indian", "Hawaiian or Pacific Islander", "Multi", "N/A")] <- "Other"
  Race <- factor(Race)
  Language[!(Language %in% c("English", "Spanish", "Polish"))] <- "Other"
  Language <- factor(Language)
  BirthCountry[!(BirthCountry %in% c("United States", "Mexico", "China"))] <- "Other"
  BirthCountry <- factor(BirthCountry)
  Food <- factor(Food)
  ESL[is.na(ESL)] <- "Unknown"
  ESL <- factor(ESL)
  SchoolChange[is.na(SchoolChange)] <- 0
  SchoolGPA[is.na(SchoolGPA)] <- median(SchoolGPA, na.rm=TRUE)
})
#3. ISAT (average of 8th-grade math and reading; median-imputed)
students_schools_final$AvgScore <- (students_schools_final$EighthMathISAT+students_schools_final$EighthReadingISAT)/2
median_ISAT <- median(students_schools_final$AvgScore, na.rm=TRUE)
students_schools_final$AvgScore[is.na(students_schools_final$AvgScore)] <- median_ISAT
#4. Progress Report
students_schools_final <- within(students_schools_final, {
  Rating[is.na(Rating)] <- "Unknown"
  Safety[is.na(Safety)] <- "Unknown"
  Probation[is.na(Probation)] <- "Unknown"
  Rating <- factor(Rating)
  RatingLevel1 <- as.numeric(Rating=="level 1")
  Safety <- factor(Safety)
  Probation <- factor(Probation)
  Mobility[is.na(Mobility)] <- median(Mobility, na.rm=TRUE)
})
#5. Other Features
students_schools_final <- within(students_schools_final, {
  SchoolSelected <- as.numeric(NextGradeSchoolKey==SchoolKey) #response: did the student pick this school?
  SameAsPreviousSchool <- ifelse(is.na(ThisGradeSchoolKey), 0, as.numeric(ThisGradeSchoolKey==SchoolKey))
  CatchmentSchool <- as.numeric(CatchmentSchoolKey==SchoolKey)
  #NOTE(review): SchoolGPA was already imputed in step 2, so this line is a
  #no-op as written; kept for fidelity — confirm whether median(GPA) was intended
  SchoolGPA[is.na(SchoolGPA)] <- median(GPA, na.rm=TRUE)
  SchoolAttendanceRate[is.na(SchoolAttendanceRate)] <- median(SchoolAttendanceRate, na.rm=TRUE)
  AttendanceRate[is.na(AttendanceRate)] <- median(AttendanceRate, na.rm=TRUE)
  #share of the candidate school's students matching this student's demographics
  PercentSameRace <- NA
  PercentSameRace[Race=="Black, Non-Hispanic"] <- Black[Race=="Black, Non-Hispanic"]
  PercentSameRace[Race=="Hispanic"] <- Hispanic[Race=="Hispanic"]
  #BUG FIX: the original indexed White by the character string
  #(White["White, Non-Hispanic"]), which yields NA; use the logical mask
  PercentSameRace[Race=="White, Non-Hispanic"] <- White[Race=="White, Non-Hispanic"]
  PercentSameRace[is.na(PercentSameRace)] <- Other[is.na(PercentSameRace)]
  PercentSameFood <- NA
  PercentSameFood[Food=="Yes"] <- SchoolFood[Food=="Yes"]
  PercentSameFood[Food!="Yes"] <- 1-SchoolFood[Food!="Yes"]
  PercentSameESL <- NA
  PercentSameESL[ESL=="Yes"] <- SchoolESL[ESL=="Yes"]
  PercentSameESL[ESL!="Yes"] <- 1-SchoolESL[ESL!="Yes"]
  PercentSameMale <- NA
  PercentSameMale[Gender=="Male"] <- SchoolMale[Gender=="Male"]
  PercentSameMale[Gender!="Male"] <- 1-SchoolMale[Gender!="Male"]
  PercentSameHomeless <- NA
  PercentSameHomeless[Homeless=="Yes"] <- SchoolHomeless[Homeless=="Yes"]
  PercentSameHomeless[Homeless!="Yes"] <- 1-SchoolHomeless[Homeless!="Yes"]
})
cat("Finish Data Cleaning...\n")
return(students_schools_final)
}

####Make Prediction####
#Convert the linear predictor from conditional logistic regression into
#per-student choice probabilities.
#prediction: the prediction value (linear predictor) from conditional logistic regression
#test_data: data for test (one row per student-school pair)
#returns a wide matrix: one row per StudentKey, one column per SchoolKey,
#each row summing to 1 (softmax over the student's choice set)
make_prediction <- function(prediction, test_data) {
  test <- data.frame(StudentKey=test_data$StudentKey,
                     SchoolKey=test_data$SchoolKey,
                     Pred=exp(as.numeric(prediction)))
  pred_mat <- dcast(test, StudentKey~SchoolKey, mean, value.var="Pred")
  pred_mat[,-1] <- pred_mat[,-1]/rowSums(pred_mat[,-1]) #normalize within each student
  return(pred_mat)
}

####Reshape Probability Into Enrollment####
#Sum each school's choice probabilities over students to get expected enrollment.
#prob: probability matrix from conditional logistic regression (col names are "X<SchoolKey>")
reshape_enroll <- function(prob) {
  data <- round(colSums(prob[,-1]))
  school_key <- as.numeric(str_replace(names(data), "X", "")) #strip dcast's "X" prefix
  enroll <- as.numeric(data)
  school_enroll <- data.frame(SchoolKey=school_key, Enrollment=enroll)
  return(school_enroll)
}

####Train Conditional Logistic Regression####
#Fit an L1-penalized conditional logit on last year's choices and predict
#this year's choice probabilities.
#regression_formula: model formula as a string (response SchoolSelected)
#data_last_year: training student-school pairs; data_this_year: pairs to score
clogit <- function(regression_formula, data_last_year, data_this_year) {
  cat(paste0("Formula: ", regression_formula, "\n"))
  feature <- model.matrix(as.formula(regression_formula), data_last_year)[,-1]
  feature_test <- model.matrix(as.formula(regression_formula), data_this_year)[,-1]
  #make sure the features are matched between train and test
  feature <- feature[, colnames(feature)%in%colnames(feature_test), drop=FALSE]
  feature <- feature[, apply(feature, 2, sd)>0, drop=FALSE] #drop constant columns
  #BUG FIX: the test matrix must be restricted to the surviving training
  #columns (same set, same order) before scaling, otherwise scale() and the
  #matrix product below fail or silently misalign when the column sets differ
  feature_test <- feature_test[, colnames(feature), drop=FALSE]
  feature_means <- colMeans(feature)
  feature_sds <- apply(feature, 2, sd)
  feature <- scale(feature, center=feature_means, scale=feature_sds)
  feature_test <- scale(feature_test, center=feature_means, scale=feature_sds)
  label <- data_last_year$SchoolSelected
  start <- proc.time()
  model <- clogitL1(x=feature, y=label, strata=data_last_year$StudentKey, alpha=1)
  model_cv <- cv.clogitL1(model, numFolds=5)
  end <- proc.time()
  time <- end-start
  cat(paste0("Training Takes ", as.numeric(time["elapsed"])/3600, " Hours...\n"))
  #Predict: use the CV-selected coefficients; fall back to the 1-SE rule if NaN
  coefficient <- summary(model_cv)$beta_minCV
  if (is.nan(sum(coefficient))) {
    coefficient <- summary(model_cv)$beta_minCV1se
    if (is.nan(sum(coefficient))) {
      cat("We Cannot Make Prediction!\n")
      return()
    }
  }
  prediction <- feature_test%*%coefficient
  prob <- make_prediction(prediction, data_this_year)
  return(prob)
}

####Construct Baseline: Copy Last Year Enrollment####
#For each school enrolled this year, copy last year's enrollment; schools with
#no history (new schools) split the remaining students evenly.
#last_enroll/this_enroll: data.frames with SchoolKey, Enrollment
#new_school: new school keys (kept for interface compatibility; newness is
#detected directly from missing history)
copy_last_year <- function(last_enroll, this_enroll, new_school) {
  baseline <- data.frame(SchoolKey=this_enroll$SchoolKey, Enrollment=NA)
  #vectorized lookup replaces the original per-row which() loop
  baseline$Enrollment <- last_enroll$Enrollment[match(baseline$SchoolKey, last_enroll$SchoolKey)]
  missing <- is.na(baseline$Enrollment)
  if (any(missing)) {
    leftover <- sum(this_enroll$Enrollment)-sum(baseline$Enrollment, na.rm=TRUE)
    baseline$Enrollment[missing] <- max(0, round(leftover/sum(missing)))
  }
  return(baseline)
}

####Conditional Logistic Regression####
#Top-level driver: fits four nested conditional-logit models, a copy-last-year
#baseline, and an ensemble, then writes all predictions to CSV.
#location: data source directory
#year: the year whose grade-9 enrollment is predicted (trained on year-1)
grade9_prediction <- function(location, year) {
  cat("Process Last Year Data...\n")
  data_last_year <- construct_data(location, year-1, Inf)
  cat("Process This Year Data...\n")
  data_this_year <- construct_data(location, year, Inf)
  #Construct Design Matrix
  cat("Creating Model Matrices...\n")
  #Train/Predict
  #1. Distance
  regression_formula <- "SchoolSelected~ns(DistanceHomeHS,2)"
  prob_d <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_d <- reshape_enroll(prob_d)
  #2. Distance+Catchment
  regression_formula <- "SchoolSelected~ns(DistanceHomeHS,2)+CatchmentSchool"
  prob_dc <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_dc <- reshape_enroll(prob_dc)
  #3. Distance+Catchment+Rating
  regression_formula <- "SchoolSelected~Rating*ns(DistanceHomeHS,2)+CatchmentSchool"
  prob_dcr <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_dcr <- reshape_enroll(prob_dcr)
  #4. All
  regression_formula <- "SchoolSelected~Rating*ns(DistanceHomeHS,2)+ns(PercentSameRace,2)+ns(PercentSameESL,2)+ns(PercentSameMale,2)+ns(Mobility,2)+ns(SchoolAttendanceRate,2)+CatchmentSchool+SchoolTypeMatch+PreviousEnrollment"
  prob_all <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_all <- reshape_enroll(prob_all)
  cat("Finish Conditional Logit Prediction ^_^Y\n")
  #Baseline
  new_school <- setdiff(unique(data_this_year$SchoolKey), unique(data_last_year$SchoolKey))
  this_year <- dcast(data_this_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected")[,-1]
  this_enroll <- data.frame(SchoolKey=as.numeric(names(this_year)), Enrollment=round(colSums(this_year)))
  last_year <- dcast(data_last_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected")[,-1]
  last_enroll <- data.frame(SchoolKey=as.numeric(names(last_year)), Enrollment=round(colSums(last_year)))
  baseline <- copy_last_year(last_enroll, this_enroll, new_school)
  cat("Finish Baseline Prediction ^_^Y\n")
  #Ensemble: clogit predictions for new schools, baseline shares (rescaled to
  #this year's remaining total) for existing schools
  total_new_this_year <- with(enrollment_all, sum(Enrollment[SchoolKey%in%new_school]))
  total_old_this_year <- sum(this_enroll$Enrollment)-total_new_this_year
  proportion <- with(baseline, {Enrollment[!SchoolKey%in%new_school]/sum(Enrollment[!SchoolKey%in%new_school])})
  ensemble <- within(enrollment_all, {Enrollment[!SchoolKey%in%new_school] <- round(total_old_this_year*proportion)})
  cat("Finish Ensemble Prediction ^_^Y\n")
  #Save Files
  save(data_last_year, file=paste0("clogit_", year, "_data_", year-1, ".RData")) #last year data
  save(data_this_year, file=paste0("clogit_", year, "_data_", year, ".RData")) #this year data
  write.csv(x=prob_d, file=paste0("clogit_", year, "_prob_d.csv"), row.names=FALSE) #distance
  write.csv(x=prob_dc, file=paste0("clogit_", year, "_prob_dc.csv"), row.names=FALSE) #distance+catchment
  write.csv(x=prob_dcr, file=paste0("clogit_", year, "_prob_dcr.csv"), row.names=FALSE) #distance+catchment+rating
  write.csv(x=prob_all, file=paste0("clogit_", year, "_prob_all.csv"), row.names=FALSE) #all features
  write.csv(x=enrollment_d, file=paste0("clogit_", year, "_prediction_d.csv"), row.names=FALSE) #distance
  write.csv(x=enrollment_dc, file=paste0("clogit_", year, "_prediction_dc.csv"), row.names=FALSE) #distance+catchment
  write.csv(x=enrollment_dcr, file=paste0("clogit_", year, "_prediction_dcr.csv"), row.names=FALSE) #distance+catchment+rating
  write.csv(x=enrollment_all, file=paste0("clogit_", year, "_prediction_all.csv"), row.names=FALSE) #all features
  write.csv(x=baseline, file=paste0("baseline_", year, "_prediction_all.csv"), row.names=FALSE) #baseline
  write.csv(x=ensemble, file=paste0("ensemble_", year, "_prediction_all.csv"), row.names=FALSE) #final model
  cat("Finish Saving Files ^_^Y\n")
  return(prob_all)
}
-------------------------------------------------------------------------------- /Tool_Box/main.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ################################# 4 | ####Main Script For Modelling#### 5 | ################################# 6 | 7 | ####CPS Team#### 8 | ####08/17/2014#### 9 | 10 | rm(list=ls()) 11 | setwd("/home/zye/Model/") 12 | 13 | ####Cohort Survival Model#### 14 | source("cohort_survival.R") 15 | location <- "/mnt/data/cps/Data/" 16 | year_to_predict <- 2013 17 | result <- cohort_survival(location, year_to_predict) 18 | 19 | ####Model Diagnostics#### 20 | source("model_diagnostics.R") 21 | location <- "/mnt/data/cps/Data/" 22 | year_to_train <- 2012 23 | threshold <- 50 24 | grade <- "Grade9" 25 | model <- diagnostic(location, year, threshold, grade) 26 | year_to_predict <- 2013 27 | result <- prediction(location, year, threshold, grade, model) 28 | 29 | ####Conditional Logistic Regression#### 30 | source("grade9_prediction.R") 31 | location <- "/mnt/data/cps/Data/" 32 | year_to_predict <- 2012 33 | result <- grade9_prediction(location, year_to_predict) 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /Tool_Box/model_diagnostics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ######################### 4 | ####Error Diagnostics#### 5 | ######################### 6 | 7 | ####CPS Team#### 8 | ####08/19/2014#### 9 | 10 | ####Packages#### 11 | suppressMessages(library(rpart)) 12 | suppressMessages(library(rpart.plot)) 13 | 14 | ####Construct Data for Modeling#### 15 | #prediction: cohort survival prediction 16 | #enrollment: true enrollment data 17 | #changing_feature: school features changing every year 18 | #static_feature: school features not changing every year 19 | #aggregate_feature: school features aggregated from the database 20 | 
#grade: the grade we want to classify
#Builds one row per predicted school: joins the cohort-survival prediction
#with last year's enrollment and all school feature tables, then imputes
#missing values (0 for enrollment, explicit NA level for categoricals,
#median for numerics).
construct_data <- function(prediction, enrollment, changing_feature, static_feature, aggregate_feature, grade) {
  pred <- data.frame(SchoolID=prediction$SchoolID, Prediction=prediction[,grade])
  enroll <- data.frame(SchoolID=enrollment$SchoolID, PreviousEnrollment=enrollment[,grade])
  group <- with(static_feature, data.frame(SchoolID, SchoolType, SchoolStyle, SchoolGradeGroup))
  #left joins keep every school we have a prediction for
  data <- merge(pred, enroll, by="SchoolID", all.x=TRUE)
  data <- merge(data, group, by="SchoolID", all.x=TRUE)
  data <- merge(data, changing_feature, by="SchoolID", all.x=TRUE)
  data <- merge(data, aggregate_feature, by="SchoolID", all.x=TRUE)
  data <- within(data, {
    PreviousEnrollment[is.na(PreviousEnrollment)] <- 0 #school absent last year => 0
    #exclude=NULL keeps NA as an explicit factor level
    SchoolType <- factor(SchoolType, exclude=NULL)
    SchoolStyle <- factor(SchoolStyle, exclude=NULL)
    SchoolGradeGroup <- factor(SchoolGradeGroup, exclude=NULL)
    White[is.na(White)] <- median(White, na.rm=TRUE)
    Black[is.na(Black)] <- median(Black, na.rm=TRUE)
    Hispanic[is.na(Hispanic)] <- median(Hispanic, na.rm=TRUE)
    AUSL <- factor(AUSL, exclude=NULL)
    Mobility[is.na(Mobility)] <- median(Mobility, na.rm=TRUE)
    Rating <- factor(Rating, exclude=NULL)
    Probation <- factor(Probation, exclude=NULL)
    Health <- factor(Health, exclude=NULL)
    Safety <- factor(Safety, exclude=NULL)
    Family <- factor(Family, exclude=NULL)
    Environment <- factor(Environment, exclude=NULL)
    Instruction <- factor(Instruction, exclude=NULL)
    Leader <- factor(Leader, exclude=NULL)
    Teacher <- factor(Teacher, exclude=NULL)
    SchoolFood[is.na(SchoolFood)] <- median(SchoolFood, na.rm=TRUE)
    SchoolESL[is.na(SchoolESL)] <- median(SchoolESL, na.rm=TRUE)
    SchoolMale[is.na(SchoolMale)] <- median(SchoolMale, na.rm=TRUE)
    SchoolHomeless[is.na(SchoolHomeless)] <- median(SchoolHomeless, na.rm=TRUE)
    SchoolGPA[is.na(SchoolGPA)] <- median(SchoolGPA, na.rm=TRUE)
    SchoolAttendanceRate[is.na(SchoolAttendanceRate)] <- median(SchoolAttendanceRate, na.rm=TRUE)
  })
  return(data)
}

####Train Decision Tree Model####
#Labels each school 1 if the cohort-survival projection missed the true
#enrollment by >= threshold, then fits a decision tree to explain which
#school features predict misprojection.
#location: data source
#year: year to train
#threshold: >= threshold -> 1; otherwise 0
#grade: the grade column to model
#returns the fitted rpart model (also plotted via prp), or NULL if every
#school falls on one side of the threshold
diagnostic <- function(location, year, threshold, grade) {
  prediction_train <- read.csv(paste0(location, "csp", year, ".csv"))
  enrollment_train <- read.csv(paste0(location, "true", year-1, ".csv"))
  change_train <- read.csv(paste0(location, "schools_changing_features_", year, ".csv"), stringsAsFactors=FALSE)
  aggregate_train <- read.csv(paste0(location, "schools_aggregate_features_", year, ".csv"), stringsAsFactors=FALSE)
  static_train <- read.csv(paste0(location, "schools_static_features.csv"), stringsAsFactors=FALSE)
  true_enrollment <- read.csv(paste0(location, "true", year, ".csv"))
  data_train_temp <- construct_data(prediction_train, enrollment_train, change_train, static_train, aggregate_train, grade)
  enroll <- data.frame(SchoolID=true_enrollment$SchoolID, CurrentEnrollment=true_enrollment[,grade])
  data_train <- merge(data_train_temp, enroll, by="SchoolID") #inner join: need the truth
  data_train <- within(data_train, {
    Label <- rep(0, nrow(data_train))
    Label[abs(CurrentEnrollment-Prediction)>=threshold] <- 1 #1 = misprojected school
    Label <- factor(Label)
  })
  #a single-class label cannot be modeled
  if (length(unique(data_train$Label))==1) {
    cat("Threshold is not appropriate!\n")
    return()
  }
  model <- rpart(formula=Label~Prediction+PreviousEnrollment+
                   SchoolType+SchoolStyle+SchoolGradeGroup+White+
                   Black+Hispanic+Mobility+Rating+Probation+
                   SchoolESL+SchoolMale+SchoolAttendanceRate,
                 data=data_train)
  print(prp(x=model, type=3)) #visualize the fitted tree
  return(model)
}

####Make Prediction####
#Scores this year's schools with the trained tree and writes each school's
#probability of being misprojected to misprojection_<year>.csv.
#location: data source
#year: year to test
#threshold: >= threshold -> 1; otherwise 0 (kept for interface symmetry)
#model: decision tree model from diagnostic()
prediction <- function(location, year, threshold, grade, model) {
  prediction_test <- read.csv(paste0(location, "csp", year, ".csv"))
  enrollment_test <- read.csv(paste0(location, "true", year-1, ".csv"))
  change_test <- read.csv(paste0(location, "schools_changing_features_", year, ".csv"), stringsAsFactors=FALSE)
  aggregate_test <- read.csv(paste0(location, "schools_aggregate_features_", year, ".csv"), stringsAsFactors=FALSE)
  static_test <- read.csv(paste0(location, "schools_static_features.csv"), stringsAsFactors=FALSE)
  data_test <- construct_data(prediction_test, enrollment_test, change_test, static_test, aggregate_test, grade)
  result <- as.data.frame(predict(object=model, newdata=data_test, type="prob"))
  output <- data.frame(SchoolID=data_test$SchoolID, Probability=result[,"1"]) #P(misprojection)
  write.csv(x=output, file=paste0("misprojection_", year, ".csv"), row.names=FALSE)
  return(output)
}

--------------------------------------------------------------------------------