├── .gitignore ├── Data_Analysis ├── CPS_JAGS.R ├── CPS_gradebygrade_dataprep.R ├── CPS_gradebygrade_modelingexploration.R ├── CPS_gradebygrade_script.R ├── DeepDive1EDA_Counts.Rmd ├── DeepDive1EDA_Counts.html ├── README.md ├── dataprep_geocodeschools.R ├── exploratory_data_analysis.R ├── explore_8_to_9_transition.R ├── explore_enroll_totals_from_website.Rmd ├── explore_enroll_totals_from_website.html ├── grade9_transition_matrix.R ├── history_evaluation.R ├── investigate_9thgrade.R ├── investigate_juarez_change.R ├── map_neighborhood_catchment.R ├── prepare_data_and_shapefile_for_tableau.R └── topdown_FINAL.R ├── Data_Pipeline ├── README.md ├── create_features_table.R ├── create_schools_table.R ├── create_students_table.R ├── geocode_addresses.R ├── school_aggregate_features_vw.sql ├── student_ISAT_scores_vw.sql ├── student_attendance_vw.sql └── student_table_with_ninth_snapshot_vw.sql ├── README.md └── Tool_Box ├── README.md ├── cohort_survival.R ├── eval_grade9_prediction.R ├── eval_new_grade9_prediction.R ├── grade9_prediction.R ├── main.R └── model_diagnostics.R /.gitignore: -------------------------------------------------------------------------------- 1 | # Data files 2 | *.csv 3 | 4 | # History files 5 | .Rhistory 6 | 7 | # Example code in package build process 8 | *-Ex.R 9 | 10 | # R data files from past sessions 11 | .Rdata 12 | -------------------------------------------------------------------------------- /Data_Analysis/CPS_JAGS.R: -------------------------------------------------------------------------------- 1 | # --------------------------------------- # 2 | # prepare data for hierarchical model # 3 | # --------------------------------------- # 4 | 5 | # variables: SCHOOL_CODE, GRADE, GROUP (charter / neighborhood), 6 | # for progressive grades: RATIO = [2012 grade k]/[2011 grade k-1] 7 | 8 | setwd("/Volumes/appdata-3/Count Data") 9 | countdata = read.csv("enrollment_byschool_byyear_bygrade.csv", stringsAsFactors = F) 10 | 11 | 
setwd("/Volumes/appdata-3/School Reports") 12 | schools = read.csv("schools.csv", stringsAsFactors = F) 13 | 14 | data1 = subset(countdata, YEAR == 2012 & ENROLLMENT > 0 & GRADE %in% c("X2","X3","X4","X5","X6","X7","X8","X10","X11","X12"), select = c("SCHOOL_CODE","GRADE","ENROLLMENT")) 15 | 16 | gradeorder = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12") 17 | 18 | data1$PREV_GRADE = lapply(data1[,"GRADE"], function(x) gradeorder[which(gradeorder == x)-1]) 19 | data1$PREV_GRADE = as.character(data1$PREV_GRADE) 20 | table(data1$GRADE, data1$PREV_GRADE) 21 | 22 | data11 = subset(countdata, YEAR == 2011) 23 | 24 | data1= merge(data1, data11[,c("SCHOOL_CODE","GRADE","ENROLLMENT")], all.x = T, by.x = c("SCHOOL_CODE","PREV_GRADE"), by.y = c("SCHOOL_CODE","GRADE"), suffixes = c(".THIS_GRADE",".PREV_GRADE")) 25 | 26 | data1$CSRATIO = data1$ENROLLMENT.THIS_GRADE/data1$ENROLLMENT.PREV_GRADE 27 | hist(data1$CSRATIO) 28 | data1 = subset(data1, is.finite(CSRATIO)) 29 | 30 | data1 = merge(data1, schools[,c("SchoolID","SchoolType")], all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID") 31 | 32 | data2 = subset(countdata, YEAR == 2011 & ENROLLMENT > 0 & GRADE %in% c("X2","X3","X4","X5","X6","X7","X8","X10","X11","X12"), select = c("SCHOOL_CODE","GRADE","ENROLLMENT")) 33 | 34 | gradeorder = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12") 35 | 36 | data2$PREV_GRADE = lapply(data2[,"GRADE"], function(x) gradeorder[which(gradeorder == x)-1]) 37 | data2$PREV_GRADE = as.character(data2$PREV_GRADE) 38 | table(data2$GRADE, data2$PREV_GRADE) 39 | 40 | data10 = subset(countdata, YEAR == 2010) 41 | 42 | data2= merge(data2, data10[,c("SCHOOL_CODE","GRADE","ENROLLMENT")], all.x = T, by.x = c("SCHOOL_CODE","PREV_GRADE"), by.y = c("SCHOOL_CODE","GRADE"), suffixes = c(".THIS_GRADE",".PREV_GRADE")) 43 | 44 | data2$CSRATIO = data2$ENROLLMENT.THIS_GRADE/data2$ENROLLMENT.PREV_GRADE 45 | hist(data2$CSRATIO) 46 | data2 = subset(data2, is.finite(CSRATIO)) 47 | 48 | 
data2 = merge(data2, schools[,c("SchoolID","SchoolType")], all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID") 49 | 50 | data = rbind(data1, data2) 51 | 52 | data = subset(data, CSRATIO <= 5) 53 | 54 | data$SCHOOL_GROUP = as.numeric(as.factor(data$SchoolType)) 55 | data$SCHOOL = data$SCHOOL_CODE 56 | data$SCHOOL = as.numeric(as.factor(data$SCHOOL)) 57 | data$GRADE_GROUP = data$GRADE 58 | data$GRADE_GROUP = as.numeric(as.factor(data$GRADE_GROUP)) 59 | 60 | 61 | #install.packages("R2jags") 62 | library("R2jags") 63 | 64 | model<-function(){ 65 | 66 | # ratios 67 | for (i in 1:n.obs){ 68 | y[i] ~ dnorm(gamma[SCHOOL[i]] + delta[GRADE[i]], tau) 69 | } 70 | 71 | tau <- pow(sigma, -2) 72 | sigma ~ dunif(0,100) 73 | 74 | # grades 75 | for (j in 1:n.grades){ 76 | delta[j] ~ dnorm(0, tau.delta) 77 | } 78 | 79 | tau.delta <- pow(sigma.delta, -2) 80 | sigma.delta ~ dunif(0,100) 81 | 82 | # schools 83 | for (k in 1:n.schools){ 84 | gamma[k] ~ dnorm(mu.gamma[SCHOOL_GROUP[k]], tau.gamma[SCHOOL_GROUP[k]]) 85 | } 86 | 87 | # groups 88 | for (l in 1:n.groups){ 89 | mu.gamma[l] ~ dnorm(mu.star, tau.mu) 90 | tau.gamma[l] <- pow(sigma.gamma[l], -2) 91 | sigma.gamma[l] ~ dunif(0, 100) 92 | } 93 | 94 | mu.star ~ dnorm(0, 1/10000) 95 | tau.mu<- pow(sigma.mu, -2) 96 | sigma.mu ~ dunif(0, 100) 97 | } 98 | 99 | schoolgroupmat = unique(data[,c("SCHOOL_CODE","SCHOOL_GROUP")]) 100 | 101 | data_jags=list(y=data$CSRATIO, SCHOOL = data$SCHOOL, GRADE = data$GRADE_GROUP, SCHOOL_GROUP = schoolgroupmat$SCHOOL_GROUP, n.obs = 7546, n.schools = 671, n.grades = 10, n.groups = 5) 102 | inits=function(){list("sigma" = 1, "delta" = c(rep(0, length(unique(data$GRADE)))), "sigma.delta" = 1, "mu.star" = 1, "sigma.gamma" = c(1,1,1,1,1), "sigma.mu" = 1)} 103 | parameters.to.save<-c("delta","sigma", "sigma.delta", "mu.gamma","mu.star", "sigma.gamma", "sigma.mu","gamma") 104 | 105 | sim = jags(data_jags, inits, parameters.to.save, model.file=model, n.chains=3, n.iter=10000) 106 | print(sim) 107 | 108 | tracemat = 
sim$BUGSoutput$sims.matrix 109 | tracemat = as.data.frame(tracemat) 110 | names(tracemat) 111 | 112 | # predictions and credible intervals for 2013/2012 113 | test = unique(data[,c("SCHOOL_CODE","GRADE","SCHOOL","SCHOOL_GROUP","GRADE_GROUP")]) 114 | test$postpredmean = NA 115 | test$postpred025 = NA 116 | test$postpred975 = NA 117 | 118 | for (i in 1:nrow(test)){ 119 | gammatemp = tracemat[,paste('gamma[',test[i,"SCHOOL"],']', sep = "")] 120 | deltatemp = tracemat[,paste('delta[',test[i,"GRADE_GROUP"],']', sep = "")] 121 | sigmatemp = tracemat$sigma 122 | 123 | ytemp = rnorm(3000, gammatemp + deltatemp, sd = sqrt(sigmatemp)) 124 | test[i,"postpredmean"] = mean(ytemp) 125 | test[i,"postpred025"] = quantile(ytemp, .025) 126 | test[i,"postpred975"] = quantile(ytemp,.975) 127 | } 128 | 129 | 130 | data3 = subset(countdata, YEAR == 2013 & ENROLLMENT > 0 & GRADE %in% c("X2","X3","X4","X5","X6","X7","X8","X10","X11","X12"), select = c("SCHOOL_CODE","GRADE","ENROLLMENT")) 131 | 132 | gradeorder = c("X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12") 133 | 134 | data3$PREV_GRADE = lapply(data3[,"GRADE"], function(x) gradeorder[which(gradeorder == x)-1]) 135 | data3$PREV_GRADE = as.character(data3$PREV_GRADE) 136 | table(data3$GRADE, data3$PREV_GRADE) 137 | 138 | data12 = subset(countdata, YEAR == 2012) 139 | 140 | data3= merge(data3, data12[,c("SCHOOL_CODE","GRADE","ENROLLMENT")], all.x = T, by.x = c("SCHOOL_CODE","PREV_GRADE"), by.y = c("SCHOOL_CODE","GRADE"), suffixes = c(".THIS_GRADE",".PREV_GRADE")) 141 | 142 | data3$CSRATIO = data3$ENROLLMENT.THIS_GRADE/data3$ENROLLMENT.PREV_GRADE 143 | hist(data3$CSRATIO) 144 | data3 = subset(data3, is.finite(CSRATIO)) 145 | 146 | data3 = merge(data3, schools[,c("SchoolID","SchoolType")], all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID") 147 | 148 | test = merge(test, data3[,c("SCHOOL_CODE","GRADE","CSRATIO")], all.x = T, by.x = c("SCHOOL_CODE","GRADE"), by.y = c("SCHOOL_CODE","GRADE")) 149 | 150 | test = subset(test, 
CSRATIO <= 2) 151 | plot(test$CSRATIO, test$postpredmean) 152 | 153 | # trace plots of the group-level means; colour mapping must match the legend below (group k = mu.gamma[k], alphabetical SchoolType order) 154 | plot(1:3000, tracemat$'mu.gamma[1]', type ="l", col = colorlist[1]) # fix: was mu.gamma[5], which is already drawn below — group 1 (Alternative) was never plotted and the first curve had no colour 155 | lines(1:3000, tracemat$'mu.gamma[3]', col = colorlist[4]) 156 | lines(1:3000, tracemat$'mu.gamma[2]', col = colorlist[3]) 157 | lines(1:3000, tracemat$'mu.gamma[4]', col = colorlist[5]) 158 | lines(1:3000, tracemat$'mu.gamma[5]', col = colorlist[7]) 159 | 160 | unique(data[,c("SchoolType","SCHOOL_GROUP")]) 161 | legend("topright",c("Alternative","Charter","Contract","Performance","Regular"), col = colorlist[c(1,3,4,5,7)], pch = 16) 162 | 163 | plot(1:3000, tracemat$'delta[1]', type ="l",ylim=c(-.2,.2), col = colorlist[1]) 164 | lines(1:3000, tracemat$'delta[2]', col = colorlist[2]) 165 | lines(1:3000, tracemat$'delta[3]', col = colorlist[3]) 166 | lines(1:3000, tracemat$'delta[4]', col = colorlist[4]) 167 | lines(1:3000, tracemat$'delta[5]', col = colorlist[5]) 168 | lines(1:3000, tracemat$'delta[6]', col = colorlist[6]) 169 | lines(1:3000, tracemat$'delta[7]', col = colorlist[7]) 170 | lines(1:3000, tracemat$'delta[8]', col = colorlist[8]) 171 | lines(1:3000, tracemat$'delta[9]', col = colorlist[9]) 172 | lines(1:3000, tracemat$'delta[10]', col = colorlist[10]) 173 | 174 | legend("topright",c("2nd grade", "3rd grade", "4th grade", "5th grade", "6th grade", "7th grade", "8th grade", "10th grade", "11th grade", "12th grade"), col = colorlist[c(4,5,6,7,8,9,10,1,2,3)], pch = 16) 175 | 176 | unique(data[,c("GRADE","GRADE_GROUP")]) 177 | colorlist = brewer.pal(10, "Set3") # NOTE(review): requires library(RColorBrewer); colorlist is used by the plots ABOVE, so run this line (and load RColorBrewer) before them 178 | -------------------------------------------------------------------------------- /Data_Analysis/CPS_gradebygrade_modelingexploration.R: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------- # 2 | # this code shows many of our exploratory models # 3 | # for the top-down model approach # 4 | # -------------------------------------------------- # 5 | 6 | # run CPS_gradebygrade_script.R 7 | 
# run CPS_gradebygrade_dataprep.R 8 | 9 | datafinal_train = datafinal12 10 | datafinal = datafinal13 11 | 12 | # fill in missing values 13 | for (j in c(7,8,9,11,25,26)){ 14 | datafinal_train[is.na(datafinal_train[,j]),j] = 0 15 | datafinal[is.na(datafinal[,j]),j] = 0 16 | } 17 | 18 | for (j in c(12:20,24)){ 19 | datafinal_train[is.na(datafinal_train[,j]),j] = 'Unknown' 20 | datafinal[is.na(datafinal[,j]),j] = 'Unknown' 21 | } 22 | 23 | # make sure there's no more missing values 24 | apply(datafinal_train, 2, function(x) sum(is.na(x))) 25 | apply(datafinal, 2, function(x) sum(is.na(x))) 26 | 27 | # all schools and grades: rollover 2012 to predict 2013 28 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "rollover", modelformula = NA, grade_overall = 'gradebygrade') 29 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) # 52% 30 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) # 74% 31 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) # 87% 32 | hist(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T), 20) 33 | 34 | # high school only 35 | test = subset(test, (enrollment_to_predict.X9 >= 10)|(enrollment_to_predict.X10 >= 10)|(enrollment_to_predict.X11 >= 10)|(enrollment_to_predict.X12 >= 10)) 36 | test = test[,c("SCHOOL_CODE","enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12","predictions X9","predictions X10","predictions X11","predictions X12")] 37 | 38 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 25) 39 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - 
rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 50) 40 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 100) 41 | 42 | 43 | # Basic linear regression: train on 2012, test on 2013 44 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection + NEW, grade_overall = 'gradebygrade') 45 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) # 50% 46 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) # 72% 47 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) # 89% 48 | hist(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T), 20) 49 | 50 | # high school only 51 | test = subset(test, (enrollment_to_predict.X9 >= 10)|(enrollment_to_predict.X10 >= 10)|(enrollment_to_predict.X11 >= 10)|(enrollment_to_predict.X12 >= 10)) 52 | test = test[,c("SCHOOL_CODE","enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12","predictions X9","predictions X10","predictions X11","predictions X12")] 53 | 54 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 25) 55 | mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 50) 56 | 
mean(abs(rowSums(test[,c("predictions X9","predictions X10","predictions X11","predictions X12")],na.rm = T) - rowSums(test[,c("enrollment_to_predict.X9","enrollment_to_predict.X10","enrollment_to_predict.X11","enrollment_to_predict.X12")],na.rm=T)) <= 100) 57 | 58 | 59 | # Linear regression with features: train on 2012, test on 2013 60 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection*Rating + (SCHOOL_TYPE == 'Regular') + NEW, grade_overall = 'gradebygrade') 61 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 62 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 63 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) 64 | 65 | # plot by grade 66 | hist(test[,"predictions K"] - test[,"enrollment_to_predict.K"],20) 67 | hist(test[,"predictions X1"] - test[,"enrollment_to_predict.X1"],20) 68 | hist(test[,"predictions X2"] - test[,"enrollment_to_predict.X2"],20) 69 | hist(test[,"predictions X3"] - test[,"enrollment_to_predict.X3"],20) 70 | hist(test[,"predictions X4"] - test[,"enrollment_to_predict.X4"],20) 71 | hist(test[,"predictions X5"] - test[,"enrollment_to_predict.X5"],20) 72 | hist(test[,"predictions X6"] - test[,"enrollment_to_predict.X6"],20) 73 | hist(test[,"predictions X7"] - test[,"enrollment_to_predict.X7"],20) 74 | hist(test[,"predictions X8"] - test[,"enrollment_to_predict.X8"],20) 75 | hist(test[,"predictions X9"] - test[,"enrollment_to_predict.X9"],20) 76 | hist(test[,"predictions X10"] - test[,"enrollment_to_predict.X10"],20) 77 | hist(test[,"predictions X11"] - test[,"enrollment_to_predict.X11"],20) 78 | hist(test[,"predictions X12"] - test[,"enrollment_to_predict.X12"],20) 79 | hist(test[,"predictions LRE"] - test[,"enrollment_to_predict.LRE"],20) 80 | 81 | # 9th grade 82 | plot(test[,"enrollment_to_predict.X9"], test[,"predictions 
X9"],xlim=c(0,1500),ylim=c(0,1500),pch=16, xlab = "Actual 9th grade 2012", ylab = "Predicted 9th grade 2012") 83 | segments(x0 = test[,"enrollment_to_predict.X9"], y0 = test[,"predictions X9"] - 1.96*sqrt(test[,"var X9"]), x1 = test[,"enrollment_to_predict.X9"], y1 = test[,"predictions X9"] + 1.96*sqrt(test[,"var X9"])) 84 | segments(x0=0,y0=0,x1=1500,y1=1500, col = 2) 85 | mean((test[,"enrollment_to_predict.X9"] > test[,"predictions X9"] - 1.96*sqrt(test[,"var X9"]))&(test[,"enrollment_to_predict.X9"] < test[,"predictions X9"] + 1.96*sqrt(test[,"var X9"])), na.rm = T) 86 | 87 | # 2nd grade 88 | plot(test[,"enrollment_to_predict.X2"], test[,"predictions X2"],xlim=c(0,300),ylim=c(0,300),pch=16, xlab = "Actual 2nd grade 2012", ylab = "Predicted 2nd grade 2012") 89 | segments(x0 = test[,"enrollment_to_predict.X2"], y0 = test[,"predictions X2"] - 1.96*sqrt(test[,"var X2"]), x1 = test[,"enrollment_to_predict.X2"], y1 = test[,"predictions X2"] + 1.96*sqrt(test[,"var X2"])) 90 | segments(x0=0,y0=0,x1=1300,y1=1300, col = 2) 91 | mean((test[,"enrollment_to_predict.X2"] > test[,"predictions X2"] - 1.96*sqrt(test[,"var X2"]))&(test[,"enrollment_to_predict.X2"] < test[,"predictions X2"] + 1.96*sqrt(test[,"var X2"])), na.rm = T) 92 | 93 | 94 | # Random forest with features: train on 2012, test on 2013 95 | test = predictschool(data_test = datafinal, data_train = datafinal_train, algorithm_of_choice = "randomForest", modelformula = enrollment_to_predict ~ projection + NEW + Rating + SCHOOL_TYPE, grade_overall = 'gradebygrade') 96 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 97 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 98 | mean(abs(rowSums(test[,16:29],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) 99 | 100 | 101 | # Experimenting with training data 102 | 103 | # train only with schools that change less than a certain number of students 104 | datafinal_train_stable= 
datafinal_train[(apply(datafinal_train[,c("enrollment_to_predict","projection")], 1, function(x) max(x) - min(x) <= 25))|(datafinal_train$projection == 0),] 105 | 106 | test = predictschool(data_test = datafinal, data_train = datafinal_train_stable, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection*Rating + NEW + (SCHOOL_TYPE == 'Regular') + newnearby, grade_overall = 'gradebygrade') 107 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 108 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 109 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 100) 110 | 111 | # exclude new schools 112 | datafinal_train_nonew = datafinal_train[datafinal_train$projection > 0,] 113 | datafinal_nonew = datafinal[datafinal$projection > 0,] 114 | 115 | # neighborhood enrollment 116 | setwd("/Volumes/appdata-3/School reports") 117 | schools = read.csv("schools.csv", stringsAsFactors = F) 118 | neighborhoodschools = as.matrix(subset(schools, SchoolStyle == 'Neighborhood Enrollment', select = "SchoolID")) 119 | 120 | datafinal_train_temp = datafinal_train[(datafinal_train$projection > 0) & (datafinal_train$SCHOOL_CODE %in% neighborhoodschools),] 121 | datafinal_temp = datafinal[(datafinal$projection > 0) & (datafinal$SCHOOL_CODE %in% neighborhoodschools),] 122 | 123 | test = predictschool(data_test = datafinal_temp, data_train = datafinal_train_temp, algorithm_of_choice = "lm", modelformula = enrollment_to_predict ~ projection*(Rating + Teacher + Safety) + newnearby + GRADE, grade_overall = 'overall') 124 | 125 | schoolslist = unique(datafinal_temp$SCHOOL_CODE) 126 | test = test[test$SCHOOL_CODE %in% schoolslist,] 127 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 25) 128 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - rowSums(test[,2:15],na.rm=T)) <= 50) 129 | mean(abs(rowSums(test[,seq(16,42,2)],na.rm = T) - 
rowSums(test[,2:15],na.rm=T)) <= 100) 130 | -------------------------------------------------------------------------------- /Data_Analysis/CPS_gradebygrade_script.R: -------------------------------------------------------------------------------- 1 | # ================================== # 2 | # grade-by-grade modeling script # 3 | # ================================== # 4 | 5 | 6 | predictschool = function(data_test, data_train, algorithm_of_choice, modelformula, grade_overall){ 7 | # data should have columns named "SCHOOL_CODE", "enrollment_to_predict", "projection" and "GRADE" 8 | # all other columns are main effect predictors 9 | # algorithm_of_choice can be "lm","randomForest","rollover" 10 | # gradebygrade or overall 11 | 12 | # 1. turn character features into factors 13 | for (j in 1:ncol(data_train)){ 14 | if (class(data_train[,j]) == "character"){ 15 | data_train[,j] = as.factor(data_train[,j]) 16 | } 17 | } 18 | 19 | for (j in 1:ncol(data_test)){ 20 | if (class(data_test[,j]) == "character"){ 21 | data_test[,j] = as.factor(data_test[,j]) 22 | } 23 | } 24 | 25 | if (grade_overall == 'gradebygrade'){ 26 | # 2. 
predict each grade 27 | predictschools=reshape(data_test[,c("SCHOOL_CODE","GRADE","enrollment_to_predict")], v.names = "enrollment_to_predict", timevar = "GRADE", idvar = "SCHOOL_CODE", direction = "wide") 28 | 29 | for (grade in levels(data_train$GRADE)){ 30 | 31 | currentnames = names(predictschools) 32 | 33 | temp = predictgrade(data_train, data_test, grade, algorithm_of_choice, modelformula) 34 | 35 | if (algorithm_of_choice == "lm"){ 36 | predictschools = merge(predictschools, temp[,c("SCHOOL_CODE", "test_predictions", "test_var")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 37 | colnames(predictschools) = c(currentnames, paste('predictions', grade), paste('var', grade)) 38 | } 39 | 40 | if ((algorithm_of_choice == "randomForest")|(algorithm_of_choice == "rollover")){ 41 | predictschools = merge(predictschools, temp[,c("SCHOOL_CODE", "test_predictions")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 42 | colnames(predictschools) = c(currentnames, paste('predictions',grade)) 43 | } 44 | } 45 | 46 | # 3. 
output is predictions for test set 47 | predictschools 48 | } 49 | 50 | else if (grade_overall == 'overall'){ 51 | predictschools=reshape(data_test[,c("SCHOOL_CODE","GRADE","enrollment_to_predict")], v.names = "enrollment_to_predict", timevar = "GRADE", idvar = "SCHOOL_CODE", direction = "wide") 52 | 53 | # predict overall 54 | overallpredicts = predictoverall(data_train, data_test, algorithm_of_choice, modelformula) 55 | 56 | # output 57 | merge(predictschools, overallpredicts, all.x = TRUE) 58 | 59 | } 60 | 61 | } 62 | 63 | 64 | 65 | predictgrade = function(data_train, data_test, grade, algorithm_of_choice, modelformula){ 66 | # data should have columns named "SCHOOL_CODE", "enrollment_to_predict", "projection" and "GRADE" 67 | 68 | data_train_grade = subset(data_train, (GRADE == grade)&(enrollment_to_predict > 0)) 69 | data_test_grade = subset(data_test, (GRADE == grade)&(enrollment_to_predict > 0)) 70 | 71 | # algorithm = lm 72 | if (algorithm_of_choice == "lm"){ 73 | lm_train = lm(modelformula, data = data_train_grade) 74 | lm_test = predict(lm_train, newdata = data_test_grade, interval = "prediction", level = .95) 75 | lm_test = as.data.frame(lm_test) 76 | data_test_grade$test_predictions = lm_test$fit 77 | data_test_grade$test_var = ((lm_test$upr - lm_test$lwr)/4)^2 78 | 79 | data_test_grade[,c("SCHOOL_CODE", "test_predictions", "test_var")] # output 80 | } 81 | 82 | 83 | # algorithm = randomForest 84 | else if (algorithm_of_choice == "randomForest"){ 85 | library(randomForest) 86 | rf_train = randomForest(modelformula, data = data_train_grade) 87 | rf_test = predict(rf_train, newdata = data_test_grade) 88 | data_test_grade$test_predictions = rf_test 89 | 90 | data_test_grade[,c("SCHOOL_CODE", "test_predictions")] # output 91 | } 92 | 93 | # algorithm = rollover 94 | else if (algorithm_of_choice == "rollover"){ 95 | data_test_grade$test_predictions = data_test_grade$projection 96 | grademean = mean(data_test_grade[(data_test_grade$test_predictions > 
0),"test_predictions"]) 97 | data_test_grade[(data_test_grade$test_predictions == 0),"test_predictions"] = grademean 98 | 99 | data_test_grade[,c("SCHOOL_CODE", "test_predictions")] # output 100 | } 101 | 102 | 103 | } 104 | 105 | 106 | 107 | predictoverall = function(data_train, data_test, algorithm_of_choice, modelformula){ 108 | # data should have columns named "SCHOOL_CODE", "enrollment_to_predict", "projection" and "GRADE" 109 | 110 | data_train = subset(data_train, (enrollment_to_predict > 0)) 111 | data_test = subset(data_test, (enrollment_to_predict > 0)) 112 | 113 | # algorithm = lm 114 | if (algorithm_of_choice == "lm"){ 115 | lm_train = lm(modelformula, data = data_train) 116 | lm_test = predict(lm_train, newdata = data_test, interval = "prediction", level = .95) 117 | lm_test = as.data.frame(lm_test) 118 | data_test$predictions = lm_test$fit 119 | data_test$var = ((lm_test$upr - lm_test$lwr)/4)^2 120 | 121 | outputtemp = data_test[,c("SCHOOL_CODE", "GRADE", "predictions", "var")] # output 122 | output = reshape(outputtemp, v.names = c("predictions", "var"), timevar = "GRADE", idvar = "SCHOOL_CODE", direction = 'wide',sep=" ") 123 | output 124 | } 125 | 126 | # algorithm = randomForest 127 | else if (algorithm_of_choice == "randomForest"){ 128 | library(randomForest) 129 | rf_train = randomForest(modelformula, data = data_train) 130 | rf_test = predict(rf_train, newdata = data_test) 131 | data_test$predictions = rf_test 132 | 133 | outputtemp = data_test[,c("SCHOOL_CODE", "GRADE", "predictions")] # output 134 | output = reshape(outputtemp, v.names = "predictions", timevar = "GRADE", idvar = "SCHOOL_CODE", direction = 'wide',sep=" ") 135 | output 136 | } 137 | 138 | 139 | } 140 | 141 | -------------------------------------------------------------------------------- /Data_Analysis/DeepDive1EDA_Counts.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "First Deep Dive EDA on Enrollment Counts" 3 | 
author: "Andrew Landgraf" 4 | date: "Tuesday, June 24, 2014" 5 | output: html_document 6 | --- 7 | 8 | Load packages. You need to be connected to the CPS VPN. 9 | ```{r} 10 | library(lubridate) 11 | library(ggplot2) 12 | library(reshape2) 13 | library(knitr) 14 | opts_chunk$set(warning = FALSE) 15 | # setwd("//admin/appdata/DataWarehouse/DSSG/Count Data/") 16 | ``` 17 | 18 | Read count data from the CPS folder. This data has the number of students enrolled in each school and in each grade on the first day of each month. I also format the data a little. 19 | ```{r} 20 | counts = read.csv("//admin/appdata/DataWarehouse/DSSG/Count Data/Enrollment_Counts_Month_Current.csv", stringsAsFactors = FALSE) 21 | counts$Date = as.Date(counts$Date) 22 | counts$STUDENT_ANNUAL_GRADE_CODE = factor(counts$STUDENT_ANNUAL_GRADE_CODE, 23 | c('PE','PK','K','1','2','3','4','5','6','7','8','9','10','11','12')) 24 | 25 | counts$Year = as.numeric(substring(counts$SCHOOL_YEAR,6,9)) 26 | counts$Month = month(counts$Date) 27 | names(counts)[names(counts)=="STUDENT_ANNUAL_GRADE_CODE"] = "Grade" 28 | ``` 29 | 30 | Read the school report data. This was downloaded from the Chicago data portal. Concentrating on just student performance and overall school rating variables. 
31 | ```{r} 32 | report_elem = read.csv("//admin/appdata/DataWarehouse/DSSG/School Reports/CPS_Elementary_Report_20122013.csv") 33 | report_high = read.csv("//admin/appdata/DataWarehouse/DSSG/School Reports/CPS_HighSchool_Report_20122013.csv") 34 | report_elem$Student.Performance.Level = factor(report_elem$Student.Performance.Level, # fix: was levels()<-, which renames the existing alphabetical levels in positional order and mislabels the data; factor(x, levels) only reorders, matching the report_high handling below 35 | c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average")) 36 | report_high$Student.Performance.Level = factor(report_high$Student.Performance.Level, 37 | c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average")) 38 | report_elem$Overall.Rating = factor(report_elem$Overall.Rating, 39 | levels = c("Level 1", "Level 2", "Level 3")) 40 | report_high$Overall.Rating = factor(report_high$Overall.Rating, 41 | levels = c("Level 1", "Level 2", "Level 3")) 42 | 43 | school_reports <- rbind(subset(report_elem, 44 | select = c(School.ID, Student.Performance.Level, Overall.Rating)), 45 | subset(report_high, 46 | select = c(School.ID, Student.Performance.Level, Overall.Rating))) 47 | ``` 48 | 49 | The function below is used for adding grades. You can logically add one grade to `K` and it knows it is `1`. 50 | ```{r} 51 | advance_grade <- function(current_grade, years = 1) { # shifts grade labels by `years` positions in levels(counts$Grade); grades shifted off either end come back as NA 52 | locs = apply(matrix(current_grade),1,function(x) which(x == levels(counts$Grade))) + years 53 | is.na(locs[locs<1 | locs>length(levels(counts$Grade))]) <- TRUE 54 | levels(counts$Grade)[locs] 55 | } 56 | ``` 57 | 58 | Below, I split the data into five snapshots. The first is the enrollments that we want to predict. 59 | ```{r} 60 | pred_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 61 | ``` 62 | 63 | Last year 20th day enrollments. 
64 | ```{r} 65 | prev_20_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 66 | prev_20_enrolls$Year = prev_20_enrolls$Year + 1 67 | names(prev_20_enrolls)[names(prev_20_enrolls)=="Enrollments"] <- "Prev_20_Enrollments" 68 | ``` 69 | 70 | Last year 20th day enrollments for previous grade. 71 | ```{r} 72 | prev_20_enrolls_grade = prev_20_enrolls 73 | prev_20_enrolls_grade$Grade = advance_grade(prev_20_enrolls_grade$Grade, 1) 74 | names(prev_20_enrolls_grade)[names(prev_20_enrolls_grade)=="Prev_20_Enrollments"] <- "Prev_Grade_20_Enrollments" 75 | ``` 76 | 77 | Last year end of year enrollments. 78 | ```{r} 79 | prev_end_enrolls = subset(counts, Month == 6, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 80 | prev_end_enrolls$Year = prev_end_enrolls$Year + 1 81 | names(prev_end_enrolls)[names(prev_end_enrolls)=="Enrollments"] <- "Prev_End_Enrollments" 82 | ``` 83 | 84 | Last year end of year enrollments for previous grade. 85 | ```{r} 86 | prev_end_enrolls_grade = prev_end_enrolls 87 | prev_end_enrolls_grade$Grade = advance_grade(prev_end_enrolls_grade$Grade, 1) 88 | names(prev_end_enrolls_grade)[names(prev_end_enrolls_grade)=="Prev_End_Enrollments"] <- "Prev_Grade_End_Enrollments" 89 | ``` 90 | 91 | Combine the five snapshots together so that we can use previous snapshots to predict current enrollments. 92 | ```{r} 93 | enrolls <- merge(pred_enrolls, prev_20_enrolls, all.x=TRUE) 94 | enrolls <- merge(enrolls, prev_20_enrolls_grade, all.x=TRUE) 95 | enrolls <- merge(enrolls, prev_end_enrolls, all.x=TRUE) 96 | enrolls <- merge(enrolls, prev_end_enrolls_grade, all.x=TRUE) 97 | ``` 98 | 99 | I removed 2009 since there is no prior history and 2014 for testing purposes. 
100 | ```{r} 101 | enrolls_test <- subset(enrolls, Year == 2014) 102 | enrolls <- subset(enrolls, Year >= 2009 & Year < 2014) 103 | ``` 104 | 105 | Set NAs to 0 to replicate real life unknowns, except when there is no previous history for school. I am not doing this right now so that we can only assess good data situations. 106 | ```{r} 107 | # enrolls[is.na(enrolls)] <- 0 108 | # enrolls <- merge(enrolls, school_info, all.x=TRUE) 109 | # set.all.to.na <- which(enrolls$Year == enrolls$Min_Year) 110 | # set.grade.to.na <- which(enrolls$Grade == enrolls$Min_Grade) 111 | # is.na(enrolls[set.all.to.na,7:10]) <- TRUE 112 | # is.na(enrolls[set.grade.to.na,c(8,10)]) <- TRUE 113 | ``` 114 | 115 | Add in some school information from the school reports. 116 | ```{r} 117 | enrolls <- merge(enrolls, school_reports, all.x=TRUE, 118 | by.x = "SCHOOL_CODE", by.y = "School.ID") 119 | ``` 120 | 121 | Look at how enrollments change throughout the year to see if it is predictive of future enrollment changes. 122 | ```{r} 123 | enrolls$Prev_Change = enrolls$Prev_End_Enrollments - enrolls$Prev_20_Enrollments 124 | enrolls$Prev_Grade_Change = enrolls$Prev_Grade_End_Enrollments - enrolls$Prev_Grade_20_Enrollments 125 | ``` 126 | 127 | 128 | Function to calculate correlation of 4 previous year variables with enrollments for a grade. 129 | ```{r} 130 | compare_cor <- function(grade) { 131 | # convert grade to a number with 0=Kindergarten, 1=first grade, etc. 132 | c(Grade = which(grade==levels(counts$Grade))-3, cor(subset(enrolls, Grade == grade)[,6:10], use="pair")[-1,1]) 133 | } 134 | correls = t(sapply(unique(enrolls$Grade), compare_cor)) 135 | correls = as.data.frame(correls) 136 | correls$Grade = factor(advance_grade("K",correls$Grade), levels = levels(counts$Grade)) 137 | ``` 138 | 139 | Plots for the presentation. Using last year's fifth grade enrollment to estimate this year's fifth grade enrollment. There is a fairly strong correlation year over year. 
140 | ```{r} 141 | ggplot(subset(enrolls, Grade == "5"), aes(Prev_End_Enrollments, Enrollments)) + 142 | geom_point(alpha = 0.25) + geom_abline() + geom_smooth(method = lm, size = 1) + 143 | labs(x= "Previous Year 5th Grade Enrollments", y= "This Year 5th Grade Enrollments", title = "5th Grade") 144 | ``` 145 | 146 | In contrast, if we use previous year 4th grade enrollment for the same school, there is a stronger correlation. 147 | ```{r} 148 | ggplot(subset(enrolls, Grade == "5"), aes(Prev_Grade_End_Enrollments, Enrollments)) + 149 | geom_point(alpha = 0.25) + geom_abline() + geom_smooth(method = lm, size = 1) + 150 | labs(x= "Previous Year 4th Grade Enrollments", y= "This Year 5th Grade Enrollments", title = "5th Grade") 151 | ``` 152 | 153 | For eleventh grade, we can look at the same two plots. There is no longer a big difference between using eleventh and tenth grade. 154 | ```{r} 155 | ggplot(subset(enrolls, Grade == "11" & SCHOOL_CODE != "609686"), aes(Prev_End_Enrollments, Enrollments)) + 156 | geom_point(alpha = 0.75) + geom_abline() + geom_smooth(method = lm, size = 1) + 157 | labs(x= "Previous Year 11th Grade Enrollments", y= "This Year 11th Grade Enrollments", title = "11th Grade") 158 | 159 | ggplot(subset(enrolls, Grade == "11" & SCHOOL_CODE != "609686"), aes(Prev_Grade_End_Enrollments, Enrollments)) + 160 | geom_point(alpha = 0.75) + geom_abline() + geom_smooth(method = lm, size = 1) + 161 | labs(x= "Previous Year 10th Grade Enrollments", y= "This Year 11th Grade Enrollments", title = "11th Grade") 162 | ``` 163 | 164 | We can compare the correlations for all grades. Grade school does better than high school for using the same cohort. Also, kindergarten predicts first grade better than first grade, which is contrary to CPS's intuition. 
165 | ```{r} 166 | correls_pres = correls 167 | names(correls_pres)[4:5] <- c("Same Grade", "Previous Grade") 168 | is.na(correls_pres[correls_pres$Grade == "9", 5]) <- TRUE 169 | ggplot(melt(subset(correls_pres[,c(1,4:5)], !(Grade %in% c("PE", "PK"))), id = 1), aes(Grade, value, colour = variable , group = variable)) + 170 | geom_point() + geom_line() + labs(y = "Correlation with Next Year Enrollment", colour = "Method") 171 | ``` 172 | 173 | We broke up each high school by their rating. The best schools are easier to predict because students do not leave. 174 | ```{r} 175 | ggplot(subset(enrolls, Grade %in% paste(9:12)), aes(Prev_Grade_End_Enrollments, Enrollments)) + 176 | geom_point() + geom_abline() + geom_smooth(method = lm) + facet_grid(Overall.Rating ~ Grade) + 177 | labs(x = "Last Year Previous Grade Enrollments", y = "This Year Enrollments") 178 | ``` 179 | 180 | That is not necessarily the same for elementary school. This may have something to do with students not being old enough to drop out of school yet. 181 | ```{r} 182 | ggplot(subset(enrolls, Grade %in% c("K",paste(1:5))), aes(Prev_Grade_End_Enrollments, Enrollments)) + 183 | geom_point(alpha = .5) + geom_abline() + geom_smooth(method = lm, size = 1) + facet_grid(Overall.Rating ~ Grade)+ 184 | labs(x = "Last Year Previous Grade Enrollments", y = "This Year Enrollments") 185 | ``` 186 | 187 | 188 | Other exploratory analysis. 
189 | ```{r} 190 | ggplot(subset(enrolls, Grade != "PE"), aes(Prev_Grade_End_Enrollments, Enrollments)) + 191 | geom_point() + geom_abline() + geom_smooth(method = lm) + facet_wrap(~ Grade, scales="free") 192 | ``` 193 | 194 | ```{r} 195 | ggplot(subset(enrolls, !(Grade %in% c("PE","PK",paste(9:12)))), aes(Prev_Grade_Change, Enrollments - Prev_Grade_20_Enrollments)) + 196 | geom_point(alpha = .25) + geom_smooth(method = lm) + facet_grid(Overall.Rating ~ Grade) 197 | 198 | ggplot(subset(enrolls, Grade %in% paste(9:12)), aes(Prev_Grade_Change, Enrollments - Prev_Grade_20_Enrollments)) + 199 | geom_point(alpha = .25) + geom_smooth(method = lm) + facet_grid(Overall.Rating ~ Grade) 200 | ``` 201 | -------------------------------------------------------------------------------- /Data_Analysis/README.md: -------------------------------------------------------------------------------- 1 | ## Data Analysis for CPS Predicting Student Enrollment ## 2 | 3 | This folder contains our work on top-down school-level modeling and exploratory data analysis. 4 | 5 | ### Top-down school-level model ### 6 | 7 | The code `CPS_gradebygrade_dataprep.R` will create a data file with all of the features we considered in the top-down model. 8 | The code `CPS_gradebygrade_script.R` creates functions that we used to try out different regression models. 9 | The code `CPS_gradebygrade_modelingexploration.R` uses the above two scripts and goes through many different models. 10 | 11 | The code `topdown_FINAL.R` is the final code where we focused on modeling catchment high school enrollment. 12 | 13 | ### Exploratory data analysis ### 14 | 15 | The code `prepare_data_and_shapefile_for_tableau.R` prepares the data for the Tableau workbook that shows where students from one catchment area go to school in 2013. 16 | 17 | The code `map_neighborhood_catchment.R` plots the percent of students who go to their catchment area. 
18 | 19 | The code `grade9_transition_matrix.R` produces our plots of where 9th graders live versus where they go to high school. 20 | 21 | `investigate_9thgrade.R` and `investigate_juarez_change.R` are files we used to explore where 9th grade 22 | students go to school and where they live. In particular we look at how many students at certain schools 23 | come from within their catchment areas or come from outside CPS. 24 | 25 | The scripts `DeepDive1EDA_Counts.Rmd`, `exploratory_data_analysis.R`, `explore_8_to_9_transition.R`, 26 | `explore_enroll_totals_from_website.Rmd` 27 | are some of our initial exploratory files. 28 | 29 | The code `CPS_JAGS.R` documents our exploratory hierarchical modeling of cohort survival ratios. 30 | Through R we use the program called JAGS, which stands for Just Another Gibbs Sampler. 31 | 32 | `dataprep_geocodeschools.R` is where we find the latitude and longitude of each CPS school using the ggmap package in R. 33 | 34 | `history_evaluation.R` is the visualization of the historical error of CPS. 
35 | 36 | -------------------------------------------------------------------------------- /Data_Analysis/dataprep_geocodeschools.R: -------------------------------------------------------------------------------- 1 | # =================================== # 2 | # geocode + prep data for tableau # 3 | # =================================== # 4 | 5 | # geocode school locations from street address --> lat long 6 | library("ggmap") 7 | 8 | setwd("/Volumes/appdata-3/School Reports") 9 | data = read.csv("CSVfileforR_excerptof_FY14SBBALLOCATIONS_FINALSUMMARY_asof_032414.csv", stringsAsFactors = F) 10 | school_locations = read.csv("CPS_SchoolLocations_SY13_14.csv", stringsAsFactors = F) 11 | 12 | schoolmat = matrix(nrow = 623, ncol = 6) 13 | schoolmat = as.data.frame(schoolmat) 14 | colnames(schoolmat) = c("School ID", "Lon", "Lat", "Actual Total", "Projected Total", "Difference") 15 | 16 | for (i in 1:623){ 17 | schoolmat[i,1] = data[i,1] # school ID 18 | 19 | findlocation = school_locations[(school_locations$SchoolID == data[i,1]),] 20 | geocodeschool = geocode(findlocation$Address) 21 | schoolmat[i,2] = geocodeschool$lon # longitude 22 | schoolmat[i,3] = geocodeschool$lat # latitude 23 | schoolmat[i,4] = data[i,22] # actual total 24 | schoolmat[i,5] = data[i,13] # projected total 25 | schoolmat[i,6] = schoolmat[i,5] - schoolmat[i,4] # projected - actual 26 | } 27 | 28 | # add more variables 29 | schoolmat_extra = matrix(nrow = 623, ncol = 5) 30 | schoolmat_extra = as.data.frame(schoolmat_extra) 31 | colnames(schoolmat_extra) = c("School ID", "Category", "Type", "Geo network", "Geographic region") 32 | 33 | for (i in 1:623){ 34 | schoolmat_extra[i,1] = data[i,1] # school ID 35 | 36 | findlocation = school_locations[(school_locations$SchoolID == data[i,1]),] 37 | if (nrow(findlocation) > 0){ 38 | schoolmat_extra[i,2] = findlocation$SchoolCate 39 | schoolmat_extra[i,3] = findlocation$SchoolType 40 | schoolmat_extra[i,4] = findlocation$Geographic 41 | schoolmat_extra[i,5] = 
findlocation$Geograph_1 42 | } 43 | } 44 | 45 | write.csv(schoolmat, file = "school_projections_geocode_fall13.csv") 46 | write.csv(schoolmat_extra, file = "school_projections_geocode_fall13_EXTRA.csv") -------------------------------------------------------------------------------- /Data_Analysis/exploratory_data_analysis.R: -------------------------------------------------------------------------------- 1 | # ---------------------- # 2 | # Exploring CPS data # 3 | # ---------------------- # 4 | 5 | # ======================================================= # 6 | # exploring schools that are far off from projections # 7 | # ======================================================= # 8 | 9 | # greg's data 10 | setwd("/Volumes/appdata-3/School Reports") 11 | greg_data = read.csv("CSVfileforR_excerptof_FY14SBBALLOCATIONS_FINALSUMMARY_asof_032414.csv", header = TRUE) 12 | 13 | # how far off are projections in aggregate for different types of schools? 14 | greg_data$dif = greg_data$Projected.Total - greg_data$Actual.Total 15 | 16 | # look at elementary schools off by more than 50 17 | elem_50off = greg_data[(abs(greg_data$Projected.Total - greg_data$Actual.Total) > 50)&(greg_data$School.Type == "Elementary"),] 18 | elem_50off$dif = elem_50off$Projected.Total - elem_50off$Actual.Total 19 | elem_50off$propbudget = elem_50off$dif/elem_50off$Projected.Total 20 | elem_50off[order(elem_50off$propbudget),c(1,2,13,22,23,24)] 21 | 22 | # high schools off by more than 50 23 | hs_50off = greg_data[(abs(greg_data$Projected.Total - greg_data$Actual.Total) > 50)&(greg_data$School.Type == "High School"),] 24 | hs_50off$dif = hs_50off$Projected.Total - hs_50off$Actual.Total 25 | hs_50off$propbudget = hs_50off$dif/hs_50off$Projected.Total 26 | hs_50off[order(hs_50off$propbudget),c(1,2,13,22,23,24)] 27 | 28 | # charter schools off by more than 50 29 | charter_50off = greg_data[(abs(greg_data$Projected.Total - greg_data$Actual.Total) > 50)&(greg_data$School.Type == "Charter"),] 30 | 
charter_50off$dif = charter_50off$Projected.Total - charter_50off$Actual.Total 31 | charter_50off$propbudget = charter_50off$dif/charter_50off$Projected.Total 32 | charter_50off[order(charter_50off$propbudget),c(1,2,13,22,23,24)] 33 | 34 | 35 | # ============================================================ # 36 | # exploring 2012 enrollments (less extreme year than 2013) # 37 | # ============================================================ # 38 | 39 | library(lubridate) 40 | library(plyr) 41 | 42 | # read count data 43 | setwd("/Volumes/appdata-3/Count Data") 44 | 45 | counts = read.csv("Enrollment_Counts_Month_Current.csv", stringsAsFactors = FALSE) 46 | counts$Date = as.Date(counts$Date) 47 | counts$STUDENT_ANNUAL_GRADE_CODE = factor(counts$STUDENT_ANNUAL_GRADE_CODE, c('PE','PK','K','1','2','3','4','5','6','7','8','9','10','11','12')) 48 | 49 | counts$Year = as.numeric(substring(counts$SCHOOL_YEAR,6,9)) 50 | counts$Month = month(counts$Date) 51 | names(counts)[names(counts)=="STUDENT_ANNUAL_GRADE_CODE"] = "Grade" 52 | 53 | # read school data 54 | setwd("/Volumes/appdata-3/School Reports") 55 | 56 | report_elem = read.csv("CPS_Elementary_Report_20122013.csv") 57 | report_high = read.csv("CPS_HighSchool_Report_20122013.csv") 58 | levels(report_elem$Student.Performance.Level) <- c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average") 59 | report_high$Student.Performance.Level = factor(report_high$Student.Performance.Level, c("Far Below Average", "Below Average", "Average", "Above Average", "Far Above Average")) 60 | report_elem$Overall.Rating = factor(report_elem$Overall.Rating, levels = c("Level 1", "Level 2", "Level 3")) 61 | report_high$Overall.Rating = factor(report_high$Overall.Rating, levels = c("Level 1", "Level 2", "Level 3")) 62 | 63 | school_reports <- rbind(subset(report_elem, select = c(School.ID, Student.Performance.Level, Overall.Rating)), subset(report_high, select = c(School.ID, Student.Performance.Level, 
Overall.Rating))) 64 | 65 | # function for adding grades 66 | advance_grade <- function(current_grade, years = 1) { 67 | locs = apply(matrix(current_grade),1,function(x) which(x == levels(counts$Grade))) + years 68 | is.na(locs[locs<1 | locs>length(levels(counts$Grade))]) <- TRUE 69 | levels(counts$Grade)[locs] 70 | } 71 | 72 | # Enrollments we want to predict 73 | pred_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 74 | 75 | # Last year 20th day enrollments 76 | prev_20_enrolls = subset(counts, Month == 10, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 77 | prev_20_enrolls$Year = prev_20_enrolls$Year + 1 78 | names(prev_20_enrolls)[names(prev_20_enrolls)=="Enrollments"] <- "Prev_20_Enrollments" 79 | 80 | # Last year 20th day enrollments for previous grade 81 | prev_20_enrolls_grade = prev_20_enrolls 82 | prev_20_enrolls_grade$Grade = advance_grade(prev_20_enrolls_grade$Grade, 1) 83 | names(prev_20_enrolls_grade)[names(prev_20_enrolls_grade)=="Prev_20_Enrollments"] <- "Prev_Grade_20_Enrollments" 84 | 85 | # Last year end of year enrollments 86 | prev_end_enrolls = subset(counts, Month == 6, select = c(Year, SCHOOL_KEY, SCHOOL_NAME, SCHOOL_CODE, Grade, Enrollments)) 87 | prev_end_enrolls$Year = prev_end_enrolls$Year + 1 88 | names(prev_end_enrolls)[names(prev_end_enrolls)=="Enrollments"] <- "Prev_End_Enrollments" 89 | 90 | # Last year end of year enrollments for previous grade 91 | prev_end_enrolls_grade = prev_end_enrolls 92 | prev_end_enrolls_grade$Grade = advance_grade(prev_end_enrolls_grade$Grade, 1) 93 | names(prev_end_enrolls_grade)[names(prev_end_enrolls_grade)=="Prev_End_Enrollments"] <- "Prev_Grade_End_Enrollments" 94 | 95 | # combine to one dataset 96 | enrolls <- merge(pred_enrolls, prev_20_enrolls, all.x=TRUE) 97 | enrolls <- merge(enrolls, prev_20_enrolls_grade, all.x=TRUE) 98 | enrolls <- merge(enrolls, prev_end_enrolls, all.x=TRUE) 99 | enrolls <- 
merge(enrolls, prev_end_enrolls_grade, all.x=TRUE) 100 | 101 | # add in some school information 102 | enrolls <- merge(enrolls, school_reports, all.x=TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 103 | 104 | schools_basicdata = read.csv("schools.csv") 105 | 106 | enrolls <- merge(enrolls, schools_basicdata, all.x=TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 107 | 108 | # look at 2011 to 2012 enrollments 109 | schools_fall11to12 = ddply(enrolls, .(SCHOOL_CODE), summarize, fall2011enrollment = sum(Enrollments[Year == 2012]), fall2012enrollment = sum(Enrollments[Year == 2013])) 110 | schools_fall11to12 = subset(schools_fall11to12, (fall2011enrollment > 0)|(fall2012enrollment > 0)) 111 | 112 | offby50 = schools_fall11to12[(abs(schools_fall11to12$fall2012enrollment - schools_fall11to12$fall2011enrollment) >= 100)&(schools_fall11to12$fall2011enrollment > 0)&(schools_fall11to12$fall2012enrollment > 0),] 113 | 114 | school_info = ddply(enrolls, .(SCHOOL_CODE), summarize, Min_Year = min(Year, na.rm=TRUE), Max_Year = max(Year, na.rm=TRUE)) 115 | 116 | newschools_fall2012 = school_info[(school_info$Min_Year == 2013),] 117 | closingschools_fall2012 = school_info[(school_info$Max_Year == 2012),] 118 | 119 | # ------------------------- # 120 | # new schools fall 2013 # 121 | # ------------------------- # 122 | 123 | newschools_fall2013 = school_info[(school_info$Min_Year == 2014),] 124 | 125 | # pick 1 new school from fall 2013 to analyze 126 | 127 | newschool1_fall2013 = newschools_fall2013[13,]$SCHOOL_CODE 128 | newschool1_type = schools_basicdata[(schools_basicdata$SchoolID == newschool1_fall2013),]$SchoolGradeGroup 129 | newschool1_location = schools_basicdata[(schools_basicdata$SchoolID == newschool1_fall2013),c("Longitude","Latitude")] 130 | newschool1_name = schools_basicdata[(schools_basicdata$SchoolID == newschool1_fall2013),]$SchoolName 131 | 132 | dist_school1 = rdist.earth(newschool1_location, schools_basicdata[,c("Longitude","Latitude")], miles = TRUE, R = 
NULL) 133 | schools_within1mile = schools_basicdata[(dist_school1 <= 1),] 134 | schools_within1mile = schools_within1mile[(!is.na(schools_within1mile$SchoolID)),] 135 | 136 | test = merge(schools_within1mile, enrolls, all.x = TRUE, by.x = "SchoolID", by.y = "SCHOOL_CODE") 137 | test = test[(test$SchoolGradeGroup == newschool1_type),] 138 | 139 | table(test[(test$SchoolID == newschool1_fall2013),]$Grade) 140 | 141 | test2 = ddply(test, .(SchoolID, Year, Longitude, Latitude), summarize, total_enr = sum(Enrollments, na.rm = TRUE)) 142 | test2 = test2[(!is.na(test2$Year)),] 143 | 144 | scaleparam = scale_size(limits=c(0,max(test2$total_enr)), range = c(1,15)) 145 | 146 | enr = as.data.frame(test2[(test2$Year == "2012"),]) 147 | qm3 = qmap(location = c(newschool1_location$Longitude, newschool1_location$Latitude) , zoom = 14) + labs(title = '2011-2012', size = "enrollment") +geom_point(data = enr[(enr$SchoolID != newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), show_guide = FALSE) + scaleparam 148 | print(qm3) 149 | 150 | enr = as.data.frame(test2[(test2$Year == "2013"),]) 151 | qm4 = qmap(location = c(newschool1_location$Longitude, newschool1_location$Latitude) , zoom = 14) +labs(title = '2012-2013', size = "enrollment")+ geom_point(data = enr[(enr$SchoolID != newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), show_guide = FALSE) + scaleparam 152 | print(qm4) 153 | 154 | enr = as.data.frame(test2[(test2$Year == "2014"),]) 155 | #qm5 = qmap(location = c(newschool1_location$lon, newschool1_location$lat) , zoom = 14) + labs(title = '2013-2014: newschool', size = "enrollment")+geom_point(data = enr[(enr$SCHOOL_CODE == newschool1_fall2013),], aes(x = lon.x, y = lat.x, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'red', show_guide = FALSE) + geom_point(data = enr[(enr$SCHOOL_CODE != newschool1_fall2013),], aes(x = lon.x, y = lat.x, size = total_enr), show_guide = FALSE) + scaleparam 156 | qm5 
= qmap(location = c(newschool1_location$Longitude, newschool1_location$Latitude) , zoom = 14) + labs(title = '2013-2014: Back of the Yards IB HS', size = "enrollment")+geom_point(data = enr[(enr$SchoolID == newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), color = 'red', show_guide = FALSE) + geom_point(data = enr[(enr$SchoolID != newschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), show_guide = FALSE) + scaleparam 157 | print(qm5) 158 | 159 | 160 | # school closings 161 | closeschools_fall2013 = school_info[(school_info$Max_Year == 2013),] 162 | 163 | # pick 1 school that closed in spring 2013 to analyze 164 | closeschool1_fall2013 = sample(unique(closeschools_fall2013$SCHOOL_CODE), size = 1) 165 | closeschool1_type = schools_basicdata[(schools_basicdata$SchoolID == closeschool1_fall2013),]$SchoolGradeGroup 166 | closeschool1_location = report_elem[(report_elem$School.ID == closeschool1_fall2013),c("Longitude","Latitude")] 167 | closeschool1_name = schools_basicdata[(schools_basicdata$SchoolID == closeschool1_fall2013),]$SchoolName 168 | 169 | dist_school1 = rdist.earth(closeschool1_location[c(2,1)], schools_basicdata[,c("Longitude","Latitude")], miles = TRUE, R = NULL) 170 | 171 | schools_within1mile = schools_basicdata[((dist_school1 <= 1)|(schools_basicdata$SchoolID == closeschool1_fall2013)),] 172 | schools_within1mile = schools_within1mile[(!is.na(schools_within1mile$SchoolID)),] 173 | 174 | schools_within1mile[(schools_within1mile$SchoolID == closeschool1_fall2013),]$Longitude = closeschool1_location$Latitude 175 | schools_within1mile[(schools_within1mile$SchoolID == closeschool1_fall2013),]$Latitude = closeschool1_location$Longitude 176 | 177 | test = merge(schools_within1mile, enrolls, all.x = TRUE, by.x = "SchoolID", by.y = "SCHOOL_CODE") 178 | test = test[(test$SchoolGradeGroup == closeschool1_type),] 179 | 180 | table(test[(test$SchoolID == closeschool1_fall2013),]$Grade) # K - 8 181 | test = 
test[-(test$Grade == 'PE'),] 182 | test = test[-(test$Grade == 'PK'),] 183 | 184 | test2 = ddply(test, .(SchoolID, Year, Longitude, Latitude), summarize, total_enr = sum(Enrollments, na.rm = TRUE)) 185 | #test2 = test2[(!is.na(test2$Year)),] 186 | 187 | enr = test2[(test2$Year == "2010"),] 188 | qm1 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude), zoom = 14) + labs(title = '2010') + geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan') + geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 189 | 190 | enr = test2[(test2$Year == "2011"),] 191 | qm2 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) + labs(title = '2011')+geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan')+ geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 192 | 193 | enr = test2[(test2$Year == "2012"),] 194 | qm3 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) + labs(title = '2012') +geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan') +geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 195 | 196 | enr = test2[(test2$Year == "2013"),] 197 | qm4 = 
qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) +labs(title = '2013')+ geom_point(data = enr[(enr$SchoolID == closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr), position = position_jitter(w = .0005, h = .0005), color = 'cyan') +geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 198 | 199 | enr = test2[(test2$Year == "2014"),] 200 | qm5 = qmap(location = c(closeschool1_location$Latitude, closeschool1_location$Longitude) , zoom = 14) + labs(title = '2014') + geom_point(data = enr[(enr$SchoolID != closeschool1_fall2013),], aes(x = Longitude, y = Latitude, size = total_enr)) + scale_size(breaks = c(200,400,600,800,1000), limits=c(0,1000), range = c(1,15)) 201 | 202 | pushViewport(viewport(layout = grid.layout(2, 3))) 203 | print(qm1, vp = viewport(layout.pos.row = 1, layout.pos.col = 1)) 204 | print(qm2, vp = viewport(layout.pos.row = 1, layout.pos.col = 2)) 205 | print(qm3, vp = viewport(layout.pos.row = 1, layout.pos.col = 3)) 206 | print(qm4, vp = viewport(layout.pos.row = 2, layout.pos.col = 1)) 207 | print(qm5, vp = viewport(layout.pos.row = 2, layout.pos.col = 2)) 208 | -------------------------------------------------------------------------------- /Data_Analysis/explore_8_to_9_transition.R: -------------------------------------------------------------------------------- 1 | setwd("//admin/appdata/DataWarehouse/DSSG") 2 | load("Student_Data/students.RData") 3 | library(reshape2) 4 | library(ggplot2) 5 | 6 | students$High_School = ifelse(is.na(students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL), 7 | "None", as.character(students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL)) 8 | students$High_School_Code = ifelse(is.na(students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL_CODE), 9 | 0, students$NEXT_GRADE_STUDENT_ANNUAL_SCHOOL_CODE) 10 | 11 | students2010 = subset(students, 
SCHOOL_YEAR == "2010-2011") 12 | 13 | transition_codes = table(students2010$STUDENT_ANNUAL_SCHOOL_CODE, 14 | students2010$High_School_Code) 15 | transition_codes_years = table(students$STUDENT_ANNUAL_SCHOOL_CODE, 16 | students$High_School_Code, 17 | students$SCHOOL_YEAR) 18 | 19 | 20 | 21 | trans_codes_m = melt(transition_codes_years, varnames = c("ES_code", "HS_code", "SCHOOL_YEAR")) 22 | trans_codes_m = trans_codes_m[trans_codes_m$value>0,] 23 | head(trans_codes_m) 24 | 25 | schools = read.csv("School Reports/schools.csv") 26 | schools_loc = subset(schools, select = c("SCHOOL_CODE", "lat", "lon", "SCHOOL_TYPE")) 27 | 28 | trans_codes_m = merge(trans_codes_m, schools_loc[, -4], by.x = "ES_code", by.y = "SCHOOL_CODE", all.x = TRUE) 29 | names(trans_codes_m)[5:6] <- paste("ES", c("lat","lon"), sep = "_") 30 | 31 | trans_codes_m = merge(trans_codes_m, schools_loc, by.x = "HS_code", by.y = "SCHOOL_CODE", all.x = TRUE) 32 | names(trans_codes_m)[7:8] <- paste("HS", c("lat","lon"), sep = "_") 33 | 34 | # plot on map for some schools #### 35 | library(ggmap) 36 | pdf("Visualizations/R plots/explore_8_to_9.pdf") 37 | source_school_codes = sample(unique(trans_codes_m$ES_code),20) 38 | # map = get_googlemap('chicago', zoom = 11) 39 | for (sc in source_school_codes) { 40 | es_trans = as.data.frame(trans_codes_m[trans_codes_m$ES_code == sc & trans_codes_m$SCHOOL_YEAR == "2010-2011",]) 41 | if (!is.na(es_trans$ES_lat[1])) { 42 | school_name = as.character(schools$SCHOOL_SHORT_NAME[schools$SCHOOL_CODE == es_trans$ES_code[1]]) 43 | qm <- qmap("Chicago", zoom = 10) + labs(size = "# Students", title = school_name) + 44 | geom_point(data = es_trans[1,], aes(x = ES_lon, y = ES_lat), shape = "x", colour = "black", size = 10) + 45 | geom_point(data = es_trans, aes(x = HS_lon, y = HS_lat, size = value), colour = "red") 46 | print(qm) 47 | } 48 | } 49 | dev.off() 50 | 51 | # plot with "background singers" #### 52 | high_schools = 
as.data.frame(unique(trans_codes_m[!is.na(trans_codes_m$HS_lat), 7:9])) 53 | plot_base = ggplot(high_schools, aes(HS_lon, HS_lat)) + geom_point(aes(colour = SCHOOL_TYPE), shape = 1) + coord_map() 54 | 55 | source_school_codes = sample(unique(trans_codes_m$ES_code),20) 56 | for (sc in source_school_codes) { 57 | es_trans = as.data.frame(trans_codes_m[trans_codes_m$ES_code == sc, ]) 58 | es_trans[es_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(es_trans$HS_code == 0)) 59 | if (!is.na(es_trans$ES_lat[1])) { 60 | school_name = as.character(schools$SCHOOL_SHORT_NAME[schools$SCHOOL_CODE == es_trans$ES_code[1]]) 61 | school_type = as.character(schools$SCHOOL_TYPE[schools$SCHOOL_CODE == es_trans$ES_code[1]]) 62 | p <- plot_base + geom_point(data = es_trans, aes(size = value, colour = SCHOOL_TYPE)) + 63 | geom_point(data = es_trans, aes(x = ES_lon, y = ES_lat, colour = school_type), shape = "x", size = 10) + 64 | labs(size = "# Students", title = paste0(school_name, " (", school_type, ")"), x = "Latitude", y = "Longitude") + 65 | facet_wrap( ~ SCHOOL_YEAR, nrow = 1) 66 | print(p) 67 | } 68 | } 69 | 70 | # distance between elem school and high schools #### 71 | rdist.earth.vec <- function(x1, x2, miles = TRUE, R = NULL) { 72 | # modified from fields::rdist.earth 73 | if (is.null(R)) { 74 | if (miles) 75 | R <- 3963.34 76 | else R <- 6378.388 77 | } 78 | coslat1 <- cos((x1[, 2] * pi)/180) 79 | sinlat1 <- sin((x1[, 2] * pi)/180) 80 | coslon1 <- cos((x1[, 1] * pi)/180) 81 | sinlon1 <- sin((x1[, 1] * pi)/180) 82 | 83 | coslat2 <- cos((x2[, 2] * pi)/180) 84 | sinlat2 <- sin((x2[, 2] * pi)/180) 85 | coslon2 <- cos((x2[, 1] * pi)/180) 86 | sinlon2 <- sin((x2[, 1] * pi)/180) 87 | 88 | pp <- cbind(coslat1 * coslon1, coslat1 * sinlon1, sinlat1) * 89 | cbind(coslat2 * coslon2, coslat2 * sinlon2, sinlat2) 90 | pp = rowSums(pp) 91 | return(R * acos(ifelse(abs(pp) > 1, 1 * sign(pp), pp))) 92 | } 93 | 94 | trans_codes_m$Distance = with(trans_codes_m, 
rdist.earth.vec(cbind(ES_lon, ES_lat), cbind(HS_lon, HS_lat))) 95 | 96 | trans_codes_m2010 = subset(trans_codes_m, SCHOOL_YEAR == "2010-2011" & !is.na(Distance)) 97 | ggplot(trans_codes_m2010, aes(Distance, weight = value)) + geom_histogram(binwidth=1) + facet_wrap( ~ SCHOOL_TYPE, scales = "free_y") 98 | 99 | wquantile <- function(v,w=rep(1,length(v)),p=.5) { 100 | # from https://stat.ethz.ch/pipermail/r-help/2009-February/188762.html 101 | if (!is.numeric(v) || !is.numeric(w) || length(v) != length(w)) 102 | stop("Values and weights must be equal-length numeric vectors") 103 | if ( !is.numeric(p) || any( p<0 | p>1 ) ) 104 | stop("Quantiles must be 0<=p<=1") 105 | ranking <- order(v) 106 | sumw <- cumsum(w[ranking]) 107 | if ( is.na(w[1]) || w[1]<0 ) stop("Weights must be non-negative numbers") 108 | plist <- sumw/sumw[length(sumw)] 109 | sapply(p, function(p) v [ ranking [ which.max( plist >= p ) ] ]) 110 | } 111 | 112 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Regular"), 113 | wquantile(Distance, value, p = .9)) 114 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Charter"), 115 | wquantile(Distance, value, p = .9)) 116 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Performance"), 117 | wquantile(Distance, value, p = .9)) 118 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Contract"), 119 | wquantile(Distance, value, p = .9)) 120 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Alternative"), 121 | wquantile(Distance, value, p = .9)) 122 | with(subset(trans_codes_m2010), 123 | wquantile(Distance, value, p = 0.8569962)) 124 | 125 | with(subset(trans_codes_m2010, SCHOOL_TYPE == "Regular"), 126 | sum(value[Distance<6.27])/sum(value)) 127 | 128 | 129 | # open Illinois zip code shape files #### 130 | setwd("//admin/appdata/DataWarehouse/DSSG/Other Datasets/IllinoisZipCodes") 131 | # http://geocommons.com/overlays/305198 132 | library(rgdal) 133 | library(maptools) 134 | library(rgeos) 135 | 136 | ilzips = readOGR(dsn=".", layer="Zip_Codes") 137 | 
# Attach row ids, repair invalid polygon geometry (zero-width buffer), and
# flatten the zip-code SpatialPolygonsDataFrame into a ggplot-friendly frame.
ilzips@data$id = rownames(ilzips@data)
ilzips = gBuffer(ilzips, width=0, byid=TRUE)
ilzips.points = fortify(ilzips, region="id")
ilzips.df = plyr::join(ilzips.points, ilzips@data, by="id")


# look at zip code source to high school ####
# Count students by (home zip, destination high school, school year).
setwd("//admin/appdata/DataWarehouse/DSSG")
transition_zips_years = table(students$STUDENT_POSTAL_CODE,
                              students$High_School_Code,
                              students$SCHOOL_YEAR)

# Long format; keep only observed (zip, HS, year) combinations.
trans_codes_m = melt(transition_zips_years, varnames = c("ZipCode", "HS_code", "SCHOOL_YEAR"))
trans_codes_m = trans_codes_m[trans_codes_m$value>0,]
head(trans_codes_m)

schools = read.csv("School Reports/schools.csv")
schools_loc = subset(schools, select = c("SCHOOL_CODE", "lat", "lon", "SCHOOL_TYPE"))

library(zipcode)
data(zipcode)

# Attach zip centroid coordinates, then destination-school coordinates.
# NOTE(review): the positional renames [5:6] and [7:8] depend on the exact
# column order these merges produce -- fragile if upstream columns change.
trans_codes_m = merge(trans_codes_m, zipcode[, c(1, 4, 5)], by.x = "ZipCode", by.y = "zip", all.x = TRUE)
names(trans_codes_m)[5:6] <- paste("zip", c("lat","lon"), sep = "_")

trans_codes_m = merge(trans_codes_m, schools_loc, by.x = "HS_code", by.y = "SCHOOL_CODE", all.x = TRUE)
names(trans_codes_m)[7:8] <- paste("HS", c("lat","lon"), sep = "_")

library(dplyr)
# Zip codes sending at least 1000 students over the whole period, largest first.
big_zips = tbl_df(trans_codes_m) %>%
  group_by(ZipCode) %>%
  summarise(Students = sum(value)) %>%
  filter(Students >= 1000) %>%
  arrange(desc(Students))

# Per-(zip, year) totals -> percent of a zip's students going to each HS.
total_per_zip_year = tbl_df(trans_codes_m) %>%
  group_by(ZipCode, SCHOOL_YEAR) %>%
  summarise(TotalStudents = sum(value))
trans_codes_m = merge(trans_codes_m, total_per_zip_year)
trans_codes_m$Percent = trans_codes_m$value / trans_codes_m$TotalStudents * 100
# trans_codes_m$Distance = with(trans_codes_m, rdist.earth.vec(cbind(zip_lon, zip_lat), cbind(HS_lon, HS_lat)))


# plot
# source_zips = sort(sample(as.character(unique(trans_codes_m$ZipCode))[-1],20))
library(maps)
# One row per (year, HS location, type) for the background point layer.
high_schools =
  as.data.frame(unique(subset(trans_codes_m, !is.na(HS_lat),
                              c("SCHOOL_YEAR", "HS_lat", "HS_lon", "SCHOOL_TYPE"))))

source_zips = big_zips[1:10, 1]
z = source_zips[2]
size_range = range(trans_codes_m[trans_codes_m$ZipCode %in% source_zips, "Percent"])


# Shared base map: HS locations over a state outline, faceted by year.
plot_base = ggplot(high_schools, aes(HS_lon, HS_lat)) +
  labs(size = "% of Students", y = "Latitude", x = "Longitude", colour = "High School Type") +
  scale_size_continuous(breaks = seq(10, floor(size_range[2]/10)*10, by = 10), limit = size_range, range = c(2, 6)) +
  facet_wrap( ~ SCHOOL_YEAR, nrow = 1) + geom_path( data=map_data("state"), aes(x=long, y=lat, group = group),colour="black") +
  coord_map(xlim = range(high_schools$HS_lon), ylim = range(high_schools$HS_lat)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
# One PDF page per large source zip.
pdf("Visualizations/R plots/explore_zip_to_9.pdf", hei=7, wid = 11)
for (z in source_zips) {
  cat(which(z == source_zips), " ")  # progress indicator
  zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
  # HS_code 0: no CPS high school; give those rows a dummy plotting location.
  if (sum(zip_trans$HS_code == 0)>0)
    zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))
  if (!is.na(zip_trans$zip_lat[1])) {
    zip_code = as.character(z)
    zip_poly = geom_polygon(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), fill="grey", alpha = .8)
    # zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
    p <- plot_base + zip_poly + geom_point(aes(colour = SCHOOL_TYPE), shape = 1) +
      geom_point(data = zip_trans, aes(size = Percent, colour = SCHOOL_TYPE)) +
      ggtitle(paste0("Zip Code = ", zip_code))
    # geom_point(data = zip_trans, aes(x = zip_lon, y = zip_lat), shape = "x", size = 10)
    print(p)
  }
}
dev.off()


# same as above with google map in background
# library(ggmap)
# plot_base = qmap("Chicago", zoom = 10) +
#   labs(size = "% of Students", x = "Latitude", y = "Longitude", colour = "High School Type") +
#   scale_size_continuous(breaks = seq(10, floor(size_range[2]/10)*10, by = 10), limit = size_range, range = c(3, 10)) +
#   facet_wrap( ~ SCHOOL_YEAR, nrow = 1) # + xlim(range(high_schools$HS_lat)) + ylim(range(high_schools$HS_lon))
# for (z in source_zips) {
#   zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
#   if (sum(zip_trans$HS_code == 0)>0)
#     zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))
#   if (!is.na(zip_trans$zip_lat[1])) {
#     zip_code = as.character(z)
#     zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
#     p <- plot_base + zip_poly +
#       geom_point(data = high_schools, aes(HS_lon, HS_lat, colour = SCHOOL_TYPE), shape = 1, size = 3) +
#       geom_point(data = zip_trans, aes(HS_lon, HS_lat, size = Percent, colour = SCHOOL_TYPE)) +
#       ggtitle(paste0("Zip Code = ", zip_code))
#     # geom_point(data = zip_trans, aes(x = zip_lon, y = zip_lat), shape = "x", size = 10)
#     print(p)
#   }
# }

# plot for OpenGov Hack Night
# Restrict to the most recent year; student counts (not percents) drive point size.
trans_codes_m = subset(trans_codes_m, SCHOOL_YEAR == "2012-2013")

high_schools = as.data.frame(unique(subset(trans_codes_m, !is.na(HS_lat),
                                           c("SCHOOL_YEAR", "HS_lat", "HS_lon", "SCHOOL_TYPE"))))

source_zips = big_zips[1:10, 1]
size_range = range(trans_codes_m[trans_codes_m$ZipCode %in% source_zips, "Percent"])


plot_base = ggplot(high_schools, aes(HS_lon, HS_lat)) +
  labs(size = "# of Students", y = "Latitude", x = "Longitude") +
  scale_size_continuous(range = c(2, 6)) + # breaks = seq(5, 25, by = 5), limit = size_range,
  geom_path( data=map_data("state"), aes(x=long, y=lat, group = group),colour="black") +
  coord_map(xlim = range(high_schools$HS_lon), ylim = range(high_schools$HS_lat)) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

# Single-zip demo plot for the second-largest source zip.
z = source_zips[2]
zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
if (sum(zip_trans$HS_code == 0)>0)
  zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))
if (!is.na(zip_trans$zip_lat[1])) {
  zip_code = as.character(z)
  zip_poly = geom_polygon(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), fill="grey", alpha = .8)
  # zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
  p <- plot_base + zip_poly + geom_point(shape = 1) +
    geom_point(data = zip_trans, aes(size = value)) +
    ggtitle(paste0("Zip Code = ", zip_code))
  # geom_point(data = zip_trans, aes(x = zip_lon, y = zip_lat), shape = "x", size = 10)
  print(p)
}


library(ggmap)

# plot_base = qmap("Chicago", zoom = 10, maptype = "roadmap") +
# colMeans(high_schools[,c(3,2)])
# Google-map background centered on Chicago.
plot_base = qmap(location = c(-87.70831, 41.84566), zoom = 11, maptype = "terrain") +
  labs(size = "Students", x = "Latitude", y = "Longitude")
# + xlim(range(high_schools$HS_lat)) + ylim(range(high_schools$HS_lon))

z = big_zips[2,1]
# One map per large source zip: zip polygon, all HS locations, flows sized by count.
pdf("Visualizations/R plots/zip_to_HS_maps.pdf")
for (z in big_zips[,1]) {
  print(z)
  zip_trans = as.data.frame(trans_codes_m[trans_codes_m$ZipCode == z, ])
  if (sum(zip_trans$HS_code == 0)>0)
    zip_trans[zip_trans$HS_code == 0, 7:8] = rep(c(41.9, -87.55), each = sum(zip_trans$HS_code == 0))

  zip_code = as.character(z)
  # zip_poly = geom_path(data = subset(ilzips.df, ZCTA5CE10 == z), aes(long,lat,group=group), colour="black")
  zip_poly = geom_polygon(data = subset(ilzips.df, ZCTA5CE10 == z),
                          aes(long,lat,group=group), fill="blue", alpha = .5)
  p <- plot_base + zip_poly +
    geom_point(data = high_schools, aes(HS_lon, HS_lat), shape = 1, size = 3) +
    geom_point(data = zip_trans, aes(HS_lon, HS_lat, size = value)) +
    scale_size_continuous(range = c(3, 9)) +
    ggtitle(paste0("Zip Code = ", zip_code)) + theme(legend.text = element_text(size = 16), legend.title = element_text(size = 16))
  print(p)
}
dev.off()
--------------------------------------------------------------------------------
/Data_Analysis/explore_enroll_totals_from_website.Rmd:
--------------------------------------------------------------------------------
Exploratory Data Analysis of Aggregate Enrollments
========================================================

Andrew Landgraf

First, pull in the data created by Zhou, which was downloaded from [here](http://www.cps.edu/SchoolData/Pages/SchoolData.aspx). Also, look at the structure of the data.

```{r}
# setwd("~/../github/predicting_student_enrollment/")
load("enrollment_all_data.Rdata")
# load("enrollment_data.Rdata")
str(output)
head(output[,,1])
```

Transform the data into a data frame, with one row per school and year.

```{r}
library(plyr)
options(stringsAsFactors = FALSE)
# adply over the 3rd margin: one data-frame row per year slice
dat=adply(output, 3, rbind.fill)
str(dat)
```

Edit the variables types and add a `Year` variable.

```{r}
dat$Year = as.numeric(dat$X1) + 2005
dat$School_ID = as.numeric(dat$School_ID)
# dat$School_Name = as.factor(dat$School_Name)
dat$School_Type = as.factor(dat$School_Type)

# columns 5-20 are the per-grade enrollment counts
for (col in 5:20) {
  dat[, col] = as.numeric(dat[, col])
}
```

Add the total enrollments of the school for the first year of the data set (2006) in order to make comparisons.

```{r}
# Baseline (2006) totals, merged onto every year for ratio comparisons.
dat2006 = subset(dat, Year == 2006, select = c("School_ID", "Total"))
colnames(dat2006)[2] = "Total_2006"
dat = merge(dat, dat2006)
```

The total enrollments should match the sum of the enrollments by grade, for each school and year.

```{r}
# Sanity check: per-grade sums vs. reported totals (max discrepancy should be 0).
plot(rowSums(subset(dat,select=c(PE:G12))), dat$Total)
max( abs( rowSums(subset(dat,select=c(PE:G12))) - dat$Total))
```


Plot the total enrollments by year of each school. Also plot the ratio of the yearly enrollments and the enrollments in the first year (2006). These are broken up by charter and non-charter schools.

```{r fig.width=8, fig.height=6}
library(ggplot2)
ggplot(subset(dat), aes(Year, Total, group = School_ID)) + facet_wrap( ~ School_Type) + geom_line()

ggplot(subset(dat), aes(Year, Total / Total_2006, group = School_ID)) + geom_line(aes(alpha = Total_2006)) +
  facet_wrap( ~ School_Type) + scale_y_log10()
```

Summarize the schools by their average yearly enrollment and the SD of the yearly enrollment. I did this to find a school that consistently has lots of students.
```{r}
head(ddply(dat, .(School_ID), summarize, mean = mean(Total), sd = sd(Total)))
```

For one school (Portage Park), plot some yearly enrollment by grade.

```{r fig.width=8, fig.height=6}
# Portage Park (School_ID 610135): per-grade enrollment over time.
dat_school = subset(dat, School_ID == 610135, select=c(Year,PK:G08))
dat_school = dat_school[order(dat_school$Year), ]

library(reshape2)
school_m = melt(dat_school, id = "Year")
ggplot(school_m, aes(Year, value, colour = variable)) + geom_line() + facet_wrap( ~ variable)
ggplot(school_m, aes(Year, variable, fill = cut(value,8))) + geom_tile() +
  scale_fill_brewer(type = "div", palette = 1) + labs(title = "Portage Park", y = "Grade", fill = "# Enrolled")
```

From the tile plot, you can see some trends of high and low enrollments following each year.
--------------------------------------------------------------------------------
/Data_Analysis/grade9_transition_matrix.R:
--------------------------------------------------------------------------------
library("plyr")
library("GISTools")
library("ggplot2")

# 9th-grade to/from counts for 2013 plus the school lookup table.
setwd("/Volumes/appdata-3/School Reports")
data = read.csv("tbl_9thgrade_toandfrom_tracy.csv", stringsAsFactors = F)
data2013 = subset(data, year == 2013)
schoolstable = read.csv("schools.csv")

# High-school attendance boundaries; keep only boundaries covering grade 9.
setwd("/Volumes/appdata-3/Other Datasets/public neighborhood data/CPS High School Attendance Boundaries SY13 14")
HS_boundaries = readShapePoly(fn = "geo_aeud-d3nt-1")
HS_boundaries_data = slot(HS_boundaries, "data")
HS_boundaries_data = subset(HS_boundaries_data, BoundaryGr %in% c("9, 10", "9, 10, 11, 12"))
HS_catchment1314 = unique(HS_boundaries_data$SchoolID)
HS_catchment1314_keys = subset(schoolstable, SchoolID %in% HS_catchment1314, select = SchoolKey)

# Per-catchment totals, used as the denominator for transition proportions.
catchmentsum = ddply(data2013, .(CATCHMENT_SCHOOL_KEY), summarize, catchmentsum = sum(Freq))

data2013 = merge(data2013, catchmentsum, all.x = T, by.x = "CATCHMENT_SCHOOL_KEY", by.y = "CATCHMENT_SCHOOL_KEY")

# Keep only students whose catchment school actually serves grade 9.
data2013 = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

# merge lat long
# Attach the destination school's coordinates (used to sort schools N to S).
data2013 = merge(data2013, schoolstable[,c("SchoolKey","Longitude","Latitude")], all.x = T, by.x = "NEXT_GRADE_SCHOOL_KEY", by.y = "SchoolKey")
data2013 = subset(data2013, !is.na(CATCHMENT_SCHOOL_NAME))
data2013 = subset(data2013, !is.na(NEXT_GRADE_SCHOOL_NAME))

# Proportion of the catchment's students going to each destination, and a
# 0/1 flag for "stayed at the catchment school".
data2013$catchmentprop = data2013$Freq/data2013$catchmentsum
data2013$iscatchment = 1*(data2013$NEXT_GRADE_SCHOOL_KEY == data2013$CATCHMENT_SCHOOL_KEY)

# all together sorted N to S
data2013$NEXT_GRADE_SCHOOL_NAME = with(data2013, reorder(NEXT_GRADE_SCHOOL_NAME, Latitude))
data2013$CATCHMENT_SCHOOL_NAME = factor(data2013$CATCHMENT_SCHOOL_NAME, levels = intersect(levels(data2013$NEXT_GRADE_SCHOOL_NAME), unique(data2013$CATCHMENT_SCHOOL_NAME)))

# Transition-matrix heat map (sqrt scale to make small flows visible).
p <- ggplot(data2013, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_y_discrete(limits = rev(levels(data2013$CATCHMENT_SCHOOL_NAME)))
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# catchment schools only
data2013_catch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_catch = subset(data2013_catch, NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

data2013_catch$NEXT_GRADE_SCHOOL_NAME = with(data2013_catch, reorder(NEXT_GRADE_SCHOOL_NAME, Latitude))
data2013_catch$CATCHMENT_SCHOOL_NAME = factor(data2013_catch$CATCHMENT_SCHOOL_NAME, levels = levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))

p <- ggplot(data2013_catch,
            aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = 8, angle = 270, hjust = 0, colour = "grey50"))
# without names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# Same matrix, but highlighting only the diagonal "stayed in catchment" cells.
p2 <- ggplot(data2013_catch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = iscatchment), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p2 + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = 8, angle = 270, hjust = 0, colour = "grey50"))
# without names
p2 + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# non-catchment schools only
data2013_noncatch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_noncatch = subset(data2013_noncatch, !
  NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

data2013_noncatch$NEXT_GRADE_SCHOOL_NAME = with(data2013_noncatch, reorder(NEXT_GRADE_SCHOOL_NAME, Latitude))
data2013_noncatch$CATCHMENT_SCHOOL_NAME = factor(data2013_noncatch$CATCHMENT_SCHOOL_NAME, levels = levels(data2013_catch$CATCHMENT_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))


p <- ggplot(data2013_noncatch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_noncatch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_text(size = 8, angle = 270, hjust = 0, colour = "grey50"))
# without names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())


# all schools, by rating
setwd("/Volumes/appdata-3/Master_Data")
schoolfeatures = read.csv("schools_changing_features_2013.csv", stringsAsFactors = F)

# Attach the catchment school's rating/safety/mobility features.
data2013 = merge(data2013, schoolstable[,c("SchoolKey", "SchoolID")], all.x = T, by.x = "CATCHMENT_SCHOOL_KEY", by.y = "SchoolKey")
data2013 = merge(data2013, schoolfeatures[,c("SchoolID","Rating","Safety","Mobility")], all.x = T, by.x = "SchoolID", by.y = "SchoolID")
table(data2013$Rating)
data2013$Rating2 = as.numeric(as.factor(data2013$Rating))

# catchment schools only
data2013_catch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_catch = subset(data2013_catch, NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

# Reorder both axes by the catchment school's numeric rating.
data2013_catch$CATCHMENT_SCHOOL_NAME = with(data2013_catch, reorder(CATCHMENT_SCHOOL_NAME, Rating2))
data2013_catch$NEXT_GRADE_SCHOOL_NAME = factor(data2013_catch$NEXT_GRADE_SCHOOL_NAME, levels = levels(data2013_catch$CATCHMENT_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))

p <- ggplot(data2013_catch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_catch$NEXT_GRADE_SCHOOL_NAME)))
# with names
# Hairline geom_rects mark rating-group boundaries (positions hard-coded for
# the 47 catchment schools in this snapshot).
p + theme(legend.position = "none", axis.ticks = element_blank(),
          axis.title.x = element_blank(), axis.title.y = element_blank(),
          axis.text.x = element_text(size = 8, angle = 270, hjust = 0,
                                     colour = "grey50")) +
  geom_rect(aes(xmin = 0.5, xmax = 47.5, ymin = 5.5, ymax = 5.5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(xmin = 0.5, xmax = 47.5, ymin = 30.5, ymax = 30.5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(ymin = 0.5, ymax = 47.5, xmin = 47.5-5, xmax = 47.5-5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(ymin = 0.5, ymax = 47.5, xmin = 47.5-30, xmax = 47.5-30),
            fill = "transparent", color = "black", size = 1)

# non-catchment
data2013_noncatch = subset(data2013, CATCHMENT_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))
data2013_noncatch = subset(data2013_noncatch, !
  NEXT_GRADE_SCHOOL_KEY %in% as.matrix(HS_catchment1314_keys))

data2013_noncatch$CATCHMENT_SCHOOL_NAME = factor(data2013_noncatch$CATCHMENT_SCHOOL_NAME, levels = levels(data2013_catch$CATCHMENT_SCHOOL_NAME))
#data2013$NEXT_GRADE_SCHOOL_NAME = factor(data2013$NEXT_GRADE_SCHOOL_NAME, levels = c(levels(data2013$CATCHMENT_SCHOOL_NAME), setdiff(data2013$NEXT_GRADE_SCHOOL_NAME, data2013$CATCHMENT_SCHOOL_NAME)))


p2 <- ggplot(data2013_noncatch, aes(NEXT_GRADE_SCHOOL_NAME, CATCHMENT_SCHOOL_NAME)) + geom_tile(aes(fill = catchmentprop), colour = "white") + scale_fill_gradient(low = "white", high = "#000066", trans = "sqrt", limits=c(0,1)) + scale_x_discrete(limits = rev(levels(data2013_noncatch$NEXT_GRADE_SCHOOL_NAME)))
# with names
p2 + theme(legend.position = "none", axis.ticks = element_blank(),
           axis.title.x = element_blank(), axis.title.y = element_blank(),
           axis.text.x = element_text(size = 8, angle = 270, hjust = 0,
                                      colour = "grey50")) +
  geom_rect(aes(xmin = 0.5, xmax = 103.5, ymin = 5.5, ymax = 5.5),
            fill = "transparent", color = "black", size = 1) +
  geom_rect(aes(xmin = 0.5, xmax = 103.5, ymin = 30.5, ymax = 30.5),
            fill = "transparent", color = "black", size = 1)


# without names
p + theme(legend.position = "none", axis.ticks = element_blank(), axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())
--------------------------------------------------------------------------------
/Data_Analysis/history_evaluation.R:
--------------------------------------------------------------------------------
######################################
###Historical Projection Evaluation###
######################################

###CPS Team###
###07/21/2014###

rm(list=ls())
setwd("/Volumes/DSSG/Master_Data/")
suppressMessages(library(ggplot2))
suppressMessages(library(reshape2))
suppressMessages(library(gridExtra))
# CPS projections (csp*) and realized enrollments (true*) for 2009-2013,
# plus the hand-edited projection for 2013.
pred2009 <- read.csv("csp2009.csv")
pred2010 <- read.csv("csp2010.csv")
pred2011 <- read.csv("csp2011.csv")
pred2012 <- read.csv("csp2012.csv")
pred2013 <- read.csv("csp2013.csv")
true2009 <- read.csv("true2009.csv")
true2010 <- read.csv("true2010.csv")
true2011 <- read.csv("true2011.csv")
true2012 <- read.csv("true2012.csv")
true2013 <- read.csv("true2013.csv")
edit <- read.csv("cps_adjust.csv")

###Process Data###
# Per-school misprojection (projection minus actual) for the schools present
# in both the prediction and the truth table. Rows are aligned by SchoolID.
mis_projection <- function(pred, real) {
  id <- intersect(pred$SchoolID, real$SchoolID)
  pred <- subset(pred, pred$SchoolID%in%id)
  real <- subset(real, real$SchoolID%in%id)
  pred <- pred[order(pred$SchoolID),]
  real <- real[order(real$SchoolID),]
  data <- pred-real #projection-actual
  data$SchoolID <- sort(id)
  return(data)
}
data2009 <- mis_projection(pred2009, true2009)
data2010 <- mis_projection(pred2010, true2010)
data2011 <- mis_projection(pred2011, true2011)
data2012 <- mis_projection(pred2012, true2012)
data2013 <- mis_projection(pred2013, true2013)
data2009$year <- 2009
data2010$year <- 2010
data2011$year <- 2011
data2012$year <- 2012
data2013$year <- 2013

###Evaluation###
# Bin one misprojection column (`name`) into five labelled categories around
# +/-25 and +/-`threshold`.
data_plot <- function(data, name, threshold) {
  temp <- data.frame(year=data$year, miss=unlist(data[name]))
  temp$indicator <- NA
  # FIX: the "Over-projected By 25" branch was lost when this file was
  # mangled (the factor levels below prove it existed); reconstructed here.
  # Also switched to vectorized index assignment, which is a no-op when a
  # category is empty (temp[cond, ]$indicator <- ... errors on zero rows).
  temp$indicator[temp$miss>=threshold] <- "Over-projected By 100"
  temp$indicator[temp$miss>25 & temp$miss<threshold] <- "Over-projected By 25"
  temp$indicator[temp$miss>= -25 & temp$miss<=25] <- "Tolerably Mis-projected"
  temp$indicator[temp$miss< -25 & temp$miss>-threshold] <- "Under-projected By 25"
  temp$indicator[temp$miss<=-threshold] <- "Under-projected By 100"
  temp$indicator <- factor(temp$indicator, levels=c("Under-projected By 100", "Under-projected By 25", "Tolerably Mis-projected", "Over-projected By 25", "Over-projected By 100"))
  return(temp)
}
# Per-year counts of schools and of schools missed by at least `threshold`.
data_error <- function(data, threshold) {
  err <- data.frame(Year=c(2009, 2010, 2011, 2012, 2013), Total=NA, Error=NA)
  for (i in 1:nrow(err)) {
    temp <- subset(data, data$year==err[i,]$Year)
    err[i,]$Total <- nrow(temp)
    err[i,]$Error <- length(which(abs(temp$miss)>=threshold))
  }
  return(err)
}
#school
data2009_new <- data_plot(data2009, "ATOT", 100)
data2010_new <- data_plot(data2010, "ATOT", 100)
data2011_new <- data_plot(data2011, "ATOT", 100)
data2012_new <- data_plot(data2012, "ATOT", 100)
data2013_new <- data_plot(data2013, "ATOT", 100)
data_all <- do.call("rbind", list(data2009_new, data2010_new, data2011_new, data2012_new, data2013_new))
data_err <- data_error(data_all, 100)
# Stacked bar of misprojection categories per year, annotated with the count
# of schools missed by 100+.
ggplot()+
  geom_bar(data=data_all, aes(x=factor(year), fill=indicator), binwidth=0.5, position="stack")+
  geom_text(data=data_err, aes(x=factor(Year), y=Total+10, label=Error), vjust=0)+
  xlab("Year")+ylab("Number of Schools")

###Serious Mis-projection By Year###
# Per grade and year: number of schools whose grade projection was off by 25+.
name <- c("K", "Grade1", "Grade2", "Grade3", "Grade4", "Grade5", "Grade6", "Grade7", "Grade8", "Grade9", "Grade10", "Grade11", "Grade12")
grade <- data.frame(matrix(NA, 5, length(name)+1))
colnames(grade) <- c("Year", name)
grade$Year <- 2009:2013
for (i in 1:length(name)) {
  grade[1,i+1] <- length(which(abs(unlist(data2009[name[i]]))>=25))
  grade[2,i+1] <- length(which(abs(unlist(data2010[name[i]]))>=25))
  grade[3,i+1] <- length(which(abs(unlist(data2011[name[i]]))>=25))
  grade[4,i+1] <- length(which(abs(unlist(data2012[name[i]]))>=25))
  grade[5,i+1] <- length(which(abs(unlist(data2013[name[i]]))>=25))
}
grade_melted <- melt(grade, id=1)
ggplot(data=grade_melted, aes(x=Year, y=value, colour=variable))+geom_line()+xlab("Year")+ylab("Number of Schools")

###Compare CSM And Edit###
real <-
data.frame(SchoolID=true2013$SchoolID, Total=true2013$ATOT)
csmp <- data.frame(SchoolID=pred2013$SchoolID, ATOT=pred2013$ATOT)
before <- merge(real, csmp)
after <- merge(real, edit)
# Misprojection histograms before vs. after the manual edit, side by side.
before_plot <- ggplot(before, aes(x=ATOT-Total))+geom_histogram(binwidth=5)+xlab("Mis-project")+ylab("Number of Schools")+xlim(-500, 500)+ggtitle("Before Edit")
after_plot <- ggplot(after, aes(x=ATOT-Total))+geom_histogram(binwidth=5)+xlab("Mis-project")+ylab("Number of Schools")+xlim(-500, 500)+ggtitle("After Edit")
grid.arrange(before_plot, after_plot, ncol=2)

####Last Year Prediction Error####
# Per grade: number of schools missed by 30+ in the 2013 projection.
name <- c("K", paste0("Grade", 1:12))
temp <- data2013[,names(data2013)%in%name]
grade_graph <- data.frame(Grade=c("K", as.character(1:12)), Misprojection=colSums(abs(temp)>=30))
grade_graph$Grade <- factor(grade_graph$Grade, levels=c("K", as.character(1:12)))
ggplot(grade_graph, aes(x=Grade, y=Misprojection))+geom_bar(stat="identity", fill="slategray4")+theme_bw(base_size=20)+labs(x=NULL, y="# Schools with Large Error")




--------------------------------------------------------------------------------
/Data_Analysis/investigate_9thgrade.R:
--------------------------------------------------------------------------------
# source('C:/Users/Andrew/github/predicting_student_enrollment/estimation-sandbox/feeder_school_prediction_89.R')
setwd("/Volumes/appdata-3")
load("/Volumes/appdata-3/Student_Data/students_89_snap.RData")
schools = read.csv("/Volumes/appdata-3/School Reports/schools.csv")

library(fields)

# Drop special-education students; treat missing school keys as 0 (outside CPS).
students = subset(students, !(STUDENT_EDUCATIONAL_EXCEPT_TYP %in% c("21 - 60%", "61 - 100%")))
students$THIS_GRADE_SCHOOL_KEY[is.na(students$THIS_GRADE_SCHOOL_KEY)] = 0
students$NEXT_GRADE_SCHOOL_KEY[is.na(students$NEXT_GRADE_SCHOOL_KEY)] = 0

# For each fall-2011 9th-grade destination, count students by source:
# outside CPS, from their catchment school, or from another CPS school.
tbl2011_0 = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010 & THIS_GRADE_SCHOOL_KEY ==
0), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2011_0$studentsource = 'outside_CPS'
#colnames(tbl2011_0) = c("SCHOOL_KEY", "Num_0")

tbl2011_catchment = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010 & NEXT_GRADE_SCHOOL_KEY == CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2011_catchment$studentsource = 'catchment'
#colnames(tbl2011_catchment) = c("SCHOOL_KEY", "Num_catchment")

tbl2011_other = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010 & THIS_GRADE_SCHOOL_KEY > 0 & NEXT_GRADE_SCHOOL_KEY != CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2011_other$studentsource = 'other'
#colnames(tbl2011_other) = c("SCHOOL_KEY", "Num_other")

tbl2011 = rbind(tbl2011_0, tbl2011_catchment, tbl2011_other)
tbl2011$year = 2011
#tbl2011 = merge(tbl2011_0, tbl2011_catchment, all = TRUE)
#tbl2011 = merge(tbl2011, tbl2011_other, all = TRUE)


# Same breakdown for the 2012 cohort.
tbl2012_0 = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011 & THIS_GRADE_SCHOOL_KEY == 0), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2012_0$studentsource = 'outside_CPS'
#colnames(tbl2012_0) = c("SCHOOL_KEY", "Num_0")
tbl2012_catchment = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011 & NEXT_GRADE_SCHOOL_KEY == CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2012_catchment$studentsource = 'catchment'
#colnames(tbl2012_catchment) = c("SCHOOL_KEY", "Num_catchment")
tbl2012_other = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011 & THIS_GRADE_SCHOOL_KEY > 0 & NEXT_GRADE_SCHOOL_KEY != CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2012_other$studentsource = 'other'
#colnames(tbl2012_other) = c("SCHOOL_KEY", "Num_other")

tbl2012 = rbind(tbl2012_0, tbl2012_catchment, tbl2012_other)
tbl2012$year = 2012
#tbl2012 = merge(tbl2012_0, tbl2012_catchment, all = TRUE)
#tbl2012 = merge(tbl2012, tbl2012_other, all = TRUE)

# Same breakdown for the 2013 cohort.
tbl2013_0 =
as.data.frame(with(subset(students, SCHOOL_YEAR == 2012 & THIS_GRADE_SCHOOL_KEY == 0), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2013_0$studentsource = 'outside_CPS'
#colnames(tbl2013_0) = c("SCHOOL_KEY", "Num_0")
tbl2013_catchment = as.data.frame(with(subset(students, SCHOOL_YEAR == 2012 & NEXT_GRADE_SCHOOL_KEY == CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2013_catchment$studentsource = 'catchment'
#colnames(tbl2013_catchment) = c("SCHOOL_KEY", "Num_catchment")
tbl2013_other = as.data.frame(with(subset(students, SCHOOL_YEAR == 2012 & THIS_GRADE_SCHOOL_KEY > 0 & NEXT_GRADE_SCHOOL_KEY != CATCHMENT_SCHOOL_KEY), table(NEXT_GRADE_SCHOOL_KEY)))
tbl2013_other$studentsource = 'other'
#colnames(tbl2013_other) = c("SCHOOL_KEY", "Num_other")

tbl2013 = rbind(tbl2013_0, tbl2013_catchment, tbl2013_other)
tbl2013$year = 2013
#tbl2013 = merge(tbl2013_0, tbl2013_catchment, all = TRUE)
#tbl2013 = merge(tbl2013, tbl2013_other, all = TRUE)

#tbl2011$Year = 2011
#tbl2012$Year = 2012
#tbl2013$Year = 2013

# Stack all three cohorts and export.
tbl9thgradesources = rbind(tbl2011, tbl2012, tbl2013)
setwd("/Volumes/appdata-3/Student_Data")
write.csv(tbl9thgradesources, file = "tbl9thgradesources_tracy.csv", row.names = FALSE)


# all to-and-from data
# Full catchment -> destination transition counts per cohort.
tbl2011_all = as.data.frame(with(subset(students, SCHOOL_YEAR == 2010), table(CATCHMENT_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY)))
tbl2011_all$year = 2011

tbl2012_all = as.data.frame(with(subset(students, SCHOOL_YEAR == 2011), table(CATCHMENT_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY)))
tbl2012_all$year = 2012

tbl2013_all = as.data.frame(with(subset(students, SCHOOL_YEAR == 2012), table(CATCHMENT_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY)))
tbl2013_all$year = 2013

tblall = rbind(tbl2011_all, tbl2012_all, tbl2013_all)

# Attach readable school names for both sides of the transition.
tblall = merge(tblall, schools[,c("SCHOOL_KEY","SCHOOL_SHORT_NAME")], all.x = TRUE, by.x =
"CATCHMENT_SCHOOL_KEY", by.y = "SCHOOL_KEY") 82 | colnames(tblall)[5] = 'CATCHMENT_SCHOOL_NAME' 83 | 84 | tblall = merge(tblall, schools[,c("SCHOOL_KEY","SCHOOL_SHORT_NAME")], all.x = TRUE, by.x = "NEXT_GRADE_SCHOOL_KEY", by.y = "SCHOOL_KEY") 85 | colnames(tblall)[6] = 'NEXT_GRADE_SCHOOL_NAME' 86 | 87 | setwd("/Volumes/appdata-3/School Reports") 88 | write.csv(tblall, file = "tbl_9thgrade_toandfrom_tracy.csv", row.names = FALSE) 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | setwd("/Volumes/appdata-3/Count Data") 98 | countdata = read.csv("enrollment_byschool_byyear_bygrade.csv") 99 | 100 | # plot by school 101 | countdata_wide = reshape(countdata, v.names = "ENROLLMENT", timevar = "YEAR", idvar = c("SCHOOL_CODE","GRADE"), direction = "wide") 102 | countdata_wide[is.na(countdata_wide)] = 0 103 | 104 | grade9change = subset(countdata_wide, GRADE == 'X9' & ENROLLMENT.2012 > 0 & ENROLLMENT.2013 > 0) 105 | head(grade9change[order(abs(grade9change$ENROLLMENT.2013 - grade9change$ENROLLMENT.2012), decreasing = T),c("SCHOOL_CODE")]) 106 | 107 | plotschool(609707) 108 | 109 | plotschool = function(sc_code){ 110 | key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1] 111 | name = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_NAME)[1,1] 112 | 113 | test = matrix(nrow = 3, ncol = 3) 114 | test = as.data.frame(test) 115 | rownames(test) = c("Catchment","Other","Outside district") 116 | colnames(test) = c("2011", "2012", "2013") 117 | test[1,1] = subset(tbl2011, SCHOOL_KEY == key, select = "Num_catchment") 118 | test[2,1] = subset(tbl2011, SCHOOL_KEY == key, select = "Num_other") 119 | test[3,1] = subset(tbl2011, SCHOOL_KEY == key, select = "Num_0") 120 | 121 | test[1,2] = subset(tbl2012, SCHOOL_KEY == key, select = "Num_catchment") 122 | test[2,2] = subset(tbl2012, SCHOOL_KEY == key, select = "Num_other") 123 | test[3,2] = subset(tbl2012, SCHOOL_KEY == key, select = "Num_0") 124 | 125 | test[1,3] = subset(tbl2013, SCHOOL_KEY == key, select = "Num_catchment") 126 | 
# --- tail of plotschool(): fill in the 2013 column and draw stacked bars ---
test[2,3] = subset(tbl2013, SCHOOL_KEY == key, select = "Num_other")
test[3,3] = subset(tbl2013, SCHOOL_KEY == key, select = "Num_0")
test = as.matrix(test)
test[is.na(test)] = 0  # categories with no students plot as 0, not NA

barplot(test, beside = F, legend.text = T, args.legend = list(x = 'topleft'),
        main = paste(name), ylab = "Grade 9 enrollment", col = c(2, 4, 5))
}

# Transition counts for fall 2013: 8th-grade school (rows) x 9th-grade school (cols).
tbl2013 = with(subset(students, SCHOOL_YEAR == 2012),
               table(THIS_GRADE_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY))

# Pick a high school to investigate (only the last assignment takes effect).
sc_code = 609713
sc_code = 609764
sc_code = 610563
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))

# Elementary schools sending students to this high school in each fall.
el_schools2012 = tbl2012[, colnames(tbl2012) == key]
el_schools2013 = tbl2013[, colnames(tbl2013) == key]

el_schools2012df = data.frame(SCHOOL_KEY = names(el_schools2012), Num2012 = el_schools2012)
el_schools2013df = data.frame(SCHOOL_KEY = names(el_schools2013), Num2013 = el_schools2013)

# BUG FIX: the merge result used to be overwritten on the next line with
# `el_schoolsdf = el_schools2013df`, dropping the Num2012 column that every
# statement below relies on.  Keep the full outer merge.
el_schoolsdf = merge(el_schools2012df, el_schools2013df, all = TRUE)
el_schoolsdf$Num2012[is.na(el_schoolsdf$Num2012)] = 0
el_schoolsdf$Num2013[is.na(el_schoolsdf$Num2013)] = 0
el_schoolsdf = subset(el_schoolsdf, Num2012 > 0 | Num2013 > 0)
el_schoolsdf = merge(el_schoolsdf,
                     subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                     all.x = TRUE)
# Great-circle distance from each feeder school to the chosen school
# (rdist.earth from the fields package).
dist.mat = rdist.earth(subset(el_schoolsdf, , c(lon, lat)), location)
el_schoolsdf = cbind(el_schoolsdf, Distance = dist.mat[,1])
el_schoolsdf$Change = el_schoolsdf$Num2013 - el_schoolsdf$Num2012

# Feeders with the largest year-over-year swings / largest 2013 counts.
head(el_schoolsdf[order(-abs(el_schoolsdf$Num2012 - el_schoolsdf$Num2013)),], 10)
head(el_schoolsdf[order(-abs(el_schoolsdf$Num2013)),], 10)

qplot(Distance, Change, data = el_schoolsdf)

sc_code = 610329
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
# High-school view: where did students at the selected school come from?
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))

high_schools2012 = tbl2012[rownames(tbl2012) == key, ]
high_schools2013 = tbl2013[rownames(tbl2013) == key, ]

high_schools2012df = data.frame(SCHOOL_KEY = names(high_schools2012), Num2012 = high_schools2012)
high_schools2013df = data.frame(SCHOOL_KEY = names(high_schools2013), Num2013 = high_schools2013)

high_schoolsdf = merge(high_schools2012df, high_schools2013df, all = TRUE)
high_schoolsdf$Num2012[is.na(high_schoolsdf$Num2012)] = 0
high_schoolsdf$Num2013[is.na(high_schoolsdf$Num2013)] = 0
high_schoolsdf = subset(high_schoolsdf, Num2012 > 0 | Num2013 > 0)
high_schoolsdf = merge(high_schoolsdf,
                       subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                       all.x = TRUE)
dist.mat = rdist.earth(subset(high_schoolsdf, , c(lon, lat)), location)
high_schoolsdf = cbind(high_schoolsdf, Distance = dist.mat[,1])
high_schoolsdf$Change = high_schoolsdf$Num2013 - high_schoolsdf$Num2012

head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012 - high_schoolsdf$Num2013)),], 10)
tail(high_schoolsdf[order(high_schoolsdf$Num2012 - high_schoolsdf$Num2013),], 10)
head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012)),], 10)

colSums(high_schoolsdf[, 2:3])
# -------------------------------------------------------------------------
# /Data_Analysis/investigate_juarez_change.R
# -------------------------------------------------------------------------
# source('C:/Users/Andrew/github/predicting_student_enrollment/estimation-sandbox/feeder_school_prediction_89.R')
setwd("//admin/appdata/DataWarehouse/DSSG")
load("//admin/appdata/DataWarehouse/DSSG/Student_Data/students_89_snap.RData")
schools = read.csv("//admin/appdata/DataWarehouse/DSSG/School Reports/schools.csv")
library(fields)    # rdist.earth()
library(ggplot2)   # FIX: qplot() is called below but ggplot2 was never loaded

# Drop students in substantially separate special-education settings;
# treat a missing school key as 0 ("outside CPS").
students = subset(students, !(STUDENT_EDUCATIONAL_EXCEPT_TYP %in% c("21 - 60%", "61 - 100%")))
students$THIS_GRADE_SCHOOL_KEY[is.na(students$THIS_GRADE_SCHOOL_KEY)] = 0
students$NEXT_GRADE_SCHOOL_KEY[is.na(students$NEXT_GRADE_SCHOOL_KEY)] = 0

# 8th-grade school (rows) x 9th-grade school (cols) transition counts.
tbl2012 = with(subset(students, SCHOOL_YEAR == 2011), table(THIS_GRADE_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY))
tbl2013 = with(subset(students, SCHOOL_YEAR == 2012), table(THIS_GRADE_SCHOOL_KEY, NEXT_GRADE_SCHOOL_KEY))

# Pick a school to investigate (only the last assignment takes effect).
sc_code = 609713
sc_code = 609764
sc_code = 610563
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))
el_schools2012 = tbl2012[, colnames(tbl2012) == key]
el_schools2013 = tbl2013[, colnames(tbl2013) == key]

el_schools2012df = data.frame(SCHOOL_KEY = names(el_schools2012), Num2012 = el_schools2012)
el_schools2013df = data.frame(SCHOOL_KEY = names(el_schools2013), Num2013 = el_schools2013)

# BUG FIX: removed `el_schoolsdf = el_schools2013df`, which overwrote the
# outer merge and dropped the Num2012 column used by everything below.
el_schoolsdf = merge(el_schools2012df, el_schools2013df, all = TRUE)
el_schoolsdf$Num2012[is.na(el_schoolsdf$Num2012)] = 0
el_schoolsdf$Num2013[is.na(el_schoolsdf$Num2013)] = 0
el_schoolsdf = subset(el_schoolsdf, Num2012 > 0 | Num2013 > 0)
el_schoolsdf = merge(el_schoolsdf,
                     subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                     all.x = TRUE)
dist.mat = rdist.earth(subset(el_schoolsdf, , c(lon, lat)), location)
el_schoolsdf = cbind(el_schoolsdf, Distance = dist.mat[,1])
el_schoolsdf$Change = el_schoolsdf$Num2013 - el_schoolsdf$Num2012

head(el_schoolsdf[order(-abs(el_schoolsdf$Num2012 - el_schoolsdf$Num2013)),], 10)
head(el_schoolsdf[order(-abs(el_schoolsdf$Num2013)),], 10)

qplot(Distance, Change, data = el_schoolsdf)

sc_code = 610329
key = subset(schools, SCHOOL_CODE == sc_code, SCHOOL_KEY)[1,1]
location = subset(schools, SCHOOL_CODE == sc_code, c(lon, lat))

high_schools2012 = tbl2012[rownames(tbl2012) == key, ]
high_schools2013 = tbl2013[rownames(tbl2013) == key, ]

high_schools2012df = data.frame(SCHOOL_KEY = names(high_schools2012), Num2012 = high_schools2012)
high_schools2013df = data.frame(SCHOOL_KEY = names(high_schools2013), Num2013 = high_schools2013)

high_schoolsdf = merge(high_schools2012df, high_schools2013df, all = TRUE)
high_schoolsdf$Num2012[is.na(high_schoolsdf$Num2012)] = 0
high_schoolsdf$Num2013[is.na(high_schoolsdf$Num2013)] = 0
high_schoolsdf = subset(high_schoolsdf, Num2012 > 0 | Num2013 > 0)
high_schoolsdf = merge(high_schoolsdf,
                       subset(schools, , c(SCHOOL_KEY, SCHOOL_CODE, SCHOOL_SHORT_NAME, lon, lat)),
                       all.x = TRUE)
dist.mat = rdist.earth(subset(high_schoolsdf, , c(lon, lat)), location)
high_schoolsdf = cbind(high_schoolsdf, Distance = dist.mat[,1])
high_schoolsdf$Change = high_schoolsdf$Num2013 - high_schoolsdf$Num2012

head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012 - high_schoolsdf$Num2013)),], 10)
tail(high_schoolsdf[order(high_schoolsdf$Num2012 - high_schoolsdf$Num2013),], 10)
head(high_schoolsdf[order(-abs(high_schoolsdf$Num2012)),], 10)

colSums(high_schoolsdf[, 2:3])
# -------------------------------------------------------------------------
# /Data_Analysis/map_neighborhood_catchment.R
# -------------------------------------------------------------------------
setwd("//admin/appdata/DataWarehouse/DSSG/Master_Data/")
library(reshape2)
library(ggplot2)
rm(list=ls())   # NOTE(review): clears the whole workspace; fine for a standalone script

year = 2013
students = read.csv(paste0("student", year, ".csv"))
# Drop students in substantially separate special-education settings.
students = subset(students, !(EducationType %in% c("21 - 60%", "61 - 100%")))
schools = read.csv("schools_static_features.csv")

# students$NextGradeSchoolKey = ifelse(is.na(students$NextGradeSchoolKey),
#                                      0, as.character(students$NextGradeSchoolKey))
students = subset(students, !is.na(students$NextGradeSchoolKey))
students$CatchmentSchoolKey = ifelse(is.na(students$CatchmentSchoolKey),
                                     0, students$CatchmentSchoolKey)

# Counts of catchment school (rows) x school actually attended (cols),
# melted to long form and labeled with the public school IDs.
trans_table = with(students, table(CatchmentSchoolKey, NextGradeSchoolKey))
trans_table_melt <- melt(trans_table)
trans_table_melt = merge(trans_table_melt, subset(schools, , c(SchoolKey, SchoolID)),
                         by.x = "CatchmentSchoolKey", by.y = "SchoolKey", all.x = TRUE)
colnames(trans_table_melt)[4] = "CatchmentSchoolID"
trans_table_melt = merge(trans_table_melt, subset(schools, , c(SchoolKey, SchoolID)),
                         by.x = "NextGradeSchoolKey", by.y = "SchoolKey", all.x = TRUE)
colnames(trans_table_melt)[5] = "NextGradeSchoolID"

# Per catchment area: share of students who enrolled in their catchment school.
library(dplyr)
PercentSame = tbl_df(trans_table_melt) %>%
  mutate(SameSchool = (CatchmentSchoolKey == NextGradeSchoolKey)) %>%
  group_by(CatchmentSchoolID) %>%
  summarise(
    Total = sum(value),
    Catchment = sum(ifelse(SameSchool, value, 0))
  ) %>%
  mutate(PercentCatchment = Catchment/Total)

# Neighborhood boundaries
setwd("//admin/appdata/DataWarehouse/DSSG/Other Datasets/public neighborhood data/CPS High School Attendance Boundaries SY13 14")
# http://geocommons.com/overlays/305198
library(rgdal)
library(maptools)
library(rgeos)

boundaries = readOGR(dsn=".", layer="geo_aeud-d3nt-1")
boundaries@data$id = rownames(boundaries@data)
boundaries = gBuffer(boundaries, width=0, byid=TRUE)   # zero-width buffer repairs invalid rings
boundaries.points = fortify(boundaries, region="id")
boundaries.df = plyr::join(boundaries.points, boundaries@data, by="id")

boundaries.df = merge(boundaries.df, PercentSame,
                      by.x = "SchoolID", by.y = "CatchmentSchoolID", all.x=TRUE)
boundaries.df = boundaries.df[order(boundaries.df$order),]   # fortify order must be restored after merge
boundaries.df = subset(boundaries.df, !is.na(boundaries.df$Catchment))

library(ggmap)
plot_base = qmap(location = c(-87.70831, 41.84566), zoom = 10, maptype = "terrain")

# Optional single-school to/from map (kept for reference).
# FIX: "bourdaries.df" typo corrected to "boundaries.df" so this runs if re-enabled.
# id = 609708 # 609679
# boundaries_school_counts = merge(boundaries.df, subset(trans_table_melt, CatchmentSchoolID == id, c(NextGradeSchoolID, value)),
#                                  by.x = "SchoolID", by.y = "NextGradeSchoolID", all.x = TRUE)
# # boundaries_school_counts$value[is.na(boundaries_school_counts$value)] <- 0
# colnames(boundaries_school_counts)[ncol(boundaries_school_counts)] <- "To"
# boundaries_school_counts = merge(boundaries_school_counts, subset(trans_table_melt, NextGradeSchoolID == id, c(CatchmentSchoolID, value)),
#                                  by.x = "SchoolID", by.y = "CatchmentSchoolID", all.x = TRUE)
# # boundaries_school_counts$value[is.na(boundaries_school_counts$value)] <- 0
# colnames(boundaries_school_counts)[ncol(boundaries_school_counts)] <- "From"
# boundaries_school_counts = boundaries_school_counts[order(boundaries_school_counts$order),]
#
# school = subset(schools, SchoolID == id)
#
# school_mark = geom_point(data=school, aes(x = Longitude, y = Latitude), col = 'red', shape = 4, size = 4)
# school_path = geom_path(data = boundaries_school_counts, aes(long,lat,group=group))
# school_poly = geom_polygon(data = boundaries_school_counts, aes(long,lat,group=group, fill = To), alpha = 1)
# p <- ggplot() + school_poly + school_mark + school_path + ggtitle(paste0("School = ", school$SchoolName)) +
#   scale_fill_continuous(low = "white", high = "steelblue")
# print(p)

# Continuous-scale version of the city-wide map (kept for reference).
# school_path = geom_path(data = boundaries.df, aes(long,lat,group=group), colour = "black", alpha = .65, size = 0.5)
# school_poly = geom_polygon(data = boundaries.df, aes(long,lat,group=group, fill = PercentCatchment*100), alpha = .6)
# p <- plot_base + school_poly + school_path + scale_fill_continuous(low = "white", high = "#000066") +
#   ggtitle("Percent of Students that Go to their Catchment School") + labs(fill=NULL) +
#   theme(legend.justification=c(1,1), legend.position=c(1,1), legend.background = element_rect(fill="transparent"))
# print(p)
# FIX: `school_path` was used below but its definition only existed in the
# commented-out block above; recreate the boundary-outline layer before use.
school_path = geom_path(data = boundaries.df, aes(long, lat, group = group),
                        colour = "black", alpha = .65, size = 0.5)
# Binned (15-point) choropleth of the percent attending the catchment school.
school_poly = geom_polygon(data = boundaries.df,
                           aes(long, lat, group = group,
                               fill = cut(round(PercentCatchment*100), seq(0,75,15))),
                           alpha = .6)
p <- plot_base + school_poly + school_path +
  scale_fill_brewer(palette="Purples",
                    labels = paste0(seq(0,60,15), "% to ", c(seq(15,60,15),65), "%")) +
  labs(fill=NULL) +
  theme(legend.text = element_text(size = 18), legend.justification=c(1,1),
        legend.position=c(1,1), legend.background = element_rect(fill="transparent"))
print(p)
ggsave(plot = p, filename = "//admin/appdata/DataWarehouse/DSSG/Visualizations/R plots/PercentCatchmentSchool.pdf", width=8, height=8)
# ggtitle("Percent of Students that Go to their Catchment School")
# -------------------------------------------------------------------------
# /Data_Analysis/prepare_data_and_shapefile_for_tableau.R
# -------------------------------------------------------------------------
# ------------------------------------------ #
#   create data file to read in to tableau   #
# ------------------------------------------ #

setwd("/Volumes/appdata-3/Other Datasets/public neighborhood data/CPS High School Attendance Boundaries SY13 14")

library("GISTools")
boundaries = readShapePoly(fn = "geo_aeud-d3nt-1")   # load boundaries
boundaries = boundaries[boundaries@data$BoundaryGr %in% c("9, 10", "9, 10, 11, 12"),]   # only 9th grade boundaries
boundaries_polygons = slot(boundaries, "polygons")
boundaries_data = slot(boundaries, "data")

# One row per boundary vertex, tagged with polygon ID and school ID.
# FIX: iterate over the actual polygon count instead of the hard-coded 1:57,
# and bind once at the end instead of growing boundaries_csv with rbind().
polygon_rows = lapply(seq_along(boundaries_polygons), function(i) {
  temp = boundaries_polygons[[i]]                           # pull out polygon i
  tempcoords = slot(slot(temp, "Polygons")[[1]], "coords")  # extract coordinates
  tempid = slot(temp, "ID")                                 # extract polygon ID
  tempname = as.character(boundaries_data[i, "SchoolID"])   # extract school ID

  tempmat = cbind(c(0:(nrow(tempcoords)-1)), tempcoords,
                  rep(tempid, nrow(tempcoords)), rep(tempname, nrow(tempcoords)))
  tempmat = as.data.frame(tempmat)
  colnames(tempmat) = c("PointOrder","Longitude","Latitude","PolygonID","SchoolID")
  tempmat
})
boundaries_csv = do.call(rbind, polygon_rows)

setwd("/Volumes/appdata-3/School Reports")
schoolstable = read.csv("schools.csv")   # schools table from database
# for each catchment school ID, match up the school key and school name
boundaries_csv = merge(boundaries_csv, schoolstable[,c("SchoolKey","SchoolID","SchoolShortName")],
                       all.x = T, by.x = "SchoolID", by.y = "SchoolID")

setwd("/Volumes/appdata-3/School Reports")
data = read.csv("tbl_9thgrade_toandfrom_tracy.csv", stringsAsFactors = F)   # where students come from and go to for 9th grade
data2013 = subset(data, year == 2013)   # only look at 2013

# match up each catchment school key with the school ID
data2013 = merge(data2013, schoolstable[,c("SchoolID","SchoolKey")],
                 all.x = TRUE, by.x = "CATCHMENT_SCHOOL_KEY", by.y = "SchoolKey")

# for each 'next grade' school key, match up the latitude and longitude of that school
data2013 = merge(data2013, schoolstable[,c("SchoolKey","Latitude","Longitude")],
                 all.x = T, by.x = "NEXT_GRADE_SCHOOL_KEY", by.y = "SchoolKey")

data2013 = subset(data2013, Freq > 0)          # keep only pairs with actual students
data2013 = subset(data2013, !is.na(Latitude))  # remove if the location is missing

catchment1314 = unique(boundaries_data$SchoolID)   # school IDs of catchment areas

# only keep catchment areas that have an associated geographic area;
# some students' recorded catchment is still their 8th grade school -- drop those
data2013 = subset(data2013, SchoolID %in% catchment1314)

# prepare data to read in to tableau: create 'type' variable
boundaries_csv$type = 'polygon'
data2013$type = 'school_point'
# Align the polygon rows and the school-point rows on one shared column set,
# then write the Tableau input files.
tableau_cols <- c("type", "SchoolID", "PointOrder", "PolygonID", "Longitude",
                  "Latitude", "CATCHMENT_SCHOOL_NAME", "NEXT_GRADE_SCHOOL_NAME", "Freq")

boundaries_csv$NEXT_GRADE_SCHOOL_NAME <- NA   # polygons have no destination school
boundaries_csv$Freq <- 0
names(boundaries_csv)[7] <- "CATCHMENT_SCHOOL_NAME"   # column 7 is SchoolShortName from the merge
boundaries_csv <- boundaries_csv[, tableau_cols]

data2013$PointOrder <- NA   # points are not part of any polygon outline
data2013$PolygonID <- NA
data2013 <- data2013[, tableau_cols]

setwd("/Volumes/appdata-3/Other Datasets/public neighborhood data")

# Coordinates came through cbind() as text; force them numeric before export.
boundaries_csv$Longitude <- as.numeric(as.character(boundaries_csv$Longitude))
boundaries_csv$Latitude <- as.numeric(as.character(boundaries_csv$Latitude))

data_for_tableau <- rbind(boundaries_csv, data2013)
write.csv(data_for_tableau, file = "data_for_tableau_FINAL.csv", row.names = F)

# for reverse, run above except do not remove 0 Freq's
locations <- unique(data2013[, c("NEXT_GRADE_SCHOOL_NAME", "Latitude", "Longitude")])
locations$type <- 'locations'
locations$PolygonID <- NA
locations$PointOrder <- NA
locations$SchoolID <- NA
locations$Freq <- 0
locations$CATCHMENT_SCHOOL_NAME <- NA

locations <- locations[, names(data2013)]

data_for_tableau <- rbind(boundaries_csv, locations)
data_for_tableau <- data_for_tableau[, names(data_for_tableau) != 'Freq']
write.csv(data_for_tableau, file = "data_for_tableau_FINAL_reverse.csv", row.names = F)
write.csv(data2013, file = "data_for_tableau_FINAL_reverse2.csv", row.names = F)
# -------------------------------------------------------------------------
# /Data_Analysis/topdown_FINAL.R
# -------------------------------------------------------------------------
# ------------------------------------------------- #
#   predicting catchment high schools - fall 2013   #
# ------------------------------------------------- #
# run CPS_gradebygrade_dataprep.R first -- it defines datafinal12, datafinal13
# and schools_with1314boundaries used throughout this script.

### greg's predictions
# CPS's own FY14 projections, restricted to the catchment high schools.
setwd("/Volumes/appdata-3/School Reports")
gregdata <- read.csv("CSVfileforR_excerptof_FY14SBBALLOCATIONS_FINALSUMMARY_asof_032414.csv",
                     stringsAsFactors = FALSE)
gregdata_chs <- subset(gregdata, SchoolID %in% schools_with1314boundaries)

# Accuracy of the CPS projections: error distribution, MAE, share within 50/100.
cps_err <- gregdata_chs$Projected.Total - gregdata_chs$Actual.Total
summary(cps_err)
mean(abs(cps_err))          # 66
mean(abs(cps_err) <= 50)    # 0.51
mean(abs(cps_err) <= 100)   # 0.73

### prepare data
datafinal12_chs <- subset(datafinal12, SCHOOL_CODE %in% schools_with1314boundaries)
datafinal13_chs <- subset(datafinal13, SCHOOL_CODE %in% schools_with1314boundaries)

# Inspect missingness per column, then recode NA as the level 'Unknown'.
apply(datafinal12_chs, 2, function(x) sum(is.na(x)))
apply(datafinal13_chs, 2, function(x) sum(is.na(x)))
datafinal12_chs[is.na(datafinal12_chs)] <- 'Unknown'
datafinal13_chs[is.na(datafinal13_chs)] <- 'Unknown'

### compare to rolling over 2012 or Zhou's cohort survival projection
library("plyr")
baseline_comparison <- ddply(datafinal13_chs, .(SCHOOL_CODE), summarize,
                             cstotal = sum(cs_projection),
                             lastyeartotal = sum(projection),
                             truetotal = sum(enrollment_to_predict))

cs_err <- baseline_comparison$truetotal - baseline_comparison$cstotal
mean(abs(cs_err))           # cohort survival: 77 student mean absolute error
mean(abs(cs_err) <= 50)     # 0.49
mean(abs(cs_err) <= 100)    # 0.75

roll_err <- baseline_comparison$lastyeartotal - baseline_comparison$truetotal
mean(abs(roll_err))         # roll over 2012: 87 student mean absolute error
mean(abs(roll_err) <= 50)   # 0.39
mean(abs(roll_err) <= 100)  # 0.61
### train regression model on 2012 data
# Enrollment regressed on last year's enrollment and the cohort-survival
# projection, each interacted with school Rating, plus a new-schools-nearby term.
m1 <- lm(enrollment_to_predict ~ (projection + cs_projection)*Rating + newnearby,
         data = datafinal12_chs)
summary(m1)   # summarize coefficients
plot(m1)      # residual plots for model diagnostic

### test model on 2013 data
m1_test <- predict(m1, newdata = datafinal13_chs, interval = "prediction", level = 0.95)
m1_test <- as.data.frame(m1_test)
# Back out a per-row variance from the 95% interval width (width ~= 4 sd).
m1_test_var <- ((m1_test$upr - m1_test$lwr)/4)^2

datafinal13_chs$testpredictions <- m1_test$fit
datafinal13_chs$testvar <- m1_test_var

# Roll the predictions up to school totals.
test_summary <- ddply(datafinal13_chs, .(SCHOOL_CODE), summarize,
                      truetotal = sum(enrollment_to_predict),
                      predictedtotal = sum(testpredictions),
                      predictedvar = sum(testvar))
abs_err <- abs(test_summary$truetotal - test_summary$predictedtotal)
mean(abs_err)          # mean absolute error
mean(abs_err <= 50)
mean(abs_err <= 100)   # 0.80

# calculate prediction interval lower and upper limits
test_summary$predictedlower <- test_summary$predictedtotal - 1.96*sqrt(test_summary$predictedvar)
test_summary$predictedupper <- test_summary$predictedtotal + 1.96*sqrt(test_summary$predictedvar)

# how many 95% intervals cover the true school total (should be about 95%)
mean((test_summary$predictedlower <= test_summary$truetotal) &
       (test_summary$truetotal <= test_summary$predictedupper))

# summarize the widths of the confidence intervals
summary(test_summary$predictedupper - test_summary$predictedlower)

setwd("/Volumes/appdata-3/School Reports")
schools <- read.csv("schools.csv", stringsAsFactors = F)

test_summary <- merge(test_summary, schools[,c("SchoolID","SchoolShortName")],
                      all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID")
test_summary <- merge(test_summary, gregdata_chs[,c("SchoolID","Projected.Total")],
                      all.x = T, by.x = "SCHOOL_CODE", by.y = "SchoolID")

# plot our estimates and CPS's estimates

plot(test_summary$truetotal, test_summary$predictedtotal, pch = 16,
     xlim = c(0,3300), ylim = c(0,3300),
     xlab = "True enrollment 2013", ylab = "Predicted enrollment 2013")
segments(x0 = test_summary$truetotal,
         y0 = test_summary$predictedtotal - 1.96*sqrt(test_summary$predictedvar),
         x1 = test_summary$truetotal,
         y1 = test_summary$predictedtotal + 1.96*sqrt(test_summary$predictedvar))
segments(x0=0, y0=0, x1=4000, y1=4000, col=2)   # y = x reference line
points(x = test_summary$truetotal, y = test_summary$Projected.Total, pch = 2, col = 4)
legend("bottomright", c("DSSG", "CPS"), col = c(1,4), pch = c(16,2),
       bty = "n", pt.cex = 2, title = "Prediction Model:")

# plot school estimates and intervals
library(ggplot2)
test_summary$SCHOOL_CODE <- with(test_summary, reorder(SCHOOL_CODE, -predictedtotal))

ggplot(data=test_summary, aes(x=SCHOOL_CODE, y=predictedtotal)) +
  geom_bar(stat="identity", fill = "#E69F00") +
  geom_errorbar(aes(ymin=predictedtotal - 1.96*sqrt(predictedvar),
                    ymax=predictedtotal + 1.96*sqrt(predictedvar)),
                width=.2,   # Width of the error bars
                position=position_dodge(.9)) +
  xlab("Catchment high schools") + ylab("Predicted enrollment") +
  theme(legend.position = "none", axis.ticks = element_blank(),
        axis.text.x = element_blank())
# -------------------------------------------------------------------------
# /Data_Pipeline/README.md
# -------------------------------------------------------------------------
## Data Pipeline for CPS Project ##

This folder includes scripts for querying, merging, and saving the data we used for most of our analyses.

### Student Data ###

`create_students_table.R` gathers data for student-level modelling. First, it runs the query `student_table_with_ninth_snapshot_vw.sql` to get the students who attended CPS in either 8th or 9th grades in the last few years.
There is one record per student, per school year, and it combines which school the students went to in 8th and/or 9th grade.

Then it runs `student_ISAT_scores_vw.sql` and `student_attendance_vw.sql` to get the students' ISAT scores and historical attendance, respectively. This is done separately to keep the original query fast; these data sets are all merged together afterwards. The merged data is then saved one year at a time.

Finally, we run the script `geocode_addresses.R` to get the latitude and longitude for each student's home address.

### School Data ###

`create_schools_table.R` pulls all of the schools' features from the database. After this, it geocodes the schools, gets their census block information, and saves the results as `schools_static_features.csv`.

This script also runs the query in `school_aggregate_features_vw.sql`, which pulls yearly summaries of the students who attend each of the schools. It captures the percentage of students receiving free or reduced priced lunches, the percentage of ESL students, the percentage of male students, the percentage of homeless students, the average student's GPA, and the students' attendance rate.

The results are saved in `schools_aggregate_features_YEAR.csv`, with YEAR replaced by the appropriate year.

The code in `create_features_table.R` creates the dataframe of school features that change every year. This file combines information from the school report cards downloaded from the [Chicago data portal](https://data.cityofchicago.org/), mobility and race data from [CPS's website](http://cps.edu/SchoolData/Pages/SchoolData.aspx), and [AUSL data](http://auslchicago.org/). Since the inputs are school progress reports, race files, and AUSL files, they are all Excel sheets. Part of the work is done by hand and part of the work is automated. In the future, all files will change but will be publicly available data.
### Note ###

All the SQL queries pull data from views created for us by CPS. This is done to keep CPS's data structure private.

# -------------------------------------------------------------------------
# /Data_Pipeline/create_features_table.R
# -------------------------------------------------------------------------
#!/usr/bin/Rscript

#######################################
####Create School Changing Features####
#######################################

####CPS Team####
####08/19/2014####

rm(list=ls())
setwd("/Volumes/DSSG/Zhou_Classification/")
elem2013 <- read.csv("CPS_Elementary_Report_20122013.csv", stringsAsFactors=F)
high2013 <- read.csv("CPS_HighSchool_Report_20122013.csv", stringsAsFactors=F)
elem2014 <- read.csv("CPS_Elementary_Report_20132014.csv", stringsAsFactors=F)
high2014 <- read.csv("CPS_HighSchool_Report_20132014.csv", stringsAsFactors=F)
prog2012 <- read.csv("Progress_2011_2012.csv", stringsAsFactors=F)
name <- c("SchoolID", "SchoolName", "SchoolType", "ZipCode", "Rating", "Probation",
          "Health", "Safety", "Family", "Environment", "Instruction", "Leader", "Teacher")

# The three report-card vintages use different "missing" tokens but share the
# same normalization shape; the helpers below replace the three near-identical
# copy-pasted within() blocks.
icon_cols <- c("Safety", "Family", "Environment", "Instruction", "Leader", "Teacher")

# Replace every value of `col` that appears in `tokens` with NA.
na_out <- function(df, col, tokens) {
  df[[col]][df[[col]] %in% tokens] <- NA
  df
}

# Shared tail of every cleanup: NA-out the missing tokens on the icon columns,
# map the "neutral" token (when given) to "Average", then lower-case all
# rating-type columns.
finish_progress <- function(df, na_tokens, neutral = NULL) {
  for (col in icon_cols) {
    df <- na_out(df, col, na_tokens)
    if (!is.null(neutral)) {
      hit <- !is.na(df[[col]]) & df[[col]] == neutral
      df[[col]][hit] <- "Average"
    }
  }
  for (col in c("Rating", "Probation", "Health", icon_cols)) {
    df[[col]] <- tolower(df[[col]])
  }
  df
}

####2011 - 2012###
progress_2011_2012 <- with(prog2012, {
  data.frame(School.ID, Name.of.School, Elementary..Middle..or.High.School, ZIP.Code,
             CPS.Performance.Policy.Level, CPS.Performance.Policy.Status,
             Healthy.Schools.Certified., Safety.Icon, Family.Involvement.Icon,
             Environment.Icon, Instruction.Icon, Leaders.Icon, Teachers.Icon,
             stringsAsFactors=F)
})
colnames(progress_2011_2012) <- name
# 2011-2012 uses "NDA" as its missing token and fold middle schools into "ES".
progress_2011_2012$SchoolType[progress_2011_2012$SchoolType == "MS"] <- "ES"
progress_2011_2012 <- na_out(progress_2011_2012, "Rating", c("Not Enough Data", "NDA"))
progress_2011_2012$Probation[progress_2011_2012$Probation == "Not on Probation"] <- "No"
progress_2011_2012$Probation[progress_2011_2012$Probation == "Probation"] <- "Yes"
progress_2011_2012 <- na_out(progress_2011_2012, "Probation", c("Not Applicable", "NDA"))
progress_2011_2012 <- finish_progress(progress_2011_2012, na_tokens = "NDA")
write.csv(x=progress_2011_2012, file="/Users/zhouye/Dropbox/DSSG/Edit_Prediction/progress_2011_2012.csv", row.names=F)

###2012 - 2013###
# 2012-2013 uses "Not Enough Data"/"Neutral"; Probation and Health need no recode.
clean_2012_2013 <- function(df) {
  df <- na_out(df, "Rating", "Not Enough Data")
  finish_progress(df, na_tokens = "Not Enough Data", neutral = "Neutral")
}

progress_2012_2013_es <- with(elem2013, {
  data.frame(School.ID, School.Name, "ES", ZIP, Overall.Rating,
             On.Probation, Healthy.School.Certified, Safety,
             Involved.Families, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2012_2013_es) <- name
progress_2012_2013_es <- clean_2012_2013(progress_2012_2013_es)

progress_2012_2013_hs <- with(high2013, {
  data.frame(School.ID, School.Name, "HS", ZIP, Overall.Rating,
             On.Probation, Healthy.School.Certified, Safety,
             Involved.Families, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2012_2013_hs) <- name
progress_2012_2013_hs <- clean_2012_2013(progress_2012_2013_hs)

progress_2012_2013 <- rbind(progress_2012_2013_es, progress_2012_2013_hs)
write.csv(x=progress_2012_2013, file="/Users/zhouye/Dropbox/DSSG/Edit_Prediction/progress_2012_2013.csv", row.names=F)

###2013 - 2014###
# 2013-2014 uses UPPER-CASE tokens and empty strings; Probation and Health are
# recoded to Yes/No.  FIX: the Teacher column previously omitted the
# empty-string check that every sibling column has.
clean_2013_2014 <- function(df) {
  df <- na_out(df, "Rating", c("NOT ENOUGH DATA", ""))
  df <- na_out(df, "Probation", c("NOT APPLICABLE", ""))
  df$Probation[df$Probation %in% "ON PROBATION"] <- "Yes"
  df$Probation[df$Probation %in% "NOT ON PROBATION"] <- "No"
  df <- na_out(df, "Health", c("PENDING CERTIFICATION", ""))
  df$Health[df$Health %in% "HEALTHY SCHOOLS CERTIFIED"] <- "Yes"
  df$Health[df$Health %in% "NOT CERTIFIED"] <- "No"
  finish_progress(df, na_tokens = c("NOT ENOUGH DATA", ""), neutral = "NEUTRAL")
}

progress_2013_2014_es <- with(elem2014, {
  data.frame(School.ID, Name.of.School, "ES", ZIP.Code, CPS.Performance.Policy.Level,
             CPS.Performance.Policy.Status, Healthy.Schools.Certification, Safe,
             Involved.Family, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2013_2014_es) <- name
progress_2013_2014_es <- clean_2013_2014(progress_2013_2014_es)

# FIX: this frame is built from the HIGH SCHOOL report but was labeled "ES";
# use "HS" so the combined table's SchoolType is correct.
progress_2013_2014_hs <- with(high2014, {
  data.frame(School.ID, Name.of.School, "HS", ZIP.Code, CPS.Performance.Policy.Level,
             CPS.Performance.Policy.Status, Healthy.Schools.Certification, Safe,
             Involved.Family, Supportive.Environment,
             Ambitious.Instruction, Effective.Leaders, Collaborative.Teachers,
             stringsAsFactors=F)
})
colnames(progress_2013_2014_hs) <- name 162 | progress_2013_2014_hs <- within(progress_2013_2014_hs, { 163 | Rating[Rating=="NOT ENOUGH DATA" | Rating==""] <- NA; 164 | Probation[Probation=="NOT APPLICABLE" | Probation==""] <- NA 165 | Probation[Probation=="ON PROBATION"] <- "Yes" 166 | Probation[Probation=="NOT ON PROBATION"] <- "No" 167 | Health[Health=="PENDING CERTIFICATION" | Health==""] <- NA 168 | Health[Health=="HEALTHY SCHOOLS CERTIFIED"] <- "Yes" 169 | Health[Health=="NOT CERTIFIED"] <- "No" 170 | Safety[Safety=="NOT ENOUGH DATA" | Safety==""] <- NA; 171 | Safety[Safety=="NEUTRAL"] <- "Average"; 172 | Family[Family=="NOT ENOUGH DATA" | Family==""] <- NA; 173 | Family[Family=="NEUTRAL"] <- "Average"; 174 | Environment[Environment=="NOT ENOUGH DATA" | Environment==""] <- NA; 175 | Environment[Environment=="NEUTRAL"] <- "Average"; 176 | Instruction[Instruction=="NOT ENOUGH DATA" | Instruction==""] <- NA; 177 | Instruction[Instruction=="NEUTRAL"] <- "Average"; 178 | Leader[Leader=="NOT ENOUGH DATA" | Leader==""] <- NA; 179 | Leader[Leader=="NEUTRAL"] <- "Average"; 180 | Teacher[Teacher=="NOT ENOUGH DATA"] <- NA; 181 | Teacher[Teacher=="NEUTRAL"] <- "Average"; 182 | Rating <- tolower(Rating); 183 | Probation <- tolower(Probation); 184 | Health <- tolower(Health); 185 | Safety <- tolower(Safety); 186 | Family <- tolower(Family); 187 | Environment <- tolower(Environment); 188 | Instruction <- tolower(Instruction); 189 | Leader <- tolower(Leader); 190 | Teacher <- tolower(Teacher); 191 | }) 192 | progress_2013_2014 <- rbind(progress_2013_2014_es, progress_2013_2014_hs) 193 | write.csv(x=progress_2013_2014, file="/Users/zhouye/Dropbox/DSSG/Edit_Prediction/progress_2013_2014.csv", row.names=F) 194 | 195 | setwd("/Volumes/appdata-3/Volan data/01 - FY 06 to FY 14 Enrollment History/csvfiles") 196 | 197 | # csv files of Greg's 20th day excel files 198 | enrollment2013mat = read.csv("FY2014_20th_DAY_ENROLLMENT_allgrades.csv") 199 | enrollment2012mat = 
read.csv("FY2013_20th_DAY_ENROLLMENT_allgrades.csv") 200 | enrollment2011mat = read.csv("FY2012_20th_DAY_ENROLLMENT_allgrades.csv") 201 | enrollment2010mat = read.csv("FY2011_20th_DAY_ENROLLMENT_allgrades.csv") 202 | enrollment2009mat = read.csv("FY2010_20th_DAY_ENROLLMENT_allgrades.csv") 203 | 204 | # sum up SPED and LRE 205 | enrollment2013mat$LRE = rowSums(enrollment2013mat[,15:20], na.rm = TRUE) 206 | enrollment2012mat$LRE = rowSums(enrollment2012mat[,15:16], na.rm = TRUE) 207 | enrollment2011mat$LRE = rowSums(enrollment2011mat[,15:17], na.rm = TRUE) 208 | enrollment2010mat$LRE = rowSums(enrollment2010mat[,15:17], na.rm = TRUE) 209 | enrollment2009mat$LRE = rowSums(enrollment2009mat[,15:17], na.rm = TRUE) 210 | 211 | # columns to include: school ID, K - 12, LRE total 212 | enrollment2013mat = enrollment2013mat[,c(1:14,21)] 213 | enrollment2012mat = enrollment2012mat[,c(1:14,17)] 214 | enrollment2011mat = enrollment2011mat[,c(1:14,18)] 215 | enrollment2010mat = enrollment2010mat[,c(1:14,18)] 216 | enrollment2009mat = enrollment2009mat[,c(1:14,18)] 217 | 218 | # column names 219 | colnames(enrollment2009mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 220 | colnames(enrollment2010mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 221 | colnames(enrollment2011mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 222 | colnames(enrollment2012mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 223 | colnames(enrollment2013mat) = c("SCHOOL_CODE", "K","X1","X2","X3","X4","X5","X6","X7","X8","X9","X10","X11","X12","LRE") 224 | 225 | # remove NA school codes 226 | enrollment2013mat = enrollment2013mat[!is.na(enrollment2013mat$SCHOOL_CODE),] 227 | enrollment2012mat = enrollment2012mat[!is.na(enrollment2012mat$SCHOOL_CODE),] 228 | enrollment2011mat = 
enrollment2011mat[!is.na(enrollment2011mat$SCHOOL_CODE),] 229 | enrollment2010mat = enrollment2010mat[!is.na(enrollment2010mat$SCHOOL_CODE),] 230 | enrollment2009mat = enrollment2009mat[!is.na(enrollment2009mat$SCHOOL_CODE),] 231 | 232 | # if enrollment is NA, fill in 0 233 | enrollment2013mat[(is.na(enrollment2013mat))] = 0 234 | enrollment2012mat[(is.na(enrollment2012mat))] = 0 235 | enrollment2011mat[(is.na(enrollment2011mat))] = 0 236 | enrollment2010mat[(is.na(enrollment2010mat))] = 0 237 | enrollment2009mat[(is.na(enrollment2009mat))] = 0 238 | 239 | # enrollment matrix 240 | temp2009 = reshape(enrollment2009mat, idvar = "SCHOOL_CODE", times = names(enrollment2009mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2009mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 241 | temp2009$YEAR = 2009 242 | 243 | temp2010 = reshape(enrollment2010mat, idvar = "SCHOOL_CODE", times = names(enrollment2010mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2010mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 244 | temp2010$YEAR = 2010 245 | 246 | temp2011 = reshape(enrollment2011mat, idvar = "SCHOOL_CODE", times = names(enrollment2011mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2011mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 247 | temp2011$YEAR = 2011 248 | 249 | temp2012 = reshape(enrollment2012mat, idvar = "SCHOOL_CODE", times = names(enrollment2012mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2012mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 250 | temp2012$YEAR = 2012 251 | 252 | temp2013 = reshape(enrollment2013mat, idvar = "SCHOOL_CODE", times = names(enrollment2013mat)[2:15], timevar = "GRADE", varying = list(names(enrollment2013mat)[2:15]), direction = "long", v.names = "ENROLLMENT") 253 | temp2013$YEAR = 2013 254 | 255 | enrollmentmatrix = rbind(temp2009, temp2010, temp2011, temp2012, temp2013) 256 | 257 | # write to csv 258 | setwd("/Volumes/appdata-3/Count Data") 259 | 
write.csv(enrollmentmatrix, file = "enrollment_byschool_byyear_bygrade.csv", row.names = FALSE) 260 | 261 | # DONE 262 | 263 | 264 | # ========================================================== # 265 | # creating school matrix of features that don't change # 266 | # ========================================================== # 267 | 268 | # list of schools for each year 269 | schools2009 = unique(enrollment2009mat$SCHOOL_CODE) 270 | schools2010 = unique(enrollment2010mat$SCHOOL_CODE) 271 | schools2011 = unique(enrollment2011mat$SCHOOL_CODE) 272 | schools2012 = unique(enrollment2012mat$SCHOOL_CODE) 273 | schools2013 = unique(enrollment2013mat$SCHOOL_CODE) 274 | 275 | # all schools from fall 2009 - 2013 276 | allschools = union(schools2009, union(schools2010, union(schools2011, union(schools2012, schools2013)))) 277 | allschools = allschools[!is.na(allschools)] 278 | 279 | # start making all schools matrix 280 | allschoolsmat = as.data.frame(allschools) 281 | colnames(allschoolsmat) = "SCHOOL_CODE" 282 | 283 | # all schools in database 284 | setwd("/Volumes/appdata-3/School Reports") 285 | schools_table = read.csv("schools.csv", stringsAsFactors = FALSE) 286 | 287 | allschoolsmat = merge(allschoolsmat, schools_table[,c("SCHOOL_CODE","SCHOOL_NAME","SCHOOL_TYPE","SCHOOL_GRADES_GROUP","lon","lat")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SCHOOL_CODE") 288 | 289 | # use school report from 2011 to try to fill in missing lat/lon 290 | schools_report2011 = read.csv("Chicago_Public_Schools_-_Progress_Report_Cards__2011-2012_.csv", stringsAsFactors = FALSE) 291 | 292 | for (i in 1:nrow(allschoolsmat)){ 293 | if (is.na(allschoolsmat[i,"lon"]) & nrow(schools_report2011[schools_report2011$School.ID == allschoolsmat[i,"SCHOOL_CODE"],]) > 0) 294 | {allschoolsmat[i,"lon"] = schools_report2011[schools_report2011$School.ID == allschoolsmat[i,"SCHOOL_CODE"],"Longitude"] ; 295 | allschoolsmat[i,"lat"] = schools_report2011[schools_report2011$School.ID == 
allschoolsmat[i,"SCHOOL_CODE"],"Latitude"]} 296 | } 297 | 298 | library(ggmap) 299 | 300 | garfieldparklocation = geocode('3250 W Monroe St, Chicago, IL 60612') 301 | allschoolsmat[(allschoolsmat$SCHOOL_CODE == 400095),"lon"] = garfieldparklocation$lon 302 | allschoolsmat[(allschoolsmat$SCHOOL_CODE == 400095),"lat"] = garfieldparklocation$lat 303 | 304 | # crane achievement has same address as crane tech 305 | allschoolsmat[allschoolsmat$SCHOOL_CODE == 610378,"lon"] = allschoolsmat[allschoolsmat$SCHOOL_CODE == 609702,"lon"] 306 | allschoolsmat[allschoolsmat$SCHOOL_CODE == 610378,"lat"] = allschoolsmat[allschoolsmat$SCHOOL_CODE == 609702,"lat"] 307 | 308 | # end of filling in lat/lon 309 | 310 | # indicator for years when school is active 311 | allschoolsmat$active2009 = NA 312 | allschoolsmat$active2010 = NA 313 | allschoolsmat$active2011 = NA 314 | allschoolsmat$active2012 = NA 315 | allschoolsmat$active2013 = NA 316 | 317 | for (i in 1:nrow(allschoolsmat)){ 318 | allschoolsmat[i,"active2009"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2009)) 319 | allschoolsmat[i,"active2010"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2010)) 320 | allschoolsmat[i,"active2011"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2011)) 321 | allschoolsmat[i,"active2012"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2012)) 322 | allschoolsmat[i,"active2013"] = length(intersect(allschoolsmat[i,"SCHOOL_CODE"], schools2013)) 323 | } 324 | 325 | allschoolsmat = allschoolsmat[(allschoolsmat$SCHOOL_CODE != 400152),] # this school doesn't have any enrollment 326 | 327 | sum(is.na(allschoolsmat)) # 0 328 | 329 | setwd("/Volumes/appdata-3/School Reports") 330 | write.csv(allschoolsmat, file = "schools_staticfeatures.csv", row.names = FALSE) 331 | 332 | # DONE 333 | 334 | 335 | # =============================================== # 336 | # school features that do change year to year # 337 | # =============================================== # 338 
| 339 | # list of schools 340 | setwd("/Volumes/appdata-3/Count Data") 341 | countdata = read.csv("enrollment_byschool_byyear_bygrade.csv") 342 | 343 | # make one data frame for each year to start 344 | 345 | schools_whatyouknowforfall11 = as.data.frame(unique(countdata[(countdata$YEAR == 2011),"SCHOOL_CODE"])) 346 | colnames(schools_whatyouknowforfall11) = "SCHOOL_CODE" 347 | schools_whatyouknowforfall11$year_topredict = 2011 348 | 349 | schools_whatyouknowforfall12 = as.data.frame(unique(countdata[(countdata$YEAR == 2012),"SCHOOL_CODE"])) 350 | colnames(schools_whatyouknowforfall12) = "SCHOOL_CODE" 351 | schools_whatyouknowforfall12$year_topredict = 2012 352 | 353 | schools_whatyouknowforfall13 = as.data.frame(unique(countdata[(countdata$YEAR == 2013),"SCHOOL_CODE"])) 354 | colnames(schools_whatyouknowforfall13) = "SCHOOL_CODE" 355 | schools_whatyouknowforfall13$year_topredict = 2013 356 | 357 | 358 | # race 359 | setwd("/Volumes/appdata-3/School Reports/Ethnicity") 360 | race1011 = read.csv("FY2011_race.csv") 361 | race1112 = read.csv("FY2012_race.csv") 362 | race1213 = read.csv("FY2013_race.csv") 363 | 364 | schools_whatyouknowforfall11 = merge(schools_whatyouknowforfall11, race1011, all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 365 | schools_whatyouknowforfall12 = merge(schools_whatyouknowforfall12, race1112, all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 366 | schools_whatyouknowforfall13 = merge(schools_whatyouknowforfall13, race1213, all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 367 | 368 | 369 | # AUSL turnarounds (source = AUSL site) 370 | 371 | # starting fall 2010 or before: bradwell, chicago academy, chicago academy HS, 372 | # collins, curtis, deneen, dodge, dulles, harvard, howe, johnson, morton, 373 | # national teachers academy, orr, phillips, sherman, solorio, tarkington 374 | AUSL_priorfall2010 = c(609806, 610248, 610340, 610499, 609900, 609883, 609888, 375 | 610263, 609971, 610000, 610274, 610257, 610231, 610389, 376 
| 609727, 610172, 610543, 610396) 377 | # starting fall 2011: none 378 | 379 | schools_whatyouknowforfall11$AUSL = (lapply(schools_whatyouknowforfall11$SCHOOL_CODE, function(x) (x %in% AUSL_priorfall2010)) == TRUE) 380 | 381 | 382 | # starting fall 2012: casals, fuller, herzl, marquette, piccolo, stagg 383 | AUSL_fall2012 = c(610021, 609928, 609991, 610053, 610106, 610339) 384 | schools_whatyouknowforfall12$AUSL = (lapply(schools_whatyouknowforfall12$SCHOOL_CODE, function(x) (x %in% AUSL_fall2012)) == TRUE) 385 | 386 | # starting fall 2013: carter, chalmers, dewey, lewis, o'keeffe 387 | AUSL_fall2013 = c(609844, 609851, 609885, 610036, 610103) 388 | schools_whatyouknowforfall13$AUSL = (lapply(schools_whatyouknowforfall13$SCHOOL_CODE, function(x) (x %in% AUSL_fall2013)) == TRUE) 389 | 390 | # starting fall 2014: dvorak, gresham, mcnair 391 | #AUSL_fall2014 = c(610254, 609955, 610282) 392 | 393 | # other turnarounds (by OSI): marshall HS, vocational career academy, tilden career HS, wendell smith elem, woodson south elem 394 | 395 | 396 | 397 | # mobility rate 398 | setwd("/Volumes/appdata-3/School Reports") 399 | mobilitydata = read.csv("mobilityrates.csv", na.strings = c(" ","")) 400 | # I think the year is the year the school year ended 401 | # (i.e. 
the rate for 2009 is for the 2008-09 school year) 402 | 403 | currentnames = names(schools_whatyouknowforfall12) 404 | 405 | schools_whatyouknowforfall11 = merge(schools_whatyouknowforfall11, mobilitydata[,c("School.ID","X2011")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 406 | colnames(schools_whatyouknowforfall11) = c(currentnames, "mobility_rate") 407 | 408 | schools_whatyouknowforfall12 = merge(schools_whatyouknowforfall12, mobilitydata[,c("School.ID","X2012")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 409 | colnames(schools_whatyouknowforfall12) = c(currentnames, "mobility_rate") 410 | 411 | schools_whatyouknowforfall13 = merge(schools_whatyouknowforfall13, mobilitydata[,c("School.ID","X2013")], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "School.ID") 412 | colnames(schools_whatyouknowforfall13) = c(currentnames, "mobility_rate") 413 | 414 | 415 | # report data 416 | setwd("/Volumes/appdata-3/Master_Data") 417 | progress1112 = read.csv("progress_2011_2012.csv", stringsAsFactors = FALSE) 418 | progress1213 = read.csv("progress_2012_2013.csv", stringsAsFactors = FALSE) 419 | progress1314 = read.csv("progress_2013_2014.csv", stringsAsFactors = FALSE) 420 | 421 | schools_whatyouknowforfall11 = merge(schools_whatyouknowforfall11, progress1112[,c(1,5:13)], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SchoolID") 422 | schools_whatyouknowforfall12 = merge(schools_whatyouknowforfall12, progress1213[,c(1,5:13)], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SchoolID") 423 | schools_whatyouknowforfall13 = merge(schools_whatyouknowforfall13, progress1314[,c(1,5:13)], all.x = TRUE, by.x = "SCHOOL_CODE", by.y = "SchoolID") 424 | 425 | schools_whatyouknowforfallxx = rbind(schools_whatyouknowforfall11, schools_whatyouknowforfall12, schools_whatyouknowforfall13) 426 | 427 | setwd("/Volumes/appdata-3/School Reports") 428 | write.csv(schools_whatyouknowforfallxx, file = "schools_changingfeatures.csv", row.names = FALSE) 429 | 430 | rm(list=ls()) 431 | 
setwd("/Volumes/DSSG/School Reports") 432 | data <- read.csv("schools_changingfeatures.csv") 433 | colnames(data) <- c("SchoolID", "YearToPredict", "White", "Black", "Hispanic", "AUSL", "Mobility", "Rating", "Probation", "Health", "Safety", "Family", "Environment", "Instruction", "Leader", "Teacher") 434 | 435 | data2011 <- subset(data, data$YearToPredict==2011) 436 | data2011$YearToPredict <- NULL 437 | write.csv(x=data2011, file="schools_changing_features_2011.csv", row.names=F) 438 | 439 | data2012 <- subset(data, data$YearToPredict==2012) 440 | data2012$YearToPredict <- NULL 441 | write.csv(x=data2012, file="schools_changing_features_2012.csv", row.names=F) 442 | 443 | data2013 <- subset(data, data$YearToPredict==2013) 444 | data2013$YearToPredict <- NULL 445 | write.csv(x=data2013, file="schools_changing_features_2013.csv", row.names=F) 446 | 447 | rm(list=ls()) 448 | setwd("/Volumes/DSSG/Master_Data/") 449 | enroll <- read.csv("enrollment.csv", stringsAsFactors=T) 450 | all_year <- sort(unique(enroll$YEAR)) 451 | name <- c("SCHOOL_CODE", "Year", "K", "G1", "G2", "G3", "G4", "G5", "G6", 452 | "G7", "G8", "G9", "G10", "G11", "G12") 453 | old_name <- c("K", "X1", "X2", "X3", "X4", "X5", "X6", 454 | "X7", "X8", "X9", "X10", "X11", "X12") 455 | 456 | active_grade <- function(year) { 457 | temp <- subset(enroll, enroll$YEAR==year) 458 | result <- data.frame(matrix(0, nrow(temp), length(name))) 459 | colnames(result) <- name 460 | result$SCHOOL_CODE <- temp$SCHOOL_CODE 461 | result$Year <- year 462 | for (i in 1:nrow(result)) { 463 | for (j in 1:length(old_name)) { 464 | flag <- subset(temp, temp$SCHOOL_CODE==result[i,]$SCHOOL_CODE & temp$GRADE==old_name[j])$ENROLLMENT 465 | result[i,j+2] <- ifelse(flag>0, 1, 0) 466 | } 467 | } 468 | return(result) 469 | } 470 | 471 | data <- list() 472 | for (i in 1:length(all_year)) { 473 | print(i) 474 | data[[i]] <- active_grade(all_year[i]) 475 | } 476 | final_data <- do.call("rbind", data) 477 | write.csv(x=final_data, 
file="/Users/zhouye/Dropbox/DSSG/avtive_grade.csv", row.names=F) 478 | 479 | rm(list=ls()) 480 | setwd("/Volumes/DSSG/Master_Data/") 481 | tract <- read.csv("../Other Datasets/percent_race_by_tract.csv") 482 | static <- read.csv("schools_static_features.csv") 483 | change2011 <- read.csv("schools_changing_features_2011.csv") 484 | change2012 <- read.csv("schools_changing_features_2012.csv") 485 | change2013 <- read.csv("schools_changing_features_2013.csv") 486 | fill_miss <- function(data) { 487 | index <- which(is.na(rowSums(data[,c("White", "Black", "Hispanic")]))) 488 | for (i in index) { 489 | print(i) 490 | id <- data[i,]$SchoolID 491 | j <- which(static$SchoolID==id) 492 | fips <- static[j,]$FIPSCode 493 | code <- as.numeric(substring(fips, 3, 13)) 494 | k <- which(tract$Tract==code) 495 | if (length(k)!=0) { 496 | data[i,c("White", "Black", "Hispanic")] <- tract[k,c("PercentWhite", "PercentBlack", "PercentHispanic")] 497 | } 498 | } 499 | return(data) 500 | } 501 | new2011 <- fill_miss(change2011) 502 | new2012 <- fill_miss(change2012) 503 | new2013 <- fill_miss(change2013) 504 | write.csv(x=new2011, file="schools_changing_features_2011.csv", row.names=F) 505 | write.csv(x=new2012, file="schools_changing_features_2012.csv", row.names=F) 506 | write.csv(x=new2013, file="schools_changing_features_2013.csv", row.names=F) 507 | 508 | 509 | 510 | -------------------------------------------------------------------------------- /Data_Pipeline/create_schools_table.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ##################################### 4 | ####Create School Static Features#### 5 | ##################################### 6 | 7 | ####CPS Team#### 8 | ####08/20/2014#### 9 | 10 | rm(list = ls()) 11 | setwd("//admin/appdata/DataWarehouse/DSSG/School Reports/") 12 | 13 | # Pull in data from CPS data warehouse #### 14 | library(RODBC) 15 | # an ODBC called "DW_QA" which accesses the K12intel_qa 
database 16 | channel <- odbcConnect("DW_QA") 17 | 18 | school_query <- "SELECT * 19 | FROM K12INTEL_DW.dbo.DSSG_School_Query 20 | WHERE SchoolKey>1" 21 | odbcClose(channel) 22 | 23 | schools = sqlQuery(channel, school_query, stringsAsFactors = FALSE) 24 | 25 | # Geocode school addresses #### 26 | library("ggmap") 27 | addresses = with(schools, paste(Address, City, State, ZipCode, sep = ", ")) 28 | good_addresses = !is.na(schools$Address) 29 | geocodeschools = geocode(addresses[good_addresses]) 30 | 31 | schools$Longitude[good_addresses] = geocodeschools$lon 32 | schools$Latitude[good_addresses] = geocodeschools$lat 33 | 34 | # plot(geocodeschools) # should look like chicago 35 | 36 | # Get census block group and tract information based on the geocode #### 37 | get_FIPS <- function(latlons) { 38 | library(RJSONIO) 39 | # latlons is a n X 2 matrix with latitude in first column and longitude in second column 40 | # returns an n-dimensional character vector of FIPS codes 41 | urls = paste0("http://data.fcc.gov/api/block/2010/find?format=json&latitude=", 42 | latlons[,1], "&longitude=", latlons[,2]) 43 | 44 | FIPSs = rep(NA, nrow(latlons)) 45 | index = which(!apply(is.na(latlons), 1, any)) 46 | for (i in index) { 47 | bk = fromJSON(urls[i]) 48 | 49 | if (bk$status == "OK") { 50 | FIPSs[i] = bk$Block 51 | } 52 | } 53 | 54 | FIPSs[index] = paste0("US", FIPSs[index]) 55 | return(FIPSs) 56 | } 57 | 58 | FIPS = get_FIPS(subset(schools, select = c("Latitude", "Longitude"))) 59 | schools$FIPSCode = FIPS 60 | 61 | write.table(schools, "//admin/appdata/DataWarehouse/DSSG/Master_Data/schools_static_features.csv", row.names = FALSE, sep = ",") 62 | 63 | 64 | ## School Aggregate Features #### 65 | setwd("//admin/appdata/DataWarehouse/DSSG/School Reports/") 66 | 67 | fileName1 <- '//admin/appdata/DataWarehouse/DSSG/School Reports/school_aggregate_features_vw.sql' 68 | aggr_query <- readChar(fileName1, file.info(fileName1)$size) 69 | 70 | library(RODBC) 71 | channel <- 
odbcConnect("DW_QA") 72 | school_aggr = sqlQuery(channel, aggr_query, stringsAsFactors = FALSE) 73 | odbcClose(channel) 74 | 75 | for (year in 2011:2013) { 76 | school_aggr_year = subset(school_aggr, SchoolYear == year) 77 | school_aggr_year = school_aggr_year[,-which(colnames(school_aggr)=="SchoolYear")] 78 | write.table(school_aggr_year, paste0("//admin/appdata/DataWarehouse/DSSG/Master_Data/schools_aggregate_features_",year,".csv"), row.names = FALSE, sep = ",") 79 | } 80 | 81 | -------------------------------------------------------------------------------- /Data_Pipeline/create_students_table.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ########################### 4 | ####Create Student data#### 5 | ########################### 6 | 7 | ####CPS Team#### 8 | ####08/20/2014#### 9 | 10 | setwd("//admin/appdata/DataWarehouse/DSSG/Student_Data") 11 | 12 | library(RODBC) 13 | 14 | fileName1 <- '//admin/appdata/DataWarehouse/DSSG/Student_Data/student_table_with_ninth_snapshot_vw.sql' 15 | student_query <- readChar(fileName1, file.info(fileName1)$size) 16 | fileName2 <- '//admin/appdata/DataWarehouse/DSSG/Student_Data/student_ISAT_scores_vw.sql' 17 | tests_query <- readChar(fileName2, file.info(fileName2)$size) 18 | fileName3 <- '//admin/appdata/DataWarehouse/DSSG/Student_Data/student_attendance_vw.sql' 19 | attendance_query <- readChar(fileName3, file.info(fileName3)$size) 20 | 21 | # an ODBC called "DW_QA" which accesses the K12intel_qa database 22 | channel <- odbcConnect("DW_QA") 23 | students = sqlQuery(channel, student_query, stringsAsFactors = TRUE) 24 | tests = sqlQuery(channel, tests_query) 25 | attendance = sqlQuery(channel, attendance_query) 26 | odbcCloseAll() 27 | 28 | # indicators of whether the student left the system after 8th 29 | # or entered the system in 9th grade 30 | students$Left <- as.numeric(is.na(students$NextGradeSchoolKey)) 31 | students$Entered <- 
as.numeric(is.na(students$ThisGradeSchoolKey))
student_addresses$longitude <- newset[,5] 39 | 40 | #write to csv 41 | write.csv(student_addresses, file="student_address_table.csv", row.names=F) 42 | -------------------------------------------------------------------------------- /Data_Pipeline/school_aggregate_features_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.DSSG_SCHOOL_AGGREGATE_FEATURES_VW -------------------------------------------------------------------------------- /Data_Pipeline/student_ISAT_scores_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.dbo.DSSG_STUDENT_ISTAT_SCORES_VW -------------------------------------------------------------------------------- /Data_Pipeline/student_attendance_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.dbo.DSSG_STUDENT_ATTENDANCE_VW -------------------------------------------------------------------------------- /Data_Pipeline/student_table_with_ninth_snapshot_vw.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM K12INTEL_DW.dbo.DSSG_STUDENT_TABLE_WITH_NINTH_SNAPSHOT_VW -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ##Predicting School-Level Enrollment for Chicago Public Schools## 2 | 3 | 4 | Statistical models and analysis of student enrollment to better allocate school budgets in Chicago. 5 | This project is a 2014 Data Science for Social Good project in partnership with Chicago Public Schools. 6 | 7 | For a more comprehensive overview, please consult our [blog post](http://dssg.io/2014/07/23/cps-enrollment-prediction.html). 
To download the code we used for the project, you will need to clone the repository using Git. [Git](http://git-scm.com/) is the version control system we used to organize our code. We hosted the code on [Github](http://github.com/). You can download Git [here](http://git-scm.com/downloads). Information on getting started with Git is [here](http://git-scm.com/book/en/Getting-Started-Git-Basics). Additionally, you will need to create a Github account.

Once you have installed Git, you will need to navigate in the command line to the folder in which you want to download the code. Then you will need to clone the repository with the following commands.
32 | 33 | ``` 34 | git clone https://github.com/dssg/predicting_student_enrollment.git 35 | cd predicting_student_enrollment/ 36 | ``` 37 | 38 | #### R 39 | 40 | We did most of our analyses in R. R is available for free via the [R Project website](http://www.r-project.org) or, with a slightly friendlier user interface, as [RStudio](http://www.rstudio.com). See [these resources](https://github.com/dssg/nfp#using-r) for a more detailed overview of R. 41 | 42 | Much of R's functionality comes not from the base installation but from additional packages. Details about how to install packages are available [here](http://www.r-bloggers.com/installing-r-packages). Our code makes use of the following packages: 43 | 44 | * ggplot2 45 | * plyr 46 | * survival 47 | * clogitL1 48 | * randomForest 49 | 50 | ### Team 51 | ![CPS Team](http://dssg.io/img/posts/cps-team.png) 52 | 53 | * [Vanessa Ko](https://github.com/vanessako), Political Science, McGill University 54 | * [Andrew Landgraf](https://github.com/andland), Statistics, The Ohio State University 55 | * [Tracy Schifeling](https://github.com/tracyschifeling), Statistics, Duke University 56 | * [Zhou Ye](https://github.com/ZhouYeJava), Computer Science, Johns Hopkins University 57 | * [Joe Walsh](https://github.com/jtwalsh0), Mentor 58 | 59 | ### License 60 | Copyright (C) 2014 Data Science for Social Good Fellowship at the University of Chicago 61 | 62 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 63 | 64 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
65 | 66 | THE SOFTWARE IS PROVIDED "AS IS," WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 67 | -------------------------------------------------------------------------------- /Tool_Box/README.md: -------------------------------------------------------------------------------- 1 | ##CPS Tool Box## 2 | 3 | ###Description### 4 | Models For CPS: 5 | 6 | 1. Cohort Survival Model: **cohort_survival.R**. 7 | 8 | 2. Decision Tree: **model_diagnostics.R**. 9 | 10 | 3. Conditional Logistic Regression: **grade9_prediction.R**. 11 | 12 | The model code is designed to run on **RStudio**. We recommend RStudio since it is the best way to edit and run R for people without much experience before. 13 | 14 | In order to run three models, you should use the script *main.R*. There are three sections in the code that correspond to the three models. 15 | 16 | For all parts, you need to source the function scripts, set the parameters and run the function to obtain the results. The details of three models are introduced below. 17 | 18 | **We expect CPS to use the above four scripts.** There are also evaluation scripts which are used to evaluate the model: *eval_grade9_prediction.R* and *eval_new_grade9_prediction.R*. The first one is to evaluate on all schools and the second one is on new schools. These codes are used by ourselves to see how the model will perform. 19 | 20 | 21 | ###Cohort Survival Model### 22 | This is essentially the replication of CPS's work on predicting the next year enrollment for each grade of each school given five-year history. 
For example, if we want to predict the enrollment for 2013, we need to use the enrollment data from 2008 to 2012. Our replication improves on the original VBA code: the replication needs 3 seconds to run while the original VBA requires 1 hour and 25 minutes. 23 | 24 | This is the code to run the script: 25 | 26 | ``` 27 | source("cohort_survival.R") 28 | location <- "/mnt/data/cps/Data/" 29 | year_to_predict <- 2013 30 | result <- cohort_survival(location, year_to_predict) 31 | ``` 32 | 33 | The variable "location" is the directory that stores all data files. The variable "year_to_predict" is the year you want to make the prediction for. For example, if you want to make predictions for 2013, then type 2013. The code will automatically find the previous five-year enrollment data for you. 34 | 35 | The input data for the cohort survival model is one active grade file and five enrollment history files. We stress the usage of the active grade file. The column names of this file are SchoolID and all grades. Before making predictions, we suggest knowing, for each school and each grade, whether a prediction is needed. If a grade at a certain school needs a prediction, then put 1 there; otherwise 0. As a default, set all entries to 1. 36 | 37 | The output is a csv file called "csm_prediction_year.csv". The year corresponds to the year you input into the model. 38 | 39 | 40 | ###Decision Tree### 41 | This part is to help CPS diagnose their predictions. Specifically, it will help CPS find the set of schools with the highest likelihood of being mis-projected given the threshold. 
42 | 43 | This is the code to run the script: 44 | 45 | ``` 46 | source("model_diagnostics.R") 47 | location <- "/mnt/data/cps/Data/" 48 | year_to_train <- 2012 49 | threshold <- 50 50 | grade <- "Grade9" 51 | model <- diagnostic(location, year_to_train, threshold, grade) 52 | year_to_predict <- 2013 53 | result <- prediction(location, year_to_predict, threshold, grade, model) 54 | ``` 55 | 56 | The variable "location" is the directory to store all data files. The variable "year_to_train" is the year you want to analyze the misprojection for. The variable "threshold" shows the tolerance for the misprojection: if the misprojection is larger than the threshold, it is bad; otherwise, good. The variable "grade" is the grade you want to make diagnostics on. 57 | 58 | In the code above, we will make diagnostics on the year 2012 and grade 9 with a threshold of 50. The output of the "diagnostic" function is the R object for the decision tree and a visualization of the decision tree. You will find the probability of making a bad projection in the leaves of the tree to see which types of schools are likely to be misprojected. 59 | 60 | The decision tree is very sensitive to the threshold. If you set the threshold to 50 or 25, the two trees might be very different. Thus you need to know what threshold you are interested in before using this script. 61 | 62 | Another function is "prediction". This function is used to predict the probability of making a bad projection in the future. The hope here is to help CPS adjust predictions more accurately. We suggest CPS use this model when you do not have other solid information when adjusting predictions. The input arguments are similar to those of "diagnostic", but it also requires the model trained by "diagnostic" as an input. 63 | 64 | If you want to make a prediction, the output will be "misprojection_year.csv". The year depends on your inputs. 
65 | 66 | 67 | ###Conditional Logistic Regression### 68 | This is our main work of this summer -- a student-level model to make predictions for the 8th to 9th grade. The model we use is a conditional logistic regression. It can provide the probability of student i going to school j for all students and all schools. You can also obtain the predictive enrollment from these probabilities. 69 | 70 | 71 | This is the code to run the script: 72 | 73 | ``` 74 | source("grade9_prediction.R") 75 | location <- "/mnt/data/cps/Data/" 76 | year_to_predict <- 2012 77 | result <- grade9_prediction(location, year_to_predict) 78 | ``` 79 | 80 | The input is exactly the same as the cohort survival model. 81 | 82 | For output, there are two RData files. They are the data frames for training and testing the conditional logit model. Since the data has over 4,000,000 rows and 80 columns, they are saved as RData files. Use the "load" function in R to read the data. The variable names are "data_last_year" and "data_this_year". The code to generate the RData files is (variable "year" is the year to make the prediction on): 83 | 84 | ``` 85 | save(data_last_year, file=paste0("clogit_", year, "_data_", year-1, ".RData")) #last year data 86 | save(data_this_year, file=paste0("clogit_", year, "_data_", year, ".RData")) #this year data 87 | ``` 88 | 89 | There are also four csv files associated with the probability of student i going to school j. 
The code to generate the probability files is: 90 | 91 | ``` 92 | write.csv(x=prob_d, file=paste0("clogit_", year, "_prob_d.csv"), row.names=F) #distance 93 | write.csv(x=prob_dc, file=paste0("clogit_", year, "_prob_dc.csv"), row.names=F) #distance+catchment 94 | write.csv(x=prob_dcr, file=paste0("clogit_", year, "_prob_dcr.csv"), row.names=F) #distance+catchment+rating 95 | write.csv(x=prob_all, file=paste0("clogit_", year, "_prob_all.csv"), row.names=F) #all features 96 | ``` 97 | 98 | There are four models which are associated with what features we want to put in: 99 | 100 | 1. d: we only use distance from home to high school as a feature. 101 | 102 | 2. dc: we use distance and whether the school is a catchment school as features. 103 | 104 | 3. dcr: we use distance, catchment school and school rating as features. 105 | 106 | 4. all: we use all the features, including the above and also previous enrollment, school attendance rate, etc. 107 | 108 | Another four csv files, associated with the four models above, are the enrollment prediction files. The code to generate the enrollment files is: 109 | 110 | ``` 111 | write.csv(x=enrollment_d, file=paste0("clogit_", year, "_prediction_d.csv"), row.names=F) #distance 112 | write.csv(x=enrollment_dc, file=paste0("clogit_", year, "_prediction_dc.csv"), row.names=F) #distance+catchment 113 | write.csv(x=enrollment_dcr, file=paste0("clogit_", year, "_prediction_dcr.csv"), row.names=F) #distance+catchment+rating 114 | write.csv(x=enrollment_all, file=paste0("clogit_", year, "_prediction_all.csv"), row.names=F) #all features 115 | ``` 116 | 117 | The predictive enrollment is obtained by summing up the probability of all students for a specific school. 118 | 119 | Finally, another two files are the baseline and our final ensemble enrollment prediction. CPS should use the ensemble file as their prediction of enrollment of 9th grade in practice. 
The code to generate the two enrollment files is: 120 | 121 | ``` 122 | write.csv(x=baseline, file=paste0("baseline_", year, "_prediction_all.csv"), row.names=F) #baseline 123 | write.csv(x=ensemble, file=paste0("ensemble_", year, "_prediction_all.csv"), row.names=F) #final model 124 | ``` 125 | 126 | The baseline model copies last year's enrollment for old schools; for the new schools, we use (total students - total students predicted in old schools)/(number of new schools). This will guarantee the total students match. 127 | 128 | We recommend that CPS use **ensemble_year_prediction_all.csv** for the final prediction. The year depends on the year you want to make the prediction for. It uses clogit to predict new schools and proportionally scales the students at old schools based on last year's enrollment so that the total number of students is correct. 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /Tool_Box/cohort_survival.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ############################# 4 | ####Cohort Survival Model#### 5 | ############################# 6 | 7 | ####CPS Team#### 8 | ####08/19/2014#### 9 | 10 | ####Moving Average#### 11 | #data: history 12 | #num: years of moving average 13 | ma <- function(data, num) { 14 | pred <- vector(length=length(data)-num) 15 | for (i in 1:(length(data)-num)) { 16 | pred[i] <- mean(data[i:(i+1)]) 17 | } 18 | return(pred) 19 | } 20 | 21 | ####Exponential Smoothing#### 22 | #data: history 23 | #num: percentage of exponential smoothing 24 | es <- function(data, num) { 25 | pred <- vector(length=length(data)-1) 26 | for (i in 1:(length(data)-1)) { 27 | if (i==1) { 28 | pred[i] <- data[i] 29 | } 30 | else { 31 | pred[i] <- num*data[i]+(1-num)*pred[i-1] 32 | } 33 | } 34 | return(pred) 35 | } 36 | 37 | ####All Models#### 38 | #enroll_vector: historical enrollment 39 | #this will 
provide the best prediction based on six models 40 | model <- function(enroll_vector) { 41 | data <- enroll_vector[enroll_vector!=0] 42 | if (length(data)==1) { 43 | cat(paste0("Single"), "\n") 44 | return(data) 45 | } 46 | else if (length(data)==2) { 47 | cat(paste0("Double"), "\n") 48 | return(mean(data)) 49 | } 50 | else { 51 | candidate <- vector(length=6) 52 | value <- vector(length=6) 53 | #MA1 54 | pred <- ma(data, 1) 55 | real <- data[-1] 56 | candidate[1] <- mean((real-pred)^2) 57 | value[1] <- pred[length(pred)] 58 | #MA2 59 | pred <- ma(data, 2) 60 | real <- data[-c(1,2)] 61 | candidate[2] <- mean((real-pred)^2) 62 | value[2] <- pred[length(pred)] 63 | #ES2 64 | pred <- es(data, 0.2) 65 | real <- data[-1] 66 | candidate[3] <- mean((real-pred)^2) 67 | value[3] <- pred[length(pred)] 68 | #ES3 69 | pred <- es(data, 0.3) 70 | real <- data[-1] 71 | candidate[4] <- mean((real-pred)^2) 72 | value[4] <- pred[length(pred)] 73 | #ES4 74 | pred <- es(data, 0.4) 75 | real <- data[-1] 76 | candidate[5] <- mean((real-pred)^2) 77 | value[5] <- pred[length(pred)] 78 | #ES5 79 | pred <- es(data, 0.5) 80 | real <- data[-1] 81 | candidate[6] <- mean((real-pred)^2) 82 | value[6] <- pred[length(pred)] 83 | all_model <- c("MA1", "MA2", "ES2", "ES3", "ES4", "ES5") 84 | cat(paste0(all_model[which.min(candidate)], "\n")) 85 | return(value[which.min(candidate)]) 86 | } 87 | } 88 | 89 | ####Cohort Survival Model#### 90 | #last_grade: name of last grade 91 | #this_grade: name of this grade 92 | #city_csr: city-level cohort survival rate 93 | csm <- function(last_grade, this_grade, city_csr) { 94 | ratio <- unlist(this_grade)[-1]/unlist(last_grade)[-length(unlist(last_grade))] 95 | ratio[is.nan(ratio)] <- 0 96 | ratio[is.infinite(ratio)] <- 0 97 | if (sum(ratio)!=0) { 98 | temp <- model(ratio) 99 | if (temp!=0) { 100 | return(round(unlist(last_grade)[length(unlist(last_grade))]*temp)) 101 | } 102 | else { 103 | 
return(round(unlist(last_grade)[length(unlist(last_grade))]*mean(ratio[ratio!=0]))) 104 | } 105 | } 106 | else { 107 | return(round(unlist(last_grade)[length(unlist(last_grade))]*city_csr[,names(this_grade)])) 108 | } 109 | } 110 | 111 | ####Entry Level Model#### 112 | #this_grade: name of this grade 113 | #city_el: city-level enrollment 114 | el <- function(this_grade, city_el) { 115 | if (sum(this_grade)!=0) { 116 | temp <- model(unlist(this_grade)) 117 | if (temp!=0) { 118 | return(round(temp)) 119 | } 120 | else { 121 | return(round(city_el[,names(this_grade)])) 122 | } 123 | } 124 | else { 125 | return(round(city_el[,names(this_grade)])) 126 | } 127 | } 128 | 129 | ####Predict Next Year Enrollment#### 130 | #history: enrollment history 131 | #grade: the grade we want to make prediction 132 | #city_csr: city-level cohort survival rate 133 | #city_el: city-level enrollment 134 | pred_enroll <- function(history, grade, city_csr, city_el) { 135 | pred <- data.frame(matrix(NA, 1, ncol(history))) 136 | name <- colnames(history) 137 | colnames(pred) <- name 138 | pred$SchoolID <- unique(history$SchoolID) 139 | for (i in 2:(ncol(grade)-1)) { 140 | if (grade[i]!=0) { #active grade 141 | cat(paste0(name[i], "\n")) 142 | if (name[i]=="K" | name[i]=="Grade1" | name[i]=="Grade9" | name[i]=="LRE") { 143 | pred[i] <- el(history[i], city_el) #entry level model 144 | } 145 | else { 146 | if (history[nrow(history),i-1]!=0) { #check if last grade last year has enrollment 147 | pred[i] <- csm(history[i-1], history[i], city_csr) #cohort survival model 148 | } 149 | else { 150 | pred[i] <- el(history[i], city_el) #entry level model 151 | } 152 | } 153 | } 154 | else { 155 | pred[i] <- 0 156 | } 157 | } 158 | pred$ATOT <- sum(pred[2:(length(pred)-1)]) 159 | return(pred) 160 | } 161 | 162 | ####Main Function For Cohort Survival Model#### 163 | #location: data source 164 | #year: year to predict 165 | cohort_survival <- function(location, year) { 166 | #Read Data 167 | active <- 
read.csv(paste0(location, "active", year, ".csv")) #active schools and grades 168 | school1 <- read.csv(paste0(location, "true", year-1, ".csv")) #enrollment -1 year 169 | school2 <- read.csv(paste0(location, "true", year-2, ".csv")) #enrollment -2 year 170 | school3 <- read.csv(paste0(location, "true", year-3, ".csv")) #enrollment -3 year 171 | school4 <- read.csv(paste0(location, "true", year-4, ".csv")) #enrollment -4 year 172 | school5 <- read.csv(paste0(location, "true", year-5, ".csv")) #enrollment -5 year 173 | school_list <- list(school5, school4, school3, school2, school1) 174 | #Set Up 175 | all_id <- unique(active$SchoolID) 176 | all_school <- do.call("rbind", school_list) 177 | all_school$SchoolID <- NULL 178 | all_school$ATOT <- NULL 179 | #City Level Cohort Survival Rate 180 | city_csr <- data.frame(Grade2=0.98, Grade3=1.03, Grade4=0.93, Grade5=0.98, Grade6=1, Grade7=0.97, Grade8=0.98, Grade10=1.04, Grade11=0.84, Grade12=0.91) 181 | city_el <- data.frame(matrix(apply(all_school, 2, function(x) {mean(x[x!=0])}), 1, ncol(all_school))) 182 | colnames(city_el) <- colnames(all_school) 183 | #Make Prediction 184 | name <- c("SchoolID", "K", "Grade1", "Grade2", "Grade3", "Grade4", "Grade5", "Grade6", "Grade7", "Grade8", "Grade9", "Grade10", "Grade11", "Grade12", "LRE", "ATOT") 185 | result <- data.frame(matrix(NA, length(all_id), length(name))) 186 | colnames(result) <- name 187 | for (i in 1:length(all_id)) { 188 | id = all_id[i] 189 | cat(paste0(id), "\n") 190 | active_grade <- subset(active, active$SchoolID==id) 191 | history <- data.frame(matrix(NA, length(school_list), length(name))) 192 | colnames(history) <- name 193 | for (j in 1:length(school_list)) { 194 | school <- school_list[[j]] 195 | index <- which(school$SchoolID==id) 196 | if (length(index)!=0) { 197 | history[j,] <- school[index,] 198 | } 199 | else { 200 | history[j,] <- c(id, rep(0, length(name)-1)) 201 | } 202 | } 203 | result[i,] <- pred_enroll(history, active_grade, city_csr, city_el) 
204 | } 205 | write.csv(x=result, file=paste0("csm_prediction_", year, ".csv"), row.names=F) 206 | return(result) 207 | } -------------------------------------------------------------------------------- /Tool_Box/eval_grade9_prediction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ################################################ 4 | ####Evaluate Conditional Logistic Regression#### 5 | ################################################ 6 | 7 | ####CPS Team#### 8 | ####08/18/2014#### 9 | 10 | rm(list=ls()) 11 | setwd("/home/zye/Model/") 12 | year <- 2013 13 | 14 | ####Packages#### 15 | suppressMessages(library(stringr)) 16 | suppressMessages(library(reshape2)) 17 | suppressMessages(library(ggplot2)) 18 | 19 | ####Accuracy#### 20 | accuracy <- function(prob, true_school, len) { 21 | acc <- vector(length=len) 22 | temp <- t(apply(-prob[,-1], 1, rank, ties.method="random")) 23 | for (k in 1:len) { 24 | top_k <- (temp<=k)*1 25 | acc[k] <- sum(true_school*top_k)/nrow(top_k) 26 | } 27 | return(acc) 28 | } 29 | 30 | ####Evaluation#### 31 | #True Enrollment This Year 32 | load(paste0("clogit_", year, "_data_", year, ".RData")) 33 | true_school <- dcast(data_this_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected")[,-1] 34 | true_enrollment <- data.frame(SchoolKey=as.numeric(names(true_school)), Enrollment=round(colSums(true_school))) 35 | #Predictive Probability 36 | d <- read.csv(paste0("clogit_", year, "_prob_d.csv")) #distance 37 | dc <- read.csv(paste0("clogit_", year, "_prob_dc.csv")) #distance+catchment 38 | dcr <- read.csv(paste0("clogit_", year, "_prob_dcr.csv")) #distance+catchment+rating 39 | all <- read.csv(paste0("clogit_", year, "_prob_all.csv")) #all features 40 | #Predictive Enrollment 41 | d_prediction <- read.csv(paste0("clogit_", year, "_prediction_d.csv")) 42 | dc_prediction <- read.csv(paste0("clogit_", year, "_prediction_dc.csv")) 43 | dcr_prediction <- read.csv(paste0("clogit_", 
year, "_prediction_dcr.csv")) 44 | all_prediction <- read.csv(paste0("clogit_", year, "_prediction_all.csv")) 45 | baseline_prediction <- read.csv(paste0("baseline_", year, "_prediction_all.csv")) 46 | ensemble_prediction <- read.csv(paste0("ensemble_", year, "_prediction_all.csv")) 47 | #Compute Accuracy From Probability 48 | num <- 10 49 | d_acc <- data.frame(Model=rep("D", num), Accuracy=accuracy(d, true_school, num)) 50 | dc_acc <- data.frame(Model=rep("D+C", num), Accuracy=accuracy(dc, true_school, num)) 51 | dcr_acc <- data.frame(Model=rep("D+C+R", num), Accuracy=accuracy(dcr, true_school, num)) 52 | all_acc <- data.frame(Model=rep("ALL", num), Accuracy=accuracy(all, true_school, num)) 53 | acc_graph <- do.call("rbind", list(d_acc, dc_acc, dcr_acc, all_acc)) 54 | acc_graph$Rank <- factor(1:num, levels=1:num) 55 | ggplot(acc_graph, aes(x=Rank, y=Accuracy, group=Model, colour=Model))+geom_line() 56 | #Compare Enrollment 57 | d_compare <- merge(d_prediction, true_enrollment, by="SchoolKey") 58 | dc_compare <- merge(dc_prediction, true_enrollment, by="SchoolKey") 59 | dcr_compare <- merge(dcr_prediction, true_enrollment, by="SchoolKey") 60 | all_compare <- merge(all_prediction, true_enrollment, by="SchoolKey") 61 | baseline_compare <- merge(baseline_prediction, true_enrollment, by="SchoolKey") 62 | ensemble_compare <- merge(ensemble_prediction, true_enrollment, by="SchoolKey") 63 | d_mae <- mean(abs(d_compare$Enrollment.x-d_compare$Enrollment.y)) 64 | dc_mae <- mean(abs(dc_compare$Enrollment.x-dc_compare$Enrollment.y)) 65 | dcr_mae <- mean(abs(dcr_compare$Enrollment.x-dcr_compare$Enrollment.y)) 66 | all_mae <- mean(abs(all_compare$Enrollment.x-all_compare$Enrollment.y)) 67 | baseline_mae <- mean(abs(baseline_compare$Enrollment.x-baseline_compare$Enrollment.y)) 68 | ensemble_mae <- mean(abs(ensemble_compare$Enrollment.x-ensemble_compare$Enrollment.y)) 69 | mae_graph <- data.frame(Model=factor(c("clogit", "baseline", "ensemble"), levels=c("clogit", "baseline", 
"ensemble")), MAE=c(all_mae, baseline_mae, ensemble_mae)) 70 | ggplot(mae_graph, aes(x=Model, y=MAE))+geom_bar(stat="identity", fill="midnightblue") 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /Tool_Box/eval_new_grade9_prediction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ########################### 4 | ###Evaluation New School### 5 | ########################### 6 | 7 | ###Zhou Ye### 8 | ###08/06/2014### 9 | 10 | rm(list=ls()) 11 | setwd("/home/zye/Model") 12 | year <- 2013 13 | suppressMessages(library(stringr)) 14 | suppressMessages(library(reshape2)) 15 | suppressMessages(library(ggplot2)) 16 | result <- read.csv(paste0("clogit_", year, "_prob_all.csv")) 17 | load(paste0("clogit_", year, "_data_", year-1, ".RData")) 18 | load(paste0("clogit_", year, "_data_", year, ".RData")) 19 | new_school <- setdiff(unique(data_this_year$SchoolKey), unique(data_last_year$SchoolKey)) 20 | 21 | ###Reshape### 22 | reshape_enroll <- function(data) { 23 | school_key <- as.numeric(str_replace(names(data), "X", "")) 24 | enroll <- as.numeric(data) 25 | school_enroll <- data.frame(SchoolKey=school_key, Enrollment=enroll) 26 | return(school_enroll) 27 | } 28 | 29 | ###Distance### 30 | rdist_earth_vec <- function(x1, x2, miles=T, R=NULL) { 31 | #modified from fields::rdist.earth 32 | if (is.null(R)) { 33 | if (miles) { 34 | R <- 3963.34 35 | } 36 | else { 37 | R <- 6378.388 38 | } 39 | } 40 | coslat1 <- cos((x1[,2]*pi)/180) 41 | sinlat1 <- sin((x1[,2]*pi)/180) 42 | coslon1 <- cos((x1[,1]*pi)/180) 43 | sinlon1 <- sin((x1[,1]*pi)/180) 44 | coslat2 <- cos((x2[,2]*pi)/180) 45 | sinlat2 <- sin((x2[,2]*pi)/180) 46 | coslon2 <- cos((x2[,1]*pi)/180) 47 | sinlon2 <- sin((x2[,1]*pi)/180) 48 | pp <- cbind(coslat1*coslon1, coslat1*sinlon1, sinlat1)*cbind(coslat2*coslon2, coslat2*sinlon2, sinlat2) 49 | pp <- rowSums(pp) 50 | return(R*acos(ifelse(abs(pp)>1, 
1*sign(pp), pp))) 51 | } 52 | 53 | ###Prediction### 54 | prediction <- round(colSums(result[,-1])) 55 | pred_enroll <- reshape_enroll(prediction) 56 | 57 | ###True Enrollment### 58 | real <- dcast(data_this_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected") 59 | true_enroll <- reshape_enroll(colSums(real[,-1])) 60 | 61 | ###School Location### 62 | school_location <- unique(with(data_this_year, data.frame(SchoolKey, LongitudeHS, LatitudeHS))) 63 | old_school_location <- subset(school_location, !SchoolKey%in%new_school) 64 | old_school_location$Distance <- NA 65 | 66 | ###Baseline### 67 | last_year <- dcast(data_last_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected") 68 | last_enroll <- reshape_enroll(colSums(last_year[,-1])) 69 | #Last Year Mean 70 | baseline1 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 71 | for (i in 1:nrow(baseline1)) { 72 | index <- which(last_enroll$SchoolKey==baseline1[i,"SchoolKey"]) 73 | if (length(index)>0) { 74 | baseline1[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 75 | } 76 | } 77 | baseline1[is.na(baseline1$Enrollment),"Enrollment"] <- round(mean(baseline1$Enrollment, na.rm=T)) 78 | #Last Year Median 79 | baseline2 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 80 | for (i in 1:nrow(baseline1)) { 81 | index <- which(last_enroll$SchoolKey==baseline2[i,"SchoolKey"]) 82 | if (length(index)>0) { 83 | baseline2[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 84 | } 85 | } 86 | baseline2[is.na(baseline2$Enrollment),"Enrollment"] <- round(median(baseline2$Enrollment, na.rm=T)) 87 | #KNN Mean 88 | k <- 5 89 | baseline3 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 90 | for (i in 1:nrow(baseline3)) { 91 | index <- which(last_enroll$SchoolKey==baseline3[i,"SchoolKey"]) 92 | if (length(index)>0) { 93 | baseline3[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 94 | } 95 | else { 96 | self_location <- subset(school_location, SchoolKey==baseline3[i,"SchoolKey"]) 97 | 
self_coordinate <- with(self_location, t(replicate(nrow(old_school_location), c(LongitudeHS, LatitudeHS)))) 98 | old_school_location <- within(old_school_location, { 99 | Distance <- rdist_earth_vec(cbind(LongitudeHS, LatitudeHS), self_coordinate) 100 | }) 101 | close_school_location <- old_school_location[order(old_school_location$Distance),][1:k,] 102 | baseline3[i,"Enrollment"] <- round(mean(subset(last_enroll, SchoolKey%in%close_school_location$SchoolKey)$Enrollment)) 103 | } 104 | } 105 | #KNN Median 106 | k <- 5 107 | baseline4 <- data.frame(SchoolKey=true_enroll$SchoolKey, Enrollment=NA) 108 | for (i in 1:nrow(baseline4)) { 109 | index <- which(last_enroll$SchoolKey==baseline4[i,"SchoolKey"]) 110 | if (length(index)>0) { 111 | baseline4[i,"Enrollment"] <- last_enroll[index,"Enrollment"] 112 | } 113 | else { 114 | self_location <- subset(school_location, SchoolKey==baseline3[i,"SchoolKey"]) 115 | self_coordinate <- with(self_location, t(replicate(nrow(old_school_location), c(LongitudeHS, LatitudeHS)))) 116 | old_school_location <- within(old_school_location, { 117 | Distance <- rdist_earth_vec(cbind(LongitudeHS, LatitudeHS), self_coordinate) 118 | }) 119 | close_school_location <- old_school_location[order(old_school_location$Distance),][1:k,] 120 | baseline4[i,"Enrollment"] <- round(median(subset(last_enroll, SchoolKey%in%close_school_location$SchoolKey)$Enrollment)) 121 | } 122 | } 123 | 124 | ###Evaluation### 125 | pred <- subset(merge(pred_enroll, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 126 | base1 <- subset(merge(baseline1, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 127 | base2 <- subset(merge(baseline2, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 128 | base3 <- subset(merge(baseline3, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 129 | base4 <- subset(merge(baseline4, true_enroll, by="SchoolKey"), SchoolKey%in%new_school) 130 | pred_mae <- mean(abs(pred$Enrollment.x-pred$Enrollment.y)) 131 | base1_mae <- 
mean(abs(base1$Enrollment.x-base1$Enrollment.y)) 132 | base2_mae <- mean(abs(base2$Enrollment.x-base2$Enrollment.y)) 133 | base3_mae <- mean(abs(base3$Enrollment.x-base3$Enrollment.y)) 134 | base4_mae <- mean(abs(base4$Enrollment.x-base4$Enrollment.y)) 135 | mae_graph <- data.frame(Model=factor(c("baseline1", "baseline2", "baseline3", "baseline4", "clogit"), levels=c("baseline1", "baseline2", "baseline3", "baseline4", "clogit")), MAE=c(base1_mae, base2_mae, base3_mae, base4_mae, pred_mae)) 136 | ggplot(mae_graph, aes(x=Model, y=MAE))+geom_bar(stat="identity", fill="midnightblue") 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /Tool_Box/grade9_prediction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ########################## 4 | ####Grade 9 Prediction#### 5 | ########################## 6 | 7 | ####CPS Team#### 8 | ####08/17/2014#### 9 | 10 | ####Packages#### 11 | suppressMessages(library(dplyr)) 12 | suppressMessages(library(fields)) 13 | suppressMessages(library(reshape2)) 14 | suppressMessages(library(clogitL1)) 15 | suppressMessages(library(survival)) 16 | suppressMessages(library(stringr)) 17 | 18 | ####Compute Distance on Map into a Vector#### 19 | #modified from fields::rdist.earth (see rdist.earth for more details) 20 | #now x1 and x2 can be vectors 21 | rdist_earth_vec <- function(x1, x2, miles=T, R=NULL) { 22 | if (is.null(R)) { 23 | if (miles) { 24 | R <- 3963.34 25 | } 26 | else { 27 | R <- 6378.388 28 | } 29 | } 30 | coslat1 <- cos((x1[,2]*pi)/180) 31 | sinlat1 <- sin((x1[,2]*pi)/180) 32 | coslon1 <- cos((x1[,1]*pi)/180) 33 | sinlon1 <- sin((x1[,1]*pi)/180) 34 | coslat2 <- cos((x2[,2]*pi)/180) 35 | sinlat2 <- sin((x2[,2]*pi)/180) 36 | coslon2 <- cos((x2[,1]*pi)/180) 37 | sinlon2 <- sin((x2[,1]*pi)/180) 38 | pp <- cbind(coslat1*coslon1, coslat1*sinlon1, sinlat1)*cbind(coslat2*coslon2, coslat2*sinlon2, sinlat2) 39 | pp <- 
rowSums(pp) 40 | return(R*acos(ifelse(abs(pp)>1, 1*sign(pp), pp))) 41 | } 42 | 43 | ###Construct Data Set for Conditional Logistic Regression### 44 | #location: the path to the data 45 | #year: the year we want make prediction 46 | #num: Inf for all schools; otherwise only "num" schools are used 47 | construct_data <- function(location, year, num) { 48 | #Process Data 49 | students <- read.csv(paste0(location, "students", year, ".csv"), stringsAsFactors=F) #student information 50 | students <- subset(students, !(EducationType%in%c("21 - 60%", "61 - 100%"))) #exclude LRE2/LRE3 students 51 | names(students)[names(students)=="Longitude"] <- "LongitudeHome" 52 | names(students)[names(students)=="Latitude"] <- "LatitudeHome" 53 | schools_static <- read.csv(paste0(location, "schools_static_features.csv"), stringsAsFactors=F) #school static information 54 | schools_progress <- read.csv(paste0(location, "schools_changing_features_", year, ".csv"), stringsAsFactors=F) #school progress feature 55 | schools_aggregate <- read.csv(paste0(location, "schools_aggregate_features_", year, ".csv"), stringsAsFactors=F) #school aggregate feature 56 | schools_change <- merge(schools_progress, schools_aggregate, by="SchoolID", all.x=T) #school changing feature 57 | schools_change$SchoolKey <- NULL 58 | schools_change[is.na(schools_change$White),]$White <- median(schools_change$White, na.rm=T) #impute race 59 | schools_change[is.na(schools_change$Black),]$Black <- median(schools_change$Black, na.rm=T) #impute race 60 | schools_change[is.na(schools_change$Hispanic),]$Hispanic <- median(schools_change$Hispanic, na.rm=T) #impute racex 61 | schools_change$Other <- pmax(0, 100-rowSums(schools_change[,c("White", "Black", "Hispanic")])) #other race 62 | enroll <- read.csv(paste0(location, "true", year-1, ".csv"), stringsAsFactors=F) 63 | enroll9 <- subset(enroll, select=c("SchoolID", "Grade9")) 64 | colnames(enroll9) <- c("SchoolID", "PreviousEnrollment") 65 | cat("Finish Reading Data...\n") 66 
#Inner Join to Obtain Final Student and School Data
schools <- subset(schools_static, select=c(SchoolKey, SchoolID, SchoolType, SchoolSubType, SchoolStyle, Longitude, Latitude))
colnames(schools) <- c("SchoolKey", "SchoolIDES", "SchoolTypeES", "SchoolSubTypeES", "SchoolStyleES", "LongitudeES", "LatitudeES") #elementary schools
students_schools <- merge(students, schools, all.x=TRUE, by.x="ThisGradeSchoolKey", by.y="SchoolKey")
students_final <- students_schools #final students data
colnames(schools) <- c("SchoolKey", "SchoolIDHS", "SchoolTypeHS", "SchoolSubTypeHS", "SchoolStyleHS", "LongitudeHS", "LatitudeHS") #high schools
students_schools <- merge(students_schools, schools, all.x=TRUE, by.x="NextGradeSchoolKey", by.y="SchoolKey")
students_schools$DistanceHomeHS <- with(students_schools, rdist_earth_vec(cbind(LongitudeHome, LatitudeHome), cbind(LongitudeHS, LatitudeHS)))
#candidate high schools: keep only schools with at least two students at a known distance
possible_schools <- tbl_df(students_schools) %>%
  group_by(NextGradeSchoolKey) %>%
  summarize(n=n(), AvgDist=mean(DistanceHomeHS, na.rm=TRUE), n_dist=sum(!is.na(DistanceHomeHS))) %>%
  filter(n_dist>1) %>%
  arrange(desc(n_dist)) #remove NA distance schools
if (is.finite(num)) {
  possible_schools <- possible_schools[1:num,] #sub-sampling
}
students_final <- subset(students_final, NextGradeSchoolKey%in%possible_schools$NextGradeSchoolKey)
schools_final <- subset(schools, SchoolKey%in%possible_schools$NextGradeSchoolKey)
schools_final <- merge(schools_final, enroll9, by.x="SchoolIDHS", by.y="SchoolID", all.x=TRUE)
schools_final <- merge(schools_final, schools_change, by.x="SchoolIDHS", by.y="SchoolID", all.x=TRUE)
schools_final <- within(schools_final, {
  PreviousEnrollment[is.na(PreviousEnrollment)] <- 0
  NewSchool <- as.numeric(PreviousEnrollment == 0) #schools with no prior enrollment on record
})
#Outer Join (cross join, by=NULL) to Obtain Data for Conditional Logistic Regression:
#one row per (student, candidate school) pair
students_schools_final <- merge(unique(students_final), unique(schools_final), by=NULL, all=TRUE)
cat("Finish Constructing Table...\n")
#Feature Computation and Missing Value
#1. Distance (median-imputed). BUG FIX: the original `df[is.na(...),]$col <- v`
#form errors with "replacement has 1 row, data has 0" when no rows are NA;
#index the column vector directly instead.
students_schools_final$DistanceHomeHS <- with(students_schools_final, rdist_earth_vec(cbind(LongitudeHome, LatitudeHome), cbind(LongitudeHS, LatitudeHS)))
median_distance <- median(students_schools_final$DistanceHomeHS, na.rm=TRUE)
students_schools_final$DistanceHomeHS[is.na(students_schools_final$DistanceHomeHS)] <- median_distance
students_schools_final$DistanceESHS <- with(students_schools_final, rdist_earth_vec(cbind(LongitudeES, LatitudeES), cbind(LongitudeHS, LatitudeHS)))
median_distance <- median(students_schools_final$DistanceESHS, na.rm=TRUE)
students_schools_final$DistanceESHS[is.na(students_schools_final$DistanceESHS)] <- median_distance
#2. School Feature
students_schools_final <- within(students_schools_final, {
  SchoolTypeES[is.na(SchoolTypeES)] <- "Unknown"
  SchoolSubTypeES[is.na(SchoolSubTypeES)] <- "Unknown"
  SchoolStyleES[is.na(SchoolStyleES)] <- "Unknown"
  SchoolTypeHS[is.na(SchoolTypeHS)] <- "Unknown"
  SchoolSubTypeHS[is.na(SchoolSubTypeHS)] <- "Unknown"
  SchoolStyleHS[is.na(SchoolStyleHS)] <- "Unknown"
  SchoolTypeMatch <- as.numeric(SchoolTypeES==SchoolTypeHS)
  SchoolSubTypeMatch <- as.numeric(SchoolSubTypeES==SchoolSubTypeHS)
  SchoolStyleMatch <- as.numeric(SchoolStyleES==SchoolStyleHS)
  SchoolTypeHS <- factor(SchoolTypeHS)
  SchoolSubTypeHS <- factor(SchoolSubTypeHS)
  SchoolStyleHS <- factor(SchoolStyleHS)
  SelectiveEnrollment <- as.numeric(SchoolStyleHS=="Selective Enrollment")
  #lump rare categories into "Other" before converting to factors
  Race[Race %in% c("American Indian", "Hawaiian or Pacific Islander", "Multi", "N/A")] <- "Other"
  Race <- factor(Race)
  Language[!(Language %in% c("English", "Spanish", "Polish"))] <- "Other"
  Language <- factor(Language)
  BirthCountry[!(BirthCountry %in% c("United States", "Mexico", "China"))] <- "Other"
  BirthCountry <- factor(BirthCountry)
  Food <- factor(Food)
  ESL[is.na(ESL)] <- "Unknown"
  ESL <- factor(ESL)
  SchoolChange[is.na(SchoolChange)] <- 0
  SchoolGPA[is.na(SchoolGPA)] <- median(SchoolGPA, na.rm=TRUE)
})
#3. ISAT (average of 8th-grade math and reading; median-imputed)
students_schools_final$AvgScore <- (students_schools_final$EighthMathISAT+students_schools_final$EighthReadingISAT)/2
median_ISAT <- median(students_schools_final$AvgScore, na.rm=TRUE)
students_schools_final$AvgScore[is.na(students_schools_final$AvgScore)] <- median_ISAT
#4. Progress Report
students_schools_final <- within(students_schools_final, {
  Rating[is.na(Rating)] <- "Unknown"
  Safety[is.na(Safety)] <- "Unknown"
  Probation[is.na(Probation)] <- "Unknown"
  Rating <- factor(Rating)
  RatingLevel1 <- as.numeric(Rating=="level 1")
  Safety <- factor(Safety)
  Probation <- factor(Probation)
  Mobility[is.na(Mobility)] <- median(Mobility, na.rm=TRUE)
})
#5. Other Features
students_schools_final <- within(students_schools_final, {
  SchoolSelected <- as.numeric(NextGradeSchoolKey==SchoolKey) #response: did the student pick this school?
  SameAsPreviousSchool <- ifelse(is.na(ThisGradeSchoolKey), 0, as.numeric(ThisGradeSchoolKey==SchoolKey))
  CatchmentSchool <- as.numeric(CatchmentSchoolKey==SchoolKey)
  #NOTE(review): SchoolGPA was already imputed in step 2, so this line is a
  #no-op as written; kept for fidelity — confirm whether median(GPA) was intended
  SchoolGPA[is.na(SchoolGPA)] <- median(GPA, na.rm=TRUE)
  SchoolAttendanceRate[is.na(SchoolAttendanceRate)] <- median(SchoolAttendanceRate, na.rm=TRUE)
  AttendanceRate[is.na(AttendanceRate)] <- median(AttendanceRate, na.rm=TRUE)
  #share of the candidate school's students matching this student's demographics
  PercentSameRace <- NA
  PercentSameRace[Race=="Black, Non-Hispanic"] <- Black[Race=="Black, Non-Hispanic"]
  PercentSameRace[Race=="Hispanic"] <- Hispanic[Race=="Hispanic"]
  #BUG FIX: the original indexed White by the character string
  #(White["White, Non-Hispanic"]), which yields NA; use the logical mask
  PercentSameRace[Race=="White, Non-Hispanic"] <- White[Race=="White, Non-Hispanic"]
  PercentSameRace[is.na(PercentSameRace)] <- Other[is.na(PercentSameRace)]
  PercentSameFood <- NA
  PercentSameFood[Food=="Yes"] <- SchoolFood[Food=="Yes"]
  PercentSameFood[Food!="Yes"] <- 1-SchoolFood[Food!="Yes"]
  PercentSameESL <- NA
  PercentSameESL[ESL=="Yes"] <- SchoolESL[ESL=="Yes"]
  PercentSameESL[ESL!="Yes"] <- 1-SchoolESL[ESL!="Yes"]
  PercentSameMale <- NA
  PercentSameMale[Gender=="Male"] <- SchoolMale[Gender=="Male"]
  PercentSameMale[Gender!="Male"] <- 1-SchoolMale[Gender!="Male"]
  PercentSameHomeless <- NA
  PercentSameHomeless[Homeless=="Yes"] <- SchoolHomeless[Homeless=="Yes"]
  PercentSameHomeless[Homeless!="Yes"] <- 1-SchoolHomeless[Homeless!="Yes"]
})
cat("Finish Data Cleaning...\n")
return(students_schools_final)
}

####Make Prediction####
#Convert the linear predictor from conditional logistic regression into
#per-student choice probabilities.
#prediction: the prediction value (linear predictor) from conditional logistic regression
#test_data: data for test (one row per student-school pair)
#returns a wide matrix: one row per StudentKey, one column per SchoolKey,
#each row summing to 1 (softmax over the student's choice set)
make_prediction <- function(prediction, test_data) {
  test <- data.frame(StudentKey=test_data$StudentKey,
                     SchoolKey=test_data$SchoolKey,
                     Pred=exp(as.numeric(prediction)))
  pred_mat <- dcast(test, StudentKey~SchoolKey, mean, value.var="Pred")
  pred_mat[,-1] <- pred_mat[,-1]/rowSums(pred_mat[,-1]) #normalize within each student
  return(pred_mat)
}

####Reshape Probability Into Enrollment####
#Sum each school's choice probabilities over students to get expected enrollment.
#prob: probability matrix from conditional logistic regression (col names are "X<SchoolKey>")
reshape_enroll <- function(prob) {
  data <- round(colSums(prob[,-1]))
  school_key <- as.numeric(str_replace(names(data), "X", "")) #strip dcast's "X" prefix
  enroll <- as.numeric(data)
  school_enroll <- data.frame(SchoolKey=school_key, Enrollment=enroll)
  return(school_enroll)
}

####Train Conditional Logistic Regression####
#Fit an L1-penalized conditional logit on last year's choices and predict
#this year's choice probabilities.
#regression_formula: model formula as a string (response SchoolSelected)
#data_last_year: training student-school pairs; data_this_year: pairs to score
clogit <- function(regression_formula, data_last_year, data_this_year) {
  cat(paste0("Formula: ", regression_formula, "\n"))
  feature <- model.matrix(as.formula(regression_formula), data_last_year)[,-1]
  feature_test <- model.matrix(as.formula(regression_formula), data_this_year)[,-1]
  #make sure the features are matched between train and test
  feature <- feature[, colnames(feature)%in%colnames(feature_test), drop=FALSE]
  feature <- feature[, apply(feature, 2, sd)>0, drop=FALSE] #drop constant columns
  #BUG FIX: the test matrix must be restricted to the surviving training
  #columns (same set, same order) before scaling, otherwise scale() and the
  #matrix product below fail or silently misalign when the column sets differ
  feature_test <- feature_test[, colnames(feature), drop=FALSE]
  feature_means <- colMeans(feature)
  feature_sds <- apply(feature, 2, sd)
  feature <- scale(feature, center=feature_means, scale=feature_sds)
  feature_test <- scale(feature_test, center=feature_means, scale=feature_sds)
  label <- data_last_year$SchoolSelected
  start <- proc.time()
  model <- clogitL1(x=feature, y=label, strata=data_last_year$StudentKey, alpha=1)
  model_cv <- cv.clogitL1(model, numFolds=5)
  end <- proc.time()
  time <- end-start
  cat(paste0("Training Takes ", as.numeric(time["elapsed"])/3600, " Hours...\n"))
  #Predict: use the CV-selected coefficients; fall back to the 1-SE rule if NaN
  coefficient <- summary(model_cv)$beta_minCV
  if (is.nan(sum(coefficient))) {
    coefficient <- summary(model_cv)$beta_minCV1se
    if (is.nan(sum(coefficient))) {
      cat("We Cannot Make Prediction!\n")
      return()
    }
  }
  prediction <- feature_test%*%coefficient
  prob <- make_prediction(prediction, data_this_year)
  return(prob)
}

####Construct Baseline: Copy Last Year Enrollment####
#For each school enrolled this year, copy last year's enrollment; schools with
#no history (new schools) split the remaining students evenly.
#last_enroll/this_enroll: data.frames with SchoolKey, Enrollment
#new_school: new school keys (kept for interface compatibility; newness is
#detected directly from missing history)
copy_last_year <- function(last_enroll, this_enroll, new_school) {
  baseline <- data.frame(SchoolKey=this_enroll$SchoolKey, Enrollment=NA)
  #vectorized lookup replaces the original per-row which() loop
  baseline$Enrollment <- last_enroll$Enrollment[match(baseline$SchoolKey, last_enroll$SchoolKey)]
  missing <- is.na(baseline$Enrollment)
  if (any(missing)) {
    leftover <- sum(this_enroll$Enrollment)-sum(baseline$Enrollment, na.rm=TRUE)
    baseline$Enrollment[missing] <- max(0, round(leftover/sum(missing)))
  }
  return(baseline)
}

####Conditional Logistic Regression####
#Top-level driver: fits four nested conditional-logit models, a copy-last-year
#baseline, and an ensemble, then writes all predictions to CSV.
#location: data source directory
#year: the year whose grade-9 enrollment is predicted (trained on year-1)
grade9_prediction <- function(location, year) {
  cat("Process Last Year Data...\n")
  data_last_year <- construct_data(location, year-1, Inf)
  cat("Process This Year Data...\n")
  data_this_year <- construct_data(location, year, Inf)
  #Construct Design Matrix
  cat("Creating Model Matrices...\n")
  #Train/Predict
  #1. Distance
  regression_formula <- "SchoolSelected~ns(DistanceHomeHS,2)"
  prob_d <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_d <- reshape_enroll(prob_d)
  #2. Distance+Catchment
  regression_formula <- "SchoolSelected~ns(DistanceHomeHS,2)+CatchmentSchool"
  prob_dc <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_dc <- reshape_enroll(prob_dc)
  #3. Distance+Catchment+Rating
  regression_formula <- "SchoolSelected~Rating*ns(DistanceHomeHS,2)+CatchmentSchool"
  prob_dcr <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_dcr <- reshape_enroll(prob_dcr)
  #4. All
  regression_formula <- "SchoolSelected~Rating*ns(DistanceHomeHS,2)+ns(PercentSameRace,2)+ns(PercentSameESL,2)+ns(PercentSameMale,2)+ns(Mobility,2)+ns(SchoolAttendanceRate,2)+CatchmentSchool+SchoolTypeMatch+PreviousEnrollment"
  prob_all <- clogit(regression_formula, data_last_year, data_this_year)
  enrollment_all <- reshape_enroll(prob_all)
  cat("Finish Conditional Logit Prediction ^_^Y\n")
  #Baseline
  new_school <- setdiff(unique(data_this_year$SchoolKey), unique(data_last_year$SchoolKey))
  this_year <- dcast(data_this_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected")[,-1]
  this_enroll <- data.frame(SchoolKey=as.numeric(names(this_year)), Enrollment=round(colSums(this_year)))
  last_year <- dcast(data_last_year, StudentKey~SchoolKey, mean, value.var="SchoolSelected")[,-1]
  last_enroll <- data.frame(SchoolKey=as.numeric(names(last_year)), Enrollment=round(colSums(last_year)))
  baseline <- copy_last_year(last_enroll, this_enroll, new_school)
  cat("Finish Baseline Prediction ^_^Y\n")
  #Ensemble: clogit predictions for new schools, baseline shares (rescaled to
  #this year's remaining total) for existing schools
  total_new_this_year <- with(enrollment_all, sum(Enrollment[SchoolKey%in%new_school]))
  total_old_this_year <- sum(this_enroll$Enrollment)-total_new_this_year
  proportion <- with(baseline, {Enrollment[!SchoolKey%in%new_school]/sum(Enrollment[!SchoolKey%in%new_school])})
  ensemble <- within(enrollment_all, {Enrollment[!SchoolKey%in%new_school] <- round(total_old_this_year*proportion)})
  cat("Finish Ensemble Prediction ^_^Y\n")
  #Save Files
  save(data_last_year, file=paste0("clogit_", year, "_data_", year-1, ".RData")) #last year data
  save(data_this_year, file=paste0("clogit_", year, "_data_", year, ".RData")) #this year data
  write.csv(x=prob_d, file=paste0("clogit_", year, "_prob_d.csv"), row.names=FALSE) #distance
  write.csv(x=prob_dc, file=paste0("clogit_", year, "_prob_dc.csv"), row.names=FALSE) #distance+catchment
  write.csv(x=prob_dcr, file=paste0("clogit_", year, "_prob_dcr.csv"), row.names=FALSE) #distance+catchment+rating
  write.csv(x=prob_all, file=paste0("clogit_", year, "_prob_all.csv"), row.names=FALSE) #all features
  write.csv(x=enrollment_d, file=paste0("clogit_", year, "_prediction_d.csv"), row.names=FALSE) #distance
  write.csv(x=enrollment_dc, file=paste0("clogit_", year, "_prediction_dc.csv"), row.names=FALSE) #distance+catchment
  write.csv(x=enrollment_dcr, file=paste0("clogit_", year, "_prediction_dcr.csv"), row.names=FALSE) #distance+catchment+rating
  write.csv(x=enrollment_all, file=paste0("clogit_", year, "_prediction_all.csv"), row.names=FALSE) #all features
  write.csv(x=baseline, file=paste0("baseline_", year, "_prediction_all.csv"), row.names=FALSE) #baseline
  write.csv(x=ensemble, file=paste0("ensemble_", year, "_prediction_all.csv"), row.names=FALSE) #final model
  cat("Finish Saving Files ^_^Y\n")
  return(prob_all)
}
-------------------------------------------------------------------------------- /Tool_Box/main.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ################################# 4 | ####Main Script For Modelling#### 5 | ################################# 6 | 7 | ####CPS Team#### 8 | ####08/17/2014#### 9 | 10 | rm(list=ls()) 11 | setwd("/home/zye/Model/") 12 | 13 | ####Cohort Survival Model#### 14 | source("cohort_survival.R") 15 | location <- "/mnt/data/cps/Data/" 16 | year_to_predict <- 2013 17 | result <- cohort_survival(location, year_to_predict) 18 | 19 | ####Model Diagnostics#### 20 | source("model_diagnostics.R") 21 | location <- "/mnt/data/cps/Data/" 22 | year_to_train <- 2012 23 | threshold <- 50 24 | grade <- "Grade9" 25 | model <- diagnostic(location, year, threshold, grade) 26 | year_to_predict <- 2013 27 | result <- prediction(location, year, threshold, grade, model) 28 | 29 | ####Conditional Logistic Regression#### 30 | source("grade9_prediction.R") 31 | location <- "/mnt/data/cps/Data/" 32 | year_to_predict <- 2012 33 | result <- grade9_prediction(location, year_to_predict) 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /Tool_Box/model_diagnostics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | 3 | ######################### 4 | ####Error Diagnostics#### 5 | ######################### 6 | 7 | ####CPS Team#### 8 | ####08/19/2014#### 9 | 10 | ####Packages#### 11 | suppressMessages(library(rpart)) 12 | suppressMessages(library(rpart.plot)) 13 | 14 | ####Construct Data for Modeling#### 15 | #prediction: cohort survival prediction 16 | #enrollment: true enrollment data 17 | #changing_feature: school features changing every year 18 | #static_feature: school features not changing every year 19 | #aggregate_feature: school features aggregated from the database 20 | 
#grade: the grade we want to classify
#Builds one row per predicted school: joins the cohort-survival prediction
#with last year's enrollment and all school feature tables, then imputes
#missing values (0 for enrollment, explicit NA level for categoricals,
#median for numerics).
construct_data <- function(prediction, enrollment, changing_feature, static_feature, aggregate_feature, grade) {
  pred <- data.frame(SchoolID=prediction$SchoolID, Prediction=prediction[,grade])
  enroll <- data.frame(SchoolID=enrollment$SchoolID, PreviousEnrollment=enrollment[,grade])
  group <- with(static_feature, data.frame(SchoolID, SchoolType, SchoolStyle, SchoolGradeGroup))
  #left joins keep every school we have a prediction for
  data <- merge(pred, enroll, by="SchoolID", all.x=TRUE)
  data <- merge(data, group, by="SchoolID", all.x=TRUE)
  data <- merge(data, changing_feature, by="SchoolID", all.x=TRUE)
  data <- merge(data, aggregate_feature, by="SchoolID", all.x=TRUE)
  data <- within(data, {
    PreviousEnrollment[is.na(PreviousEnrollment)] <- 0 #school absent last year => 0
    #exclude=NULL keeps NA as an explicit factor level
    SchoolType <- factor(SchoolType, exclude=NULL)
    SchoolStyle <- factor(SchoolStyle, exclude=NULL)
    SchoolGradeGroup <- factor(SchoolGradeGroup, exclude=NULL)
    White[is.na(White)] <- median(White, na.rm=TRUE)
    Black[is.na(Black)] <- median(Black, na.rm=TRUE)
    Hispanic[is.na(Hispanic)] <- median(Hispanic, na.rm=TRUE)
    AUSL <- factor(AUSL, exclude=NULL)
    Mobility[is.na(Mobility)] <- median(Mobility, na.rm=TRUE)
    Rating <- factor(Rating, exclude=NULL)
    Probation <- factor(Probation, exclude=NULL)
    Health <- factor(Health, exclude=NULL)
    Safety <- factor(Safety, exclude=NULL)
    Family <- factor(Family, exclude=NULL)
    Environment <- factor(Environment, exclude=NULL)
    Instruction <- factor(Instruction, exclude=NULL)
    Leader <- factor(Leader, exclude=NULL)
    Teacher <- factor(Teacher, exclude=NULL)
    SchoolFood[is.na(SchoolFood)] <- median(SchoolFood, na.rm=TRUE)
    SchoolESL[is.na(SchoolESL)] <- median(SchoolESL, na.rm=TRUE)
    SchoolMale[is.na(SchoolMale)] <- median(SchoolMale, na.rm=TRUE)
    SchoolHomeless[is.na(SchoolHomeless)] <- median(SchoolHomeless, na.rm=TRUE)
    SchoolGPA[is.na(SchoolGPA)] <- median(SchoolGPA, na.rm=TRUE)
    SchoolAttendanceRate[is.na(SchoolAttendanceRate)] <- median(SchoolAttendanceRate, na.rm=TRUE)
  })
  return(data)
}

####Train Decision Tree Model####
#Labels each school 1 if the cohort-survival projection missed the true
#enrollment by >= threshold, then fits a decision tree to explain which
#school features predict misprojection.
#location: data source
#year: year to train
#threshold: >= threshold -> 1; otherwise 0
#grade: the grade column to model
#returns the fitted rpart model (also plotted via prp), or NULL if every
#school falls on one side of the threshold
diagnostic <- function(location, year, threshold, grade) {
  prediction_train <- read.csv(paste0(location, "csp", year, ".csv"))
  enrollment_train <- read.csv(paste0(location, "true", year-1, ".csv"))
  change_train <- read.csv(paste0(location, "schools_changing_features_", year, ".csv"), stringsAsFactors=FALSE)
  aggregate_train <- read.csv(paste0(location, "schools_aggregate_features_", year, ".csv"), stringsAsFactors=FALSE)
  static_train <- read.csv(paste0(location, "schools_static_features.csv"), stringsAsFactors=FALSE)
  true_enrollment <- read.csv(paste0(location, "true", year, ".csv"))
  data_train_temp <- construct_data(prediction_train, enrollment_train, change_train, static_train, aggregate_train, grade)
  enroll <- data.frame(SchoolID=true_enrollment$SchoolID, CurrentEnrollment=true_enrollment[,grade])
  data_train <- merge(data_train_temp, enroll, by="SchoolID") #inner join: need the truth
  data_train <- within(data_train, {
    Label <- rep(0, nrow(data_train))
    Label[abs(CurrentEnrollment-Prediction)>=threshold] <- 1 #1 = misprojected school
    Label <- factor(Label)
  })
  #a single-class label cannot be modeled
  if (length(unique(data_train$Label))==1) {
    cat("Threshold is not appropriate!\n")
    return()
  }
  model <- rpart(formula=Label~Prediction+PreviousEnrollment+
                   SchoolType+SchoolStyle+SchoolGradeGroup+White+
                   Black+Hispanic+Mobility+Rating+Probation+
                   SchoolESL+SchoolMale+SchoolAttendanceRate,
                 data=data_train)
  print(prp(x=model, type=3)) #visualize the fitted tree
  return(model)
}

####Make Prediction####
#Scores this year's schools with the trained tree and writes each school's
#probability of being misprojected to misprojection_<year>.csv.
#location: data source
#year: year to test
#threshold: >= threshold -> 1; otherwise 0 (kept for interface symmetry)
#model: decision tree model from diagnostic()
prediction <- function(location, year, threshold, grade, model) {
  prediction_test <- read.csv(paste0(location, "csp", year, ".csv"))
  enrollment_test <- read.csv(paste0(location, "true", year-1, ".csv"))
  change_test <- read.csv(paste0(location, "schools_changing_features_", year, ".csv"), stringsAsFactors=FALSE)
  aggregate_test <- read.csv(paste0(location, "schools_aggregate_features_", year, ".csv"), stringsAsFactors=FALSE)
  static_test <- read.csv(paste0(location, "schools_static_features.csv"), stringsAsFactors=FALSE)
  data_test <- construct_data(prediction_test, enrollment_test, change_test, static_test, aggregate_test, grade)
  result <- as.data.frame(predict(object=model, newdata=data_test, type="prob"))
  output <- data.frame(SchoolID=data_test$SchoolID, Probability=result[,"1"]) #P(misprojection)
  write.csv(x=output, file=paste0("misprojection_", year, ".csv"), row.names=FALSE)
  return(output)
}

--------------------------------------------------------------------------------