├── NEWS.md ├── R ├── sysdata.rda ├── OpenSDPsynthR-package.r ├── zzz.R ├── gamma_gk.R ├── diagnostics.R ├── markov_transitions.R ├── CEDS.R ├── baselines.R └── utils.R ├── .gitignore ├── tests ├── testthat.R └── testthat │ ├── test-all.R │ ├── test-validators.R │ ├── test-age_calc.R │ ├── test-gen.R │ ├── test-markov.R │ ├── test-mapCEDS.R │ └── test-baseline.R ├── tools ├── figs │ ├── open_sdp_logo_red.png │ ├── README-ellDiagnostic-1.png │ ├── README-visualdiagnostics-1.png │ └── README-visualdiagnostics-2.png └── example_collab.Rmd ├── vignettes ├── tools │ └── figs │ │ ├── README-ellDiagnostic-1.png │ │ ├── README-unnamed-chunk-10-1.png │ │ ├── README-unnamed-chunk-12-1.png │ │ ├── README-unnamed-chunk-14-1.png │ │ ├── README-unnamed-chunk-16-1.png │ │ ├── README-unnamed-chunk-4-1.png │ │ ├── README-unnamed-chunk-6-1.png │ │ ├── README-unnamed-chunk-8-1.png │ │ ├── README-visualdiagnostics-1.png │ │ └── README-visualdiagnostics-2.png ├── sim_control.Rmd └── sim_diagnostics.Rmd ├── .Rbuildignore ├── data-raw ├── program_baseline.csv ├── grade_matrix.csv ├── age_grade_baseline.csv ├── ellDist.csv ├── CEDS_SDP_map.csv └── build_data.R ├── man ├── OpenSDPsynthR.Rd ├── m_sum.Rd ├── namedList.Rd ├── zeroNA.Rd ├── assert.Rd ├── rescale_gpa.Rd ├── convert_grade.Rd ├── get_baseline.Rd ├── gen_ontrack.Rd ├── diag_offset.Rd ├── get_sim_groupvars.Rd ├── gen_gpa.Rd ├── sdp_cleaner.Rd ├── gen_assess.Rd ├── get_code_values.Rd ├── rescale.Rd ├── gen_ps.Rd ├── num_grade.Rd ├── num_clip.Rd ├── gen_annual_gpa.Rd ├── gen_credits.Rd ├── expand_grid_df.Rd ├── gen_student_years.Rd ├── map_CEDS.Rd ├── unscale.Rd ├── gen_hs_annual.Rd ├── assign_grade.Rd ├── assign_hs_outcomes.Rd ├── gen_annual_status.Rd ├── gen_grad.Rd ├── validate_probability_list.Rd ├── recode_options.Rd ├── validate_sim_parameter.Rd ├── assign_baseline.Rd ├── gen_initial_status.Rd ├── gen_nsc.Rd ├── tm_convert.Rd ├── grade_transitions.Rd ├── recode_credits.Rd ├── gen_students.Rd ├── tidy_sequence.Rd ├── assign_schools.Rd ├── gen_ps_enrollment.Rd ├── rand_vect_cont.Rd ├── simpop.Rd ├── ceds_cleaner.Rd ├── group_rescale.Rd ├── make_inds.Rd ├── cond_prob.Rd ├── school_transitions.Rd ├── markov_cond_list.Rd ├── make_binary_series.Rd ├── gen_schools.Rd ├── better_sim.lm.Rd ├── gamma_GK.Rd ├── fit_series.Rd ├── gen_outcome_model.Rd ├── make_markov_series.Rd ├── age_calc.Rd └── sim_control.Rd ├── todo.md ├── OpenSDP.data.Rproj ├── DESCRIPTION ├── CONTRIBUTING.md ├── LICENSE ├── NAMESPACE ├── CONDUCT.md ├── README.Rmd ├── README.md └── inst └── CG_Analyze_Example.Rmd /NEWS.md: -------------------------------------------------------------------------------- 1 | ## Release - beta 2 | 3 | * rollout 4 | -------------------------------------------------------------------------------- /R/sysdata.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/R/sysdata.rda -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | *.dll 7 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(OpenSDPsynthR) 3 | 4 | test_check("OpenSDPsynthR") 5 | -------------------------------------------------------------------------------- /tools/figs/open_sdp_logo_red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/open_sdp_logo_red.png -------------------------------------------------------------------------------- /tools/figs/README-ellDiagnostic-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/README-ellDiagnostic-1.png -------------------------------------------------------------------------------- /tools/figs/README-visualdiagnostics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/README-visualdiagnostics-1.png -------------------------------------------------------------------------------- /tools/figs/README-visualdiagnostics-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/README-visualdiagnostics-2.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-ellDiagnostic-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-ellDiagnostic-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-visualdiagnostics-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-visualdiagnostics-1.png -------------------------------------------------------------------------------- /vignettes/tools/figs/README-visualdiagnostics-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-visualdiagnostics-2.png -------------------------------------------------------------------------------- /R/OpenSDPsynthR-package.r: -------------------------------------------------------------------------------- 1 | #' OpenSDPsynthR 2 | #' 3 | #' @name OpenSDPsynthR 4 | #' @description A package to generate OpenSDP data 5 | #' @docType package 6 | NULL 7 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^CONDUCT\.md$ 5 | ^README\.Rmd$ 6 | ^README-.*\.png$ 7 | ^data-raw$ 8 | ^todo.md$ 9 | ^demo.R$ 10 | ^erratta.R$ 11 | -------------------------------------------------------------------------------- /data-raw/program_baseline.csv: -------------------------------------------------------------------------------- 1 | ELL,IEP,FRPL,count,prob 2 | 0,0,0,56838,55.00 3 | 0,0,1,6078,3.7 4 | 0,0,2,40026,25.93 5 | 0,1,0,8178,4.44 6 | 0,1,1,1200,0.73 7 | 0,1,2,10223,5.4 8 | 1,0,0,1089,0.64 9 | 1,0,1,360,0.18 10 | 1,0,2,4411,3.41 11 | 1,1,0,121,0.09 12 | 1,1,1,62,0.05 13 | 1,1,2,809,0.43 14 | -------------------------------------------------------------------------------- /man/OpenSDPsynthR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/OpenSDPsynthR-package.r 3 | \docType{package} 4 | \name{OpenSDPsynthR} 5 | \alias{OpenSDPsynthR} 6 | \alias{OpenSDPsynthR-package} 7 | \title{OpenSDPsynthR} 8 | \description{ 9 | A package to generate OpenSDP data 10 | } 11 | -------------------------------------------------------------------------------- /man/m_sum.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{m_sum} 4 | \alias{m_sum} 5 | \title{Sum matrix elements in a list} 6 | \usage{ 7 | m_sum(l) 8 | } 9 | \arguments{ 10 | \item{l}{a list} 11 | } 12 | \value{ 13 | a list of summed matrix elements 14 | } 15 | \description{ 16 | Sum matrix elements in a list 17 | } 18 | -------------------------------------------------------------------------------- /man/namedList.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_control.R 3 | \name{namedList} 4 | \alias{namedList} 5 | \title{Create a named list} 6 | \usage{ 7 | namedList(...) 8 | } 9 | \arguments{ 10 | \item{...}{arguments to pass to the list} 11 | } 12 | \value{ 13 | a named list 14 | } 15 | \description{ 16 | Create a named list 17 | } 18 | -------------------------------------------------------------------------------- /todo.md: -------------------------------------------------------------------------------- 1 | ## TO-DO 2 | 3 | - unit tests for map_CEDS like functions + error handling 4 | - unit tests for all user-level generator functions 5 | - unit tests for numeric accuracy 6 | X - Create baselines for `frpl`, `iep`, `gifted`, `ell`, and `grade` 7 | X - Define input requirements for sequence generation 8 | X - Define process for generating sequences with and without initial states 9 | X - Define diagnostics 10 | -------------------------------------------------------------------------------- /man/zeroNA.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{zeroNA} 4 | \alias{zeroNA} 5 | \title{Convert NA values to 0 in a vector} 6 | \usage{ 7 | zeroNA(x) 8 | } 9 | \arguments{ 10 | \item{x}{a numeric vector containing NAs} 11 | } 12 | \value{ 13 | a numeric vector where all NA values are 0 14 | } 15 | \description{ 16 | Convert NA values to 0 in a vector 17 | } 18 | -------------------------------------------------------------------------------- /man/assert.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{assert} 4 | \alias{assert} 5 | \title{Simple error assertions} 6 | \usage{ 7 | assert(expr, error) 8 | } 9 | \arguments{ 10 | \item{expr}{logical expression} 11 | 12 | \item{error}{character, error message} 13 | } 14 | \value{ 15 | An error message, or nothing 16 | } 17 | \description{ 18 | Simple error assertions 19 | } 20 | -------------------------------------------------------------------------------- /man/rescale_gpa.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{rescale_gpa} 4 | \alias{rescale_gpa} 5 | \title{Rescale scaled GPA to be on 0-4 scale} 6 | \usage{ 7 | rescale_gpa(x) 8 | } 9 | \arguments{ 10 | \item{x}{a scaled normal variable} 11 | } 12 | \value{ 13 | a GPA rounded to tenths, from 0 to 1 14 | } 15 | \description{ 16 | Rescale scaled GPA to be on 0-4 scale 17 | } 18 | -------------------------------------------------------------------------------- /man/convert_grade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{convert_grade} 4 | \alias{convert_grade} 5 | \title{Convert auto-generated grades to better grades} 6 | \usage{ 7 | convert_grade(x) 8 | } 9 | \arguments{ 10 | \item{x}{a vector of grades formatted with g} 11 | } 12 | \value{ 13 | a character vector of grades 14 | } 15 | \description{ 16 | Convert auto-generated grades to better grades 17 | } 18 | -------------------------------------------------------------------------------- /man/get_baseline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/baselines.R 3 | \name{get_baseline} 4 | \alias{get_baseline} 5 | \title{Function to load in SDP default baseline data} 6 | \usage{ 7 | get_baseline(bl) 8 | } 9 | \arguments{ 10 | \item{bl}{a character naming the type of baseline available} 11 | } 12 | \value{ 13 | the restored baseline object 14 | } 15 | \description{ 16 | Function to load in SDP default baseline data 17 | } 18 | -------------------------------------------------------------------------------- /man/gen_ontrack.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_ontrack} 4 | \alias{gen_ontrack} 5 | \title{Calculate on_track based on credits} 6 | \usage{ 7 | gen_ontrack(gpa_ontrack) 8 | } 9 | \arguments{ 10 | \item{gpa_ontrack}{a data.frame with cumulative credits} 11 | } 12 | \value{ 13 | \code{gpa_ontrack} with ontrack indicators appended 14 | } 15 | \description{ 16 | Calculate on_track based on credits 17 | } 18 | -------------------------------------------------------------------------------- /OpenSDP.data.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: XeLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man/diag_offset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/baselines.R 3 | \name{diag_offset} 4 | \alias{diag_offset} 5 | \title{Offset the diagonal values of a matrix} 6 | \usage{ 7 | diag_offset(matrix, offset = 1L) 8 | } 9 | \arguments{ 10 | \item{matrix}{a square matrix} 11 | 12 | \item{offset}{an integer value to offset the diagonal by} 13 | } 14 | \value{ 15 | a square matrix 16 | } 17 | \description{ 18 | Offset the diagonal values of a matrix 19 | } 20 | -------------------------------------------------------------------------------- /man/get_sim_groupvars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{get_sim_groupvars} 4 | \alias{get_sim_groupvars} 5 | \title{Get grouping variables} 6 | \usage{ 7 | get_sim_groupvars(control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{control}{a control list produced by \code{\link{sim_control}}} 11 | } 12 | \value{ 13 | a character vector of grouping terms in control lists 14 | } 15 | \description{ 16 | Get grouping variables 17 | } 18 | -------------------------------------------------------------------------------- /man/gen_gpa.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_gpa} 4 | \alias{gen_gpa} 5 | \title{Generate a final GPA for students} 6 | \usage{ 7 | gen_gpa(data, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{data}{a dataframe with variables} 11 | 12 | \item{control}{a sim_control parmeter, default is \code{sim_control}} 13 | } 14 | \value{ 15 | a numeric vector 16 | } 17 | \description{ 18 | Generate a final GPA for students 19 | } 20 | -------------------------------------------------------------------------------- /man/sdp_cleaner.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleaners.R 3 | \name{sdp_cleaner} 4 | \alias{sdp_cleaner} 5 | \title{Convert a population simulation into an analysis file} 6 | \usage{ 7 | sdp_cleaner(simouts) 8 | } 9 | \arguments{ 10 | \item{simouts}{result of the \code{simpop} function} 11 | } 12 | \value{ 13 | an analysis file as a single dataframe for HS outcomes 14 | } 15 | \description{ 16 | Convert a population simulation into an analysis file 17 | } 18 | -------------------------------------------------------------------------------- /man/gen_assess.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_assess} 4 | \alias{gen_assess} 5 | \title{Generate an assessment table} 6 | \usage{ 7 | gen_assess(data, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{data}{student-year data} 11 | 12 | \item{control}{output from \code{sim_control}} 13 | } 14 | \value{ 15 | a two-column dataframe with math and reading scores 16 | } 17 | \description{ 18 | Generate an assessment table 19 | } 20 | -------------------------------------------------------------------------------- /man/get_code_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CEDS.R 3 | \name{get_code_values} 4 | \alias{get_code_values} 5 | \title{Get values of codes from the CEDS Crosswalk List} 6 | \usage{ 7 | get_code_values(x) 8 | } 9 | \arguments{ 10 | \item{x}{the data.frame character element that contains the codes} 11 | } 12 | \value{ 13 | a list with the labels and levels properly formatted 14 | } 15 | \description{ 16 | Get values of codes from the CEDS Crosswalk List 17 | } 18 | -------------------------------------------------------------------------------- /man/rescale.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{rescale} 4 | \alias{rescale} 5 | \title{Reliabily rescale numerics with missingness} 6 | \usage{ 7 | rescale(x) 8 | } 9 | \arguments{ 10 | \item{x}{a numeric vector} 11 | } 12 | \value{ 13 | x mean centered and divided by it's standard deviation 14 | } 15 | \description{ 16 | Reliabily rescale numerics with missingness 17 | } 18 | \details{ 19 | If \code{sd(x)} is undefined, this returns a zero 20 | } 21 | -------------------------------------------------------------------------------- /man/gen_ps.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_ps} 4 | \alias{gen_ps} 5 | \title{Generate postsecondary enrollment outcome} 6 | \usage{ 7 | gen_ps(data, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{data}{cohort data} 11 | 12 | \item{control}{output from \code{sim_control}} 13 | } 14 | \value{ 15 | a two-column dataframe with probabilities and binary outcome 16 | } 17 | \description{ 18 | Generate postsecondary enrollment outcome 19 | } 20 | -------------------------------------------------------------------------------- /man/num_grade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{num_grade} 4 | \alias{num_grade} 5 | \title{Convert a character representation of grades to numeric} 6 | \usage{ 7 | num_grade(grade) 8 | } 9 | \arguments{ 10 | \item{grade}{a character vector of grades with character labels} 11 | } 12 | \value{ 13 | a numeric vector, length of grade, representing grade levels as numbers 14 | } 15 | \description{ 16 | Convert a character representation of grades to numeric 17 | } 18 | -------------------------------------------------------------------------------- /man/num_clip.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{num_clip} 4 | \alias{num_clip} 5 | \title{Clip a vector to be between a minimum and a maximum} 6 | \usage{ 7 | num_clip(x, min, max) 8 | } 9 | \arguments{ 10 | \item{x}{a numeric vector} 11 | 12 | \item{min}{numeric, a floor} 13 | 14 | \item{max}{numeric, a ceiling} 15 | } 16 | \value{ 17 | x, truncated to be between min and max 18 | } 19 | \description{ 20 | Clip a vector to be between a minimum and a maximum 21 | } 22 | -------------------------------------------------------------------------------- /man/gen_annual_gpa.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_annual_gpa} 4 | \alias{gen_annual_gpa} 5 | \title{Generate annual GPA sequence for students based on final GPA} 6 | \usage{ 7 | gen_annual_gpa(gpa_ontrack) 8 | } 9 | \arguments{ 10 | \item{gpa_ontrack}{a data.frame containing final GPA for a student} 11 | } 12 | \value{ 13 | \code{gpa_ontrack} appended with gpa by year 14 | } 15 | \description{ 16 | Generate annual GPA sequence for students based on final GPA 17 | } 18 | -------------------------------------------------------------------------------- /man/gen_credits.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_credits} 4 | \alias{gen_credits} 5 | \title{Generate credit sequence for students based on their final GPA} 6 | \usage{ 7 | gen_credits(gpa_ontrack) 8 | } 9 | \arguments{ 10 | \item{gpa_ontrack}{a data.frame containing final GPA for a student} 11 | } 12 | \value{ 13 | \code{gpa_ontrack} but appended with credits by year 14 | } 15 | \description{ 16 | Generate credit sequence for students based on their final GPA 17 | } 18 | -------------------------------------------------------------------------------- /man/expand_grid_df.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{expand_grid_df} 4 | \alias{expand_grid_df} 5 | \title{Expand dataframe into a complete grid} 6 | \usage{ 7 | expand_grid_df(...) 8 | } 9 | \arguments{ 10 | \item{...}{dataframe} 11 | } 12 | \value{ 13 | an expanded data frame 14 | } 15 | \description{ 16 | Expand dataframe into a complete grid 17 | } 18 | \details{ 19 | From SO {\url{http://stackoverflow.com/questions/11693599/alternative-to-expand-grid-for-data-frames}} 20 | } 21 | -------------------------------------------------------------------------------- /man/gen_student_years.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{gen_student_years} 4 | \alias{gen_student_years} 5 | \title{Generate annual student observations} 6 | \usage{ 7 | gen_student_years(data, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{data}{students to generate annual data for} 11 | 12 | \item{control}{a list, defined by \code{\link{sim_control}}} 13 | } 14 | \value{ 15 | a data.frame 16 | } 17 | \description{ 18 | Generate annual student observations 19 | } 20 | -------------------------------------------------------------------------------- /man/map_CEDS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CEDS.R 3 | \name{map_CEDS} 4 | \alias{map_CEDS} 5 | \title{Map to and from CEDS column names} 6 | \usage{ 7 | map_CEDS(user, category = NULL, CEDS = NULL) 8 | } 9 | \arguments{ 10 | \item{user}{character vector of variable names to match} 11 | 12 | \item{category}{character, a category of CEDS data to match} 13 | 14 | \item{CEDS}{optional} 15 | } 16 | \value{ 17 | mapped CEDS names 18 | } 19 | \description{ 20 | Map to and from CEDS column names 21 | } 22 | -------------------------------------------------------------------------------- /man/unscale.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{unscale} 4 | \alias{unscale} 5 | \title{Unscale a scaled variable} 6 | \usage{ 7 | unscale(x, mean, sd) 8 | } 9 | \arguments{ 10 | \item{x}{numeric vector that has been scaled} 11 | 12 | \item{mean}{a numeric, the mean to add to x} 13 | 14 | \item{sd}{a numeric, the standardized factor to divide x by} 15 | } 16 | \value{ 17 | x rescaled with mean and sd specified by the user 18 | } 19 | \description{ 20 | Unscale a scaled variable 21 | } 22 | -------------------------------------------------------------------------------- /man/gen_hs_annual.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_hs_annual} 4 | \alias{gen_hs_annual} 5 | \title{Generate annual HS outcomes} 6 | \usage{ 7 | gen_hs_annual(hs_outcomes, stu_year) 8 | } 9 | \arguments{ 10 | \item{hs_outcomes}{a data frame with final gpa, hs_status, and sid} 11 | 12 | \item{stu_year}{a data frame with student enrollment} 13 | } 14 | \value{ 15 | an expanded table with credits and gpa information 16 | } 17 | \description{ 18 | Generate annual HS outcomes 19 | } 20 | -------------------------------------------------------------------------------- /man/assign_grade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/baselines.R 3 | \name{assign_grade} 4 | \alias{assign_grade} 5 | \title{Assign student a grade} 6 | \usage{ 7 | assign_grade(age, ability) 8 | } 9 | \arguments{ 10 | \item{age}{age of the student in years} 11 | 12 | \item{ability}{a modifier that signifies student ability?} 13 | } 14 | \value{ 15 | a vector of grade levels 16 | } 17 | \description{ 18 | Assign student a grade 19 | } 20 | \examples{ 21 | age <- c(9, 10, 11, 12) 22 | assign_grade(age = age) 23 | } 24 | -------------------------------------------------------------------------------- /man/assign_hs_outcomes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{assign_hs_outcomes} 4 | \alias{assign_hs_outcomes} 5 | \title{Assign high school outcomes} 6 | \usage{ 7 | assign_hs_outcomes(data, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{data}{a dataframe with certain high school attributes} 11 | 12 | \item{control}{control parameters from the \code{sim_control()} function} 13 | } 14 | \value{ 15 | an outcome dataframe 16 | } 17 | \description{ 18 | Assign high school outcomes 19 | } 20 | -------------------------------------------------------------------------------- /man/gen_annual_status.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{gen_annual_status} 4 | \alias{gen_annual_status} 5 | \title{Generate annual status trajectories per student} 6 | \usage{ 7 | gen_annual_status(data, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{data}{student-year data} 11 | 12 | \item{control}{control list} 13 | } 14 | \value{ 15 | the \code{data} object, with additional variables appended 16 | } 17 | \description{ 18 | Generate annual status trajectories per student 19 | } 20 | -------------------------------------------------------------------------------- /man/gen_grad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_grad} 4 | \alias{gen_grad} 5 | \title{Generate a high school graduation for students} 6 | \usage{ 7 | gen_grad(data, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{data}{a dataframe with variables} 11 | 12 | \item{control}{a sim_control parmeter, default is \code{sim_control}} 13 | } 14 | \value{ 15 | a data.frame with two values, a probability and a binary outcome 16 | } 17 | \description{ 18 | Generate a high school graduation for students 19 | } 20 | -------------------------------------------------------------------------------- /man/validate_probability_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{validate_probability_list} 4 | \alias{validate_probability_list} 5 | \title{Validate probability list formatting and structure} 6 | \usage{ 7 | validate_probability_list(list) 8 | } 9 | \arguments{ 10 | \item{list}{list that is to be passed as a probability list to simulation functions} 11 | } 12 | \value{ 13 | Logical TRUE if list is valid, error if list is not valid. 14 | } 15 | \description{ 16 | Validate probability list formatting and structure 17 | } 18 | -------------------------------------------------------------------------------- /man/recode_options.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CEDS.R 3 | \name{recode_options} 4 | \alias{recode_options} 5 | \title{Recode options} 6 | \usage{ 7 | recode_options(data, from = c("SDP", "CEDS")) 8 | } 9 | \arguments{ 10 | \item{data}{a data.frame to recode the variables to CEDS from} 11 | 12 | \item{from}{the data definitions you are recoding from} 13 | } 14 | \value{ 15 | the data object, but with all values matching CEDS specification 16 | recoded to meet the CEDS specification 17 | } 18 | \description{ 19 | Recode options 20 | } 21 | -------------------------------------------------------------------------------- /man/validate_sim_parameter.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{validate_sim_parameter} 4 | \alias{validate_sim_parameter} 5 | \title{Validate outcome simulation list formatting and structure} 6 | \usage{ 7 | validate_sim_parameter(list) 8 | } 9 | \arguments{ 10 | \item{list}{list that is to be passed as a parameter list to outcome simulation functions} 11 | } 12 | \value{ 13 | Logical TRUE if list is valid, error if list is not valid. 14 | } 15 | \description{ 16 | Validate outcome simulation list formatting and structure 17 | } 18 | -------------------------------------------------------------------------------- /man/assign_baseline.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/baselines.R 3 | \name{assign_baseline} 4 | \alias{assign_baseline} 5 | \title{Append baseline data to initial data} 6 | \usage{ 7 | assign_baseline(baseline = NULL, data) 8 | } 9 | \arguments{ 10 | \item{baseline}{character value of the default baseline to assign} 11 | 12 | \item{data}{a data.frame to append the baseline to} 13 | } 14 | \value{ 15 | the data.frame passed by the user with an additional variable appended 16 | } 17 | \description{ 18 | Append baseline data to initial data 19 | } 20 | -------------------------------------------------------------------------------- /man/gen_initial_status.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{gen_initial_status} 4 | \alias{gen_initial_status} 5 | \title{Generate initial student status indicators} 6 | \usage{ 7 | gen_initial_status(data, baseline) 8 | } 9 | \arguments{ 10 | \item{data}{that includes the pre-requsites for generating each status} 11 | 12 | \item{baseline}{character, name of a baseline status to calculate} 13 | } 14 | \value{ 15 | the data with status variables appended 16 | } 17 | \description{ 18 | Generate initial student status indicators 19 | } 20 | -------------------------------------------------------------------------------- /man/gen_nsc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{gen_nsc} 4 | \alias{gen_nsc} 5 | \title{Generate postsecondary institutions} 6 | \usage{ 7 | gen_nsc(n, names = NULL, method = NULL) 8 | } 9 | \arguments{ 10 | \item{n}{number of institutions to generate} 11 | 12 | \item{names}{names to use for schools} 13 | 14 | \item{method}{default NULL, can be set to "scorecard" to use college scorecard data} 15 | } 16 | \value{ 17 | a data.frame of names, IDs, and enrollment weights 18 | } 19 | \description{ 20 | Generate postsecondary institutions 21 | } 22 | -------------------------------------------------------------------------------- /man/tm_convert.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{tm_convert} 4 | \alias{tm_convert} 5 | \title{Covnert a matrix to a row-wise transition matrix} 6 | \usage{ 7 | tm_convert(matrix) 8 | } 9 | \arguments{ 10 | \item{matrix}{a matrix tof counts} 11 | } 12 | \value{ 13 | matrix M divided by the sum of its rows 14 | } 15 | \description{ 16 | Covnert a matrix to a row-wise transition matrix 17 | } 18 | \examples{ 19 | base_mat <- structure(c(44985, 740, 781, 7640), .Dim = c(2L, 2L), 20 | .Dimnames = list(c("0", "1"), c("0", "1"))) 21 | tm_convert(base_mat) 22 | } 23 | -------------------------------------------------------------------------------- /man/grade_transitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/baselines.R 3 | \name{grade_transitions} 4 | \alias{grade_transitions} 5 | \title{Generate a grade advancement transition matrix} 6 | \usage{ 7 | grade_transitions(ngrades = 15L, diag_limit = 0.975) 8 | } 9 | \arguments{ 10 | \item{ngrades}{integer, the number of grade levels to simulate} 11 | 12 | \item{diag_limit}{the minimum probability of a student advancing to the next grade} 13 | } 14 | \value{ 15 | a matrix, ngrades x ngrades of grade transition probabilities 16 | } 17 | \description{ 18 | Generate a grade advancement transition matrix 19 | } 20 | -------------------------------------------------------------------------------- /man/recode_credits.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{recode_credits} 4 | \alias{recode_credits} 5 | \title{Recode numeric data into plausible credit data} 6 | \usage{ 7 | recode_credits(x, top = 6) 8 | } 9 | \arguments{ 10 | \item{x}{numeric, vector of numerics to be truncated} 11 | 12 | \item{top}{integer, maximum allowable value} 13 | } 14 | \value{ 15 | a vector, length of x, with truncated values 16 | } 17 | \description{ 18 | Recode numeric data into plausible credit data 19 | } 20 | \details{ 21 | enforces no negative numbers and truncates data at a user specified 22 | maximum 23 | } 24 | -------------------------------------------------------------------------------- /man/gen_students.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{gen_students} 4 | \alias{gen_students} 5 | \title{Generate student-level attributes} 6 | \usage{ 7 | gen_students(nstu, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{nstu}{integer, number of students to simulate} 11 | 12 | \item{control}{a list, defined by \code{\link{sim_control}}} 13 | } 14 | \value{ 15 | a data.frame 16 | } 17 | \description{ 18 | Generate student-level attributes 19 | } 20 | \details{ 21 | The default is to generate students in racial groups and male/female 22 | in proportion to the U.S. population. 23 | } 24 | -------------------------------------------------------------------------------- /man/tidy_sequence.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{tidy_sequence} 4 | \alias{tidy_sequence} 5 | \title{Tidy a two-state markov sequence for output} 6 | \usage{ 7 | tidy_sequence(seq, states) 8 | } 9 | \arguments{ 10 | \item{seq}{a vector of sequence elements, with only two states} 11 | 12 | \item{states}{a vector of length two naming both possible states} 13 | } 14 | \value{ 15 | a data.frame 16 | } 17 | \description{ 18 | Tidy a two-state markov sequence for output 19 | } 20 | \examples{ 21 | tidy_sequence(seq = c("Yes", "No", "No", "No", "Yes", "Yes"), 22 | states = c("Yes", "No")) 23 | } 24 | -------------------------------------------------------------------------------- /man/assign_schools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{assign_schools} 4 | \alias{assign_schools} 5 | \title{Assign student enrollment spells to a school ID} 6 | \usage{ 7 | assign_schools(student, schools, method = NULL) 8 | } 9 | \arguments{ 10 | \item{student}{data frame of student-year observations} 11 | 12 | \item{schools}{data frame of schools to assign from} 13 | 14 | \item{method}{currently unused, will allow for different assignment methods} 15 | } 16 | \value{ 17 | student, with additional column schid appended 18 | } 19 | \description{ 20 | Assign student enrollment spells to a school ID 21 | } 22 | -------------------------------------------------------------------------------- /man/gen_ps_enrollment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_ps_enrollment} 4 | \alias{gen_ps_enrollment} 5 | \title{Generate a student-year long table of postsecondary enrollments} 6 | \usage{ 7 | gen_ps_enrollment(hs_outcomes, nsc, control) 8 | } 9 | \arguments{ 10 | \item{hs_outcomes}{a dataframe of high school outcomes} 11 | 12 | \item{nsc}{a dataframe of postsecondary institutions} 13 | 14 | \item{control}{a control object from \code{sim_control()}} 15 | } 16 | \value{ 17 | a table of enrollments 18 | } 19 | \description{ 20 | Generate a student-year long table of postsecondary enrollments 21 | } 22 | -------------------------------------------------------------------------------- /man/rand_vect_cont.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{rand_vect_cont} 4 | \alias{rand_vect_cont} 5 | \title{Generate positive random numbers that sum to a value} 6 | \usage{ 7 | rand_vect_cont(N, M, sd = 1) 8 | } 9 | \arguments{ 10 | \item{N}{number of numbers to generate} 11 | 12 | \item{M}{constraint on the sum} 13 | 14 | \item{sd}{standard deviation of values} 15 | } 16 | \value{ 17 | a vector of numerics, length N 18 | } 19 | \description{ 20 | Generate positive random numbers that sum to a value 21 | } 22 | \examples{ 23 | out <- rand_vect_cont(N = 10, M = 2, sd = 1) 24 | sum(out) 25 | length(out) 26 | } 27 | -------------------------------------------------------------------------------- /man/simpop.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{simpop} 4 | \alias{simpop} 5 | \title{Grand simulation} 6 | \usage{ 7 | simpop(nstu, seed = NULL, control = sim_control()) 8 | } 9 | \arguments{ 10 | \item{nstu}{integer, number of students to simulate} 11 | 12 | \item{seed}{integer, random seed to make simulation reproducible across 13 | sessions, optional} 14 | 15 | \item{control}{a list, defined by \code{\link{sim_control}}} 16 | } 17 | \value{ 18 | a list with simulated data 19 | } 20 | \description{ 21 | Grand simulation 22 | } 23 | \examples{ 24 | \dontrun{ 25 | out <- simpop(nstu = 20, seed = 213) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/ceds_cleaner.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleaners.R 3 | \name{ceds_cleaner} 4 | \alias{ceds_cleaner} 5 | \title{CEDS oputput function} 6 | \usage{ 7 | ceds_cleaner(simouts, output = "directory", directory = NULL) 8 | } 9 | \arguments{ 10 | \item{simouts}{a simulation list resulting from a call to the \code{\link{simpop}} 11 | function} 12 | 13 | \item{output}{a character, default "directory", specifying where the output should good} 14 | 15 | \item{directory}{a path to a directory to store the output files} 16 | } 17 | \value{ 18 | Nothing. Output is saved out to a file on disk. 19 | } 20 | \description{ 21 | CEDS oputput function 22 | } 23 | -------------------------------------------------------------------------------- /man/group_rescale.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{group_rescale} 4 | \alias{group_rescale} 5 | \title{Rescaling variables in groups} 6 | \usage{ 7 | group_rescale(data, var, group_var, newvar = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{a dataframe containing variables you wish to rescale} 11 | 12 | \item{var}{name of the variable to be rescaled} 13 | 14 | \item{group_var}{character vector of the grouping terms} 15 | 16 | \item{newvar}{optional character vector for the name of the new rescaled variable} 17 | } 18 | \value{ 19 | data with the newvar appended 20 | } 21 | \description{ 22 | Rescaling variables in groups 23 | } 24 | -------------------------------------------------------------------------------- /man/make_inds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/CEDS.R 3 | \name{make_inds} 4 | \alias{make_inds} 5 | \title{Append indicator variables to a data frame based on a single factor variable} 6 | \source{ 7 | \url{http://stackoverflow.com/questions/35943455/creating-indicator-variable-columns-in-dplyr-chain} 8 | } 9 | \usage{ 10 | make_inds(data, col) 11 | } 12 | \arguments{ 13 | \item{data}{a data frame} 14 | 15 | \item{col}{character, name of the factor column to generate indicators for} 16 | } 17 | \value{ 18 | a data frame with the factor levels appended as columns 19 | } 20 | \description{ 21 | Append indicator variables to a data frame based on a single factor variable 22 | } 23 | -------------------------------------------------------------------------------- /man/cond_prob.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/baselines.R 3 | \name{cond_prob} 4 | \alias{cond_prob} 5 | \title{Generate conditional probabilities by group} 6 | \usage{ 7 | cond_prob(data, factor, newvar, prob_list) 8 | } 9 | \arguments{ 10 | \item{data}{dataframe to add variable to} 11 | 12 | \item{factor}{grouping variable that probability of \code{newvar} is conditional on} 13 | 14 | \item{newvar}{name, character, of new variable defined by \code{prob_list}} 15 | 16 | \item{prob_list}{a list, defining the way \code{newvar} should be generated} 17 | } 18 | \value{ 19 | data.frame with \code{newvar} appended to dataframe 20 | } 21 | \description{ 22 | Generate conditional probabilities by group 23 | } 24 | -------------------------------------------------------------------------------- /man/school_transitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/baselines.R 3 | \name{school_transitions} 4 | \alias{school_transitions} 5 | \title{Generate a random transition matrix for school enrollments} 6 | \usage{ 7 | school_transitions(nschls = 15L, diag_limit = 0.9) 8 | } 9 | \arguments{ 10 | \item{nschls}{integer, the number of schools available, default = 15} 11 | 12 | \item{diag_limit}{a numeric between 0 and 1 that sets the minimum probability that a student 13 | will stay schools} 14 | } 15 | \value{ 16 | a transition matrix nschls X nschls of transition probabilities 17 | } 18 | \description{ 19 | Generate a random transition matrix for school enrollments 20 | } 21 | \examples{ 22 | out <- school_transitions(12) 23 | out 24 | } 25 | -------------------------------------------------------------------------------- /man/markov_cond_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{markov_cond_list} 4 | \alias{markov_cond_list} 5 | \title{Compute conditional Markov sequences in a pipeline} 6 | \usage{ 7 | markov_cond_list(groupname, n, lst, ...) 8 | } 9 | \arguments{ 10 | \item{groupname}{variable that identifies the list element to use} 11 | 12 | \item{n}{number of elements to generate, usually defined by \link[dplyr]{n} in dplyr} 13 | 14 | \item{lst}{probability list that contains function and function parameters} 15 | 16 | \item{...}{additional arguments passed into the function} 17 | } 18 | \value{ 19 | A sequence generated by functions in the probability list 20 | } 21 | \description{ 22 | Compute conditional Markov sequences in a pipeline 23 | } 24 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: OpenSDPsynthR 2 | Title: Package to generate synthetic data for the OpenSDP Project 3 | Version: 0.1.1.9000 4 | Authors@R: person("Jared", "Knowles", email = "jknowles@gmail.com", role = c("aut", "cre")) 5 | Description: Generate realistic synthetic unit-level education system data suitable for analysis. 6 | Depends: R (>= 3.0.2), 7 | dplyr, 8 | methods, 9 | lme4 (>= 1.1-11) 10 | Imports: lubridate, 11 | markovchain, 12 | magrittr, 13 | wakefield, 14 | purrr, 15 | mvtnorm, 16 | simglm, 17 | tidyr, 18 | stringr, 19 | lazyeval 20 | Suggests: testthat, 21 | knitr, 22 | rmarkdown, 23 | ggplot2 24 | Remotes: jknowles/simglm 25 | License: MIT + file LICENSE 26 | Encoding: UTF-8 27 | LazyData: false 28 | RoxygenNote: 6.1.0 29 | VignetteBuilder: knitr 30 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | OpenSDP invites contributions which will help education analysts use data to 2 | address challenges facing our schools. As our community grows, we will be 3 | formalizing our contribution processes. In the near term, we invite you to use 4 | our resources freely. If you see a way to make them better or have suggestions 5 | for new content, please submit feedback either through our Slack team 6 | [(opensdp.slack.com)](https://opensdp.slack.com) or by submitting an issue to the 7 | relevant repository. We welcome pull requests with updates and corrections to 8 | our existing analytic packages. If you are interested in sharing a repository 9 | of your own work with the education community through OpenSDP, please reach out 10 | to us at opensdp@gse.harvard.edu. 11 | 12 | Visit the OpenSDP website at [opensdp.github.io](https://opensdp.github.io). 13 | -------------------------------------------------------------------------------- /man/make_binary_series.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/markov_transitions.R 3 | \name{make_binary_series} 4 | \alias{make_binary_series} 5 | \title{Create an autocorrelated binary series} 6 | \usage{ 7 | make_binary_series(n = 100, mean = 0.5, corr = 0) 8 | } 9 | \arguments{ 10 | \item{n}{integer, length of series to generate} 11 | 12 | \item{mean}{numeric between 0 and 1, proportion of cases with value 1} 13 | 14 | \item{corr}{numeric between -1 and 1, how correlated should series be?} 15 | } 16 | \value{ 17 | A binary series 18 | } 19 | \description{ 20 | Generate a binary series that is autocorrelated using the 21 | Markov method from \code{\link{make_markov_series}} 22 | } 23 | \examples{ 24 | make_binary_series(n=12,mean=0.5,corr=0.9) 25 | make_binary_series(n=100,mean=0.5,corr=0.1) 26 | } 27 | -------------------------------------------------------------------------------- /man/gen_schools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_defaults.R 3 | \name{gen_schools} 4 | \alias{gen_schools} 5 | \title{Generate a roster of schools to assign students to} 6 | \usage{ 7 | gen_schools(control) 8 | } 9 | \arguments{ 10 | \item{control}{simulation control parameters from \code{\link{sim_control}}} 11 | } 12 | \value{ 13 | a data.frame with schools and their attributes 14 | } 15 | \description{ 16 | Generate a roster of schools to assign students to 17 | } 18 | \details{ 19 | Controls include: 20 | 21 | \describe{ 22 | \item{n}{number of schools} 23 | \item{mean}{a vector of means for the school attributes} 24 | \item{sigma}{a covariance matrix for the school attributes} 25 | \item{names}{vector to draw names from} 26 | \item{best_schl}{a character value specifiying the ID of the best school} 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /data-raw/grade_matrix.csv: -------------------------------------------------------------------------------- 1 | tofrom, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 2 | 0, 0.1, 0.88, 0.02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 3 | 1, 0.01, 0.06, 0.92, 0.01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 4 | 2, 0, 0.01, 0.02, 0.96, 0.01, 0, 0, 0, 0, 0, 0, 0, 0, 0 5 | 3, 0, 0, 0.02, 0.07, 0.89, 0.02, 0, 0, 0, 0, 0, 0, 0, 0 6 | 4, 0, 0, 0, 0.01, 0.02, 0.95, 0.02, 0, 0, 0, 0, 0, 0, 0 7 | 5, 0, 0, 0, 0, 0.01, 0.02, 0.95, 0.02, 0, 0, 0, 0, 0, 0 8 | 6, 0, 0, 0, 0, 0, 0.01, 0.02, 0.95, 0.02, 0, 0, 0, 0, 0 9 | 7, 0, 0, 0, 0, 0, 0, 0.01, 0.02, 0.93, 0.04, 0, 0, 0, 0 10 | 8, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.02, 0.93, 0.04, 0, 0, 0 11 | 9, 0, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.08, 0.87, 0.04, 0, 0 12 | 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.08, 0.87, 0.04, 0 13 | 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.08, 0.87, 0.04 14 | 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15, 0.85 15 | 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 16 | -------------------------------------------------------------------------------- /man/better_sim.lm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{better_sim.lm} 4 | \alias{better_sim.lm} 5 | \title{Generate simulated predictions from a linear model that account for 6 | model and parameter error} 7 | \usage{ 8 | better_sim.lm(object, nsim, newdata, resid_error = FALSE) 9 | } 10 | \arguments{ 11 | \item{object}{an object of class lm or glm} 12 | 13 | \item{nsim}{number of simulations per observation to generate} 14 | 15 | \item{newdata}{dataframe containing the observations to generate predictions 16 | for} 17 | 18 | \item{resid_error}{should the model residual error be added to predictions, 19 | default is FALSE} 20 | } 21 | \value{ 22 | a matrix of predictions nrow(newdata) x nsim columns 23 | } 24 | \description{ 25 | Generate simulated predictions from a linear model that account for 26 | model and parameter error 27 | } 28 | -------------------------------------------------------------------------------- /man/gamma_GK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gamma_gk.R 3 | \name{gamma_GK} 4 | \alias{gamma_GK} 5 | \title{Estimate the Goodman and Kruskal gamma statistic} 6 | \usage{ 7 | gamma_GK(x, y = NULL, print = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{an unordered factor variable} 11 | 12 | \item{y}{an unordered factor variable} 13 | 14 | \item{print}{a logical vector indicating whether results should be printed to the console} 15 | } 16 | \value{ 17 | A named list with gamma, standard error of gamma, p-value of gamma, and statistical significance 18 | } 19 | \description{ 20 | Estimate the correlation between two unordered factor variables using the Goodman and Kruskal gamma statistic 21 | } 22 | \note{ 23 | Yadda yadda yadda 24 | } 25 | \references{ 26 | Adapted from Simon Jackman from: \url{http://jackman.stanford.edu/classes/151B/06/class0517.r} 27 | } 28 | \author{ 29 | Jared E. Knowles 30 | } 31 | -------------------------------------------------------------------------------- /man/fit_series.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/markov_transitions.R 3 | \name{fit_series} 4 | \alias{fit_series} 5 | \title{Identify the parameters that define a series of binary outcomes} 6 | \usage{ 7 | fit_series(series, return = c("matrix", "fit"), ...) 8 | } 9 | \arguments{ 10 | \item{series}{a vector of 0 and 1 values} 11 | 12 | \item{return}{a character with two options, matrix returns a transition 13 | matrix, "fit" returns a \code{\link{markovchain}} object} 14 | 15 | \item{...}{additional arguments to pass to \code{\link{markovchainFit}}} 16 | } 17 | \value{ 18 | Either a transition matrix or a list with parameters mean and cor 19 | defining the transitions in the vector 20 | } 21 | \description{ 22 | Identify the parameters that define a series of binary outcomes 23 | } 24 | \examples{ 25 | series <- make_markov_series(10, matrix(c(0.3, 0.7, 0.25, 0.75), 26 | nrow = 2, byrow =TRUE)) 27 | fit_series(series) 28 | } 29 | -------------------------------------------------------------------------------- /data-raw/age_grade_baseline.csv: -------------------------------------------------------------------------------- 1 | age,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,Total 2 | 0,1,3,0,0,0,0,1,0,0,0,0,0,0,0,5 3 | 1,8,1,1,0,0,0,0,0,0,0,0,0,0,0,10 4 | 2,88,0,0,0,0,0,0,0,0,0,0,0,0,0,88 5 | 3,565,2,0,0,0,0,0,0,0,0,0,0,0,0,567 6 | 4,839,43,0,0,0,0,0,0,0,0,0,0,0,0,882 7 | 5,117,8627,75,1,0,0,0,0,0,0,0,0,0,0,8820 8 | 6,0,1120,8449,89,1,0,0,0,0,0,0,0,0,0,9659 9 | 7,0,12,1595,8098,97,0,0,0,0,0,0,0,0,0,9802 10 | 8,0,1,35,1872,7503,111,1,0,0,0,0,0,0,0,9523 11 | 9,0,1,2,65,2090,7338,139,0,0,0,0,0,0,0,9635 12 | 10,0,0,0,0,104,2193,7351,153,2,0,0,0,0,0,9803 13 | 11,0,0,0,0,0,153,2244,7092,166,1,0,0,0,1,9657 14 | 12,0,0,0,0,1,2,159,2384,6862,222,1,0,0,1,9632 15 | 13,0,0,0,0,0,0,1,211,2373,6583,267,0,0,2,9437 16 | 14,0,0,0,0,0,0,0,6,310,2388,6691,228,0,0,9623 17 | 15,0,0,0,0,0,0,0,0,5,299,3087,6127,233,7,9758 18 | 16,0,0,0,0,0,0,0,0,0,11,994,2743,5914,252,9914 19 | 17,0,0,0,0,0,0,0,0,0,0,283,791,2349,5777,9200 20 | 18,0,0,0,0,0,0,0,0,0,0,71,212,480,1942,2705 21 | 19,0,0,0,1,0,0,0,0,0,0,12,33,93,343,482 22 | 20,0,0,0,0,0,0,0,1,0,0,7,12,19,126,165 23 | 21,0,0,0,0,0,0,0,0,0,0,3,1,8,13,25 24 | 22,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 President and Fellows of Harvard College 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data-raw/ellDist.csv: -------------------------------------------------------------------------------- 1 | age,black,asian,hispanic,amerind,white,other,multiracial,hawaiian_pi,total 2 | 1,0.000,,0.000,,0.000,,,, 3 | 2,0.333,0.500,0.000,,0.158,,,, 4 | 3,0.000,0.000,0.000,,0.000,,,, 5 | 4,0.000,0.450,0.610,0.000,0.000,0.000,0.000,0.000,0.000 6 | 5,0.000,0.450,0.620,0.000,0.000,0.000,0.000,0.00,0.000 7 | 6,0.024,0.431,0.614,0.081,0.011,0.500,0.019,0.00,0.132 8 | 7,0.032,0.408,0.636,0.091,0.009,0.375,0.009,0.00,0.131 9 | 8,0.019,0.305,0.559,0.083,0.007,0.500,0.014,0,0.105 10 | 9,0.017,0.182,0.432,0.116,0.005,0.000,0.000,0,0.079 11 | 10,0.016,0.138,0.348,0.037,0.005,0.000,0.012,0,0.060 12 | 11,0.009,0.113,0.238,0.000,0.002,0.500,0.005,0,0.039 13 | 12,0.009,0.108,0.195,0.027,0.002,0.200,0.000,0,0.031 14 | 13,0.010,0.088,0.184,0.038,0.003,0.000,0.000,0,0.029 15 | 14,0.013,0.084,0.158,0.000,0.001,0.000,0.000,0,0.025 16 | 15,0.013,0.114,0.188,0.000,0.002,0.000,0.000,0,0.029 17 | 16,0.007,0.065,0.167,0.000,0.002,0.000,0.000,0,0.024 18 | 17,0.011,0.065,0.175,0.000,0.002,0.000,0.000,0,0.023 19 | 18,0.007,0.084,0.155,0.027,0.002,0.000,0.000,0,0.020 20 | 19,0.018,0.159,0.192,0.200,0.003,0.000,0.050,0,0.033 21 | 20,0.039,0.385,0.291,0.167,0.000,0.000,0.058,0,0, 22 | 21,0.071,0.625,0.222,0.500,0.000,1.000,0.000,0,0.088 23 | 22,0.000,0.000,0.143,0.000,0.000,,0.018,0,0,0, 24 | -------------------------------------------------------------------------------- /tests/testthat/test-all.R: -------------------------------------------------------------------------------- 1 | # All tests 2 | 3 | # expand.grid.df 4 | 5 | context("Test expand_grid_df") 6 | ex_data <- data.frame(id = 100:116, age = c(3:19), 7 | race = c(rep(c("amerind", "asian", "black", "hispanic", 8 | "multiracial", "white", "hawaiian_pi"), 2), 9 | "other", "bad", "error")) 10 | 11 | test_that("Expand grid returns expected objects", { 12 | out <- expand_grid_df(ex_data, data.frame(year = 2006:2012)) 13 | expect_is(out, "data.frame") 14 | expect_equal(ncol(out), 4) 15 | expect_identical(names(out)[4], "year") 16 | expect_equal(nrow(out), nrow(ex_data)*7) 17 | expect_identical(out[1:5, 1:3], ex_data[1:5, 1:3]) 18 | out <- expand_grid_df(ex_data, c("year" = 2006:2012)) 19 | expect_is(out, "data.frame") 20 | expect_equal(ncol(out), 4) 21 | expect_identical(names(out)[4], "y") 22 | expect_equal(nrow(out), nrow(ex_data) * 7) 23 | expect_identical(out[1:5, 1:3], ex_data[1:5, 1:3]) 24 | out <- expand_grid_df(ex_data, 2006:2012) 25 | expect_is(out, "data.frame") 26 | expect_equal(ncol(out), 4) 27 | expect_identical(names(out)[4], "y") 28 | expect_equal(nrow(out), nrow(ex_data) * 7) 29 | expect_identical(out[1:5, 1:3], ex_data[1:5, 1:3]) 30 | }) 31 | 32 | 33 | # assign_grade 34 | 35 | # cond_prob 36 | 37 | # createSeries 38 | 39 | 40 | # findTransitions 41 | 42 | # get_baseline 43 | 44 | # get_code_values 45 | 46 | # ell 47 | 48 | # xwalk 49 | -------------------------------------------------------------------------------- /man/gen_outcome_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_outcomes.R 3 | \name{gen_outcome_model} 4 | \alias{gen_outcome_model} 5 | \title{Generate data for a multilevel outcome} 6 | \usage{ 7 | gen_outcome_model(fixed, fixed_param, random_var, fact_vars, 8 | cov_param = NULL, cor_vars = NULL, ngrps, unbalanceRange, 9 | type = "binary", with_err_gen = NULL, error_var = NULL) 10 | } 11 | \arguments{ 12 | \item{fixed}{a formula with RHS only that specifies the variables to use} 13 | 14 | \item{fixed_param}{a vector of numerics for the coefficients of variables in fixed} 15 | 16 | \item{random_var}{a numeric, length 1, variance of random (school level) component} 17 | 18 | \item{fact_vars}{for each variable in fixed that is a factor, a definition...} 19 | 20 | \item{cov_param}{a list, defining any continuous variables} 21 | 22 | \item{cor_vars}{correlations between fixed variables} 23 | 24 | \item{ngrps}{number of schools} 25 | 26 | \item{unbalanceRange}{range of enrollments in each school} 27 | 28 | \item{type}{character, either "binary" or "linear" to choose outcome variable 29 | type to generate} 30 | 31 | \item{with_err_gen}{name of a distribution function to generate the errors, optional} 32 | 33 | \item{error_var}{integer values to pass to err_gen, optional} 34 | } 35 | \value{ 36 | a list with two elements 37 | } 38 | \description{ 39 | Generate data for a multilevel outcome 40 | } 41 | \examples{ 42 | zed2 <- do.call(gen_outcome_model, sim_control()$gpa_sim_parameters) 43 | } 44 | -------------------------------------------------------------------------------- /man/make_markov_series.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/markov_transitions.R 3 | \name{make_markov_series} 4 | \alias{make_markov_series} 5 | \title{Create a series of binary variables based on a transition matrix} 6 | \source{ 7 | \itemize{ 8 | \item \url{http://stats.stackexchange.com/questions/14175/how-to-generate-random-auto-correlated-binary-time-series-data} 9 | \item \url{https://en.wikipedia.org/wiki/Examples_of_Markov_chains} 10 | } 11 | } 12 | \usage{ 13 | make_markov_series(n, tm, burnin = NULL, ...) 14 | } 15 | \arguments{ 16 | \item{n}{an integer representing the length of the series} 17 | 18 | \item{tm}{a Transition Matrix that describes the transition matrix} 19 | 20 | \item{burnin}{integer, number of values to simulate before drawing from the series} 21 | 22 | \item{...}{additional arguments to apss to \code{\link{markovchainSequence}}} 23 | } 24 | \value{ 25 | a vector of length \code{n} with a random series of 0 and 1 generated 26 | from the \code{tm} 27 | } 28 | \description{ 29 | Generate a 30 | } 31 | \details{ 32 | A Transition Matrix is a 2x2 matrix where: 33 | \itemize{ 34 | \item Element 1, 1 is the probability of moving to state 0 conditional on being in state 0 35 | \item Element 2, 1 is the probability of moving to state 0 conditional on being in state 1 36 | \item Element 1, 2 is the probability of moving to state 1 conditional on being in state 0 37 | \item Element 2, 2 is the probability of moving to state 1 conditional on being in state 1 38 | } 39 | } 40 | \examples{ 41 | make_markov_series(10, matrix(c(0.6,0.4,0.9,0.1), nrow=2, byrow=TRUE)) 42 | } 43 | -------------------------------------------------------------------------------- /man/age_calc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{age_calc} 4 | \alias{age_calc} 5 | \title{Function to calculate age from date of birth.} 6 | \source{ 7 | This function was developed in part from this response on the R-Help mailing list. 8 | } 9 | \usage{ 10 | age_calc(dob, enddate = Sys.Date(), units = "months", precise = TRUE) 11 | } 12 | \arguments{ 13 | \item{dob}{a vector of class \code{Date} representing the date of birth/start date} 14 | 15 | \item{enddate}{a vector of class Date representing the when the observation's 16 | age is of interest, defaults to current date.} 17 | 18 | \item{units}{character, which units of age should be calculated? allowed values are 19 | days, months, and years} 20 | 21 | \item{precise}{logical indicating whether or not to calculate with leap year 22 | and leap second precision} 23 | } 24 | \value{ 25 | A numeric vector of ages the same length as the dob vector 26 | } 27 | \description{ 28 | his function calculates age in days, months, or years from a 29 | date of birth to another arbitrary date. This returns a numeric vector in 30 | the specified units. 31 | } 32 | \examples{ 33 | a <- as.Date(seq(as.POSIXct('1987-05-29 018:07:00'), len=26, by="21 day")) 34 | b <- as.Date(seq(as.POSIXct('2002-05-29 018:07:00'), len=26, by="21 day")) 35 | 36 | age <- age_calc(a, units='years') 37 | age 38 | age <- age_calc(a, units='months') 39 | age 40 | age <- age_calc(a, as.Date('2005-09-01')) 41 | age 42 | } 43 | \seealso{ 44 | See also \code{\link{difftime}} which this function uses and mimics 45 | some functionality but at higher unit levels. 46 | } 47 | \author{ 48 | Jason P. Becker 49 | } 50 | -------------------------------------------------------------------------------- /tests/testthat/test-validators.R: -------------------------------------------------------------------------------- 1 | # Test validation tools 2 | 3 | 4 | tm_gifted_f <- matrix( 5 | c(500, 1, 2, 500), 6 | nrow = 2, 7 | byrow = TRUE, 8 | dimnames = list(c("1", "0"), c("1", "0")) 9 | ) 10 | tm_gifted_m <- tm_gifted_f 11 | tm_gifted_m[1, 1] <- tm_gifted_m[1, 1] + 25 12 | tm_gifted_m <- tm_convert(tm_gifted_m) 13 | tm_gifted_f <- tm_convert(tm_gifted_f) 14 | 15 | gifted_list <- list( 16 | "Male" = list(f = make_markov_series, 17 | pars = list(tm = tm_gifted_m, 18 | # Use quote so for each call in the loop sample is redrawn 19 | t0 = quote( 20 | sample(c("1", "0"), 1, prob = c(10, 90)) 21 | ))), 22 | "Female" = list(f = make_markov_series, 23 | pars = list(tm = tm_gifted_f, 24 | t0 = quote( 25 | sample(c("1", "0"), 1, prob = c(8, 92)) 26 | ))), 27 | "GROUPVARS" = c("Sex") 28 | ) 29 | 30 | 31 | # 32 | 33 | grad_sim_parameters <- list( 34 | fixed = ~ 1 + math_ss + scale_gpa + gifted + iep + frpl + ell + male, 35 | random_var = 0.09948, 36 | cov_param = list( 37 | dist_fun = c("rnorm", "rnorm", rep("rbinom", 5)), 38 | var_type = rep("lvl1", 7), 39 | opts = list( 40 | list(mean = 0, sd = 1), 41 | list(mean = 0, sd = 1), 42 | list(size = 1, prob = 0.1), 43 | list(size = 1, prob = 0.2), 44 | list(size = 1, prob = 0.45), 45 | list(size = 1, prob = 0.1), 46 | list(size = 1, prob = 0.47) 47 | ) 48 | ), 49 | cor_vars = c( 50 | 0.5136, 0.453, -0.276, -0.309, -0.046, -0.033, 51 | 0.2890, -0.1404, -0.2674, -0.0352,-0.1992, 52 | -0.1354, -0.2096, -0.0305, -0.0290, 53 | 0.1433, -0.0031, 0.1269, 54 | 0.0601, 0.0066, 55 | 0.0009 56 | ), 57 | fixed_param = c( 58 | 1.7816, 0.10764, 1.05872, -0.07352, -0.07959, 59 | -0.331647,-0.22318254, 0.0590 60 | ), 61 | ngrps = 30, 62 | unbalanceRange = c(100, 1500) 63 | ) 64 | 65 | assess_sim_par <- OpenSDPsynthR::sim_control()$assess_sim_par 66 | gpa_sim_par <- OpenSDPsynthR::sim_control()$gpa_sim_par 67 | 68 | test_that("Validators work for two level and three level sim", { 69 | expect_true(validate_sim_parameter(assess_sim_par)) 70 | expect_true(validate_sim_parameter(gpa_sim_par)) 71 | expect_true(validate_sim_parameter(grad_sim_parameters)) 72 | }) 73 | 74 | -------------------------------------------------------------------------------- /tests/testthat/test-age_calc.R: -------------------------------------------------------------------------------- 1 | # age, retention, and moves calculations 2 | # From eeptools package 3 | context("Test age calculator") 4 | 5 | test_that("Leap year calculations work", { 6 | # from @larmarange 7 | expect_equal(age_calc(as.Date('2004-01-15'), as.Date('2004-02-16')), 1.034483, 8 | tol = .00001) 9 | expect_equal(age_calc(as.Date('2005-01-15'), as.Date('2005-02-16')), 1.035714, 10 | tol = .00001) 11 | expect_equal(age_calc(as.Date('1995-01-15'), as.Date('2003-02-16')), 12 | age_calc(as.Date('1994-01-15'), as.Date('2002-02-16'))) 13 | expect_false(age_calc(as.Date('1996-01-15'), as.Date('2004-02-16')) == 14 | age_calc(as.Date('1994-01-15'), as.Date('2002-02-16'))) 15 | }) 16 | 17 | test_that("All function parameters result in a numeric calculations with sane inputs", { 18 | tests <- expand.grid(precise = c(TRUE, FALSE), 19 | units = c("days", "months", "years"), 20 | dob = c("atomic", "vector"), 21 | enddate = c("atomic", "vector")) 22 | 23 | safe.ifelse <- function(cond, yes, no) structure(ifelse(cond, yes, no), class = class(yes)) 24 | 25 | for(i in 1:nrow(tests)){ 26 | atomDOB <- as.Date(as.POSIXct('1987-05-29 018:07:00')) 27 | vecDOB <- as.Date(seq(as.POSIXct('1987-05-29 018:07:00'), len=26, by="21 day")) 28 | vecED <- as.Date(seq(as.POSIXct('2017-05-29 018:07:00'), len=26, by="21 day")) 29 | atomED <- as.Date(as.POSIXct('2017-05-29 018:07:00')) 30 | 31 | dob <- safe.ifelse(tests[i, "dob"] == "atomic", atomDOB, vecDOB) 32 | enddate <- safe.ifelse(tests[i, "enddate"] == "atomic", atomED, vecED) 33 | 34 | out <- age_calc(dob = dob, enddate = enddate, units = tests[i, ]$units, 35 | precise = tests[i, ]$precise) 36 | expect_true(class(out) %in% c("difftime", "numeric")) 37 | } 38 | 39 | }) 40 | 41 | test_that("Bad inputs yield correct errors", { 42 | expect_error(age_calc('2004-01-15', '2004-02-16'), 43 | "Both dob and enddate must be Date class objects") 44 | expect_error(age_calc(as.Date('2004-01-15'), '2004-02-16'), 45 | "Both dob and enddate must be Date class objects") 46 | expect_error(age_calc('2004-01-15', as.Date('2004-02-16')), 47 | "Both dob and enddate must be Date class objects") 48 | expect_error(age_calc(as.Date('2004-02-16'), as.Date('2004-01-15')), 49 | "End date must be a date after date of birth") 50 | 51 | }) 52 | -------------------------------------------------------------------------------- /tests/testthat/test-gen.R: -------------------------------------------------------------------------------- 1 | # Test gen 2 | 3 | context("Test high level data generation functions") 4 | 5 | #TODO make gen_students produce consistent number of columns 6 | test_that("gen_students produces students", { 7 | set.seed(21421) 8 | out <- gen_students(n = 10) 9 | expect_equal(nrow(out), 10) 10 | expect_is(out, "data.frame") 11 | expect_identical(names(out), 12 | c("sid", "Sex", "Birthdate", "Race", "White", 13 | "Black.or.African.American", 14 | "Hispanic.or.Latino.Ethnicity", 15 | "Demographic.Race.Two.or.More.Races", "id_type")) 16 | expect_is(out[, "sid"], "factor") 17 | expect_is(out[, "Sex"], "factor") 18 | expect_is(out[, "Birthdate"], "Date") 19 | expect_is(out[, "Race"], "factor") 20 | expect_is(out[, "White"], "character") 21 | expect_is(out[, "Black.or.African.American"], "character") 22 | out <- gen_students(n = 100) 23 | expect_equal(nrow(out), 100) 24 | expect_equal(ncol(out), 11) 25 | }) 26 | 27 | test_that("gen_student_years longitudinal data", { 28 | set.seed(2352) 29 | out <- gen_students(n = 100) 30 | out_l <- gen_student_years(data = out) 31 | expect_message(gen_student_years(data = out), regexp = NA) 32 | expect_equal(ncol(out_l), 8) 33 | # Figure out how to test number of rows produced 34 | expect_equal(min(out_l$year), sim_control()$minyear) 35 | expect_equal(max(out_l$year), sim_control()$maxyear) 36 | expect_identical(names(out_l)[1:4], c("sid", "year", "Birthdate", "age")) 37 | expect_identical(names(out_l)[5:8], c("enrollment_status", "cohort_grad_year", 38 | "cohort_year", "exit_type")) 39 | }) 40 | 41 | 42 | # context("Test annual status variables") 43 | # ex_data <- data.frame(id = 100:116, 44 | # Sex = sample(c("Male", "Female"), 17, replace = TRUE), 45 | # Race = sample(c("White", "Asian"), 17, replace = TRUE)) 46 | # 47 | # gen_annual_status(ex_data) 48 | # 49 | # test_that("Does it work?", { 50 | # 51 | # }) 52 | # 53 | # cond_vars <- get_sim_groupvars(control) 54 | # stu_year <- left_join(stu_year, demog_master[, c(idvar, cond_vars)], 55 | # by = idvar) 56 | # stu_year <- gen_annual_status(stu_year, control = control) 57 | 58 | 59 | # gen_annual_status 60 | # gen_initial_status 61 | # simpop 62 | # gen_student_years 63 | # make_inds 64 | # sim_control 65 | 66 | test_that("Simulation control works", { 67 | expect_is(sim_control(), "list") 68 | 69 | }) 70 | -------------------------------------------------------------------------------- /data-raw/CEDS_SDP_map.csv: -------------------------------------------------------------------------------- 1 | entity,category,CEDS_name,sdp_name,CEDS_Option_set,SDP_option_match 2 | K12Student,Demographic,Birthdate,dob,YYYY-MM-DD,YYYY-MM-DD 3 | K12Student,Demographic,White,white,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected 4 | K12Student,Demographic,Asian,asian,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected 5 | K12Student,Demographic,American Indian or Alaska Native,amerind,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected 6 | K12Student,Demographic,Black or African American,black,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected 7 | K12Student,Demographic,Hispanic or Latino Ethnicity,hispanic,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected 8 | K12Student,Demographic,Demographic Race Two or More Races,multiracial,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected 9 | K12Student,Demographic,Native Hawaiian or Other Pacific Islander,hawaiian_pi,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected 10 | K12Student,Demographic,,other 11 | K12Student,Limited English Proficiency,Limited English Proficiency Status,ell,Yes - Yes;No - No,1 - Yes;0 - No;NA - Not Selected 12 | K12Student,Disability,Primary Disability Type,disab_type,AUT - Autism; DB - Deaf-blindness; DD - Developmental Delay; EMN - Emotional disturbance; HI - Hearing Impariment; ID - Intellectual Disability; MD - Multiple disabilities; OI - Orthopedic impairment; OHI - Other health impairment; SLD - Specific learning disability; SLI - Speech or language impairment; TBI - Traumatic brain injury; VI - Visual impairment 13 | K12Student,Enrollment,Gifted and Talented Indicator,gifted,Yes - Yes; No - No; Unknown - Unknown,1 - Yes;0 - No;NA - Not Selected 14 | K12Student,Economically Disadvantaged,Economic Disadvantage Status,frpl,Yes - Yes;No - No,1 - Yes;0 - No;NA - Not Selected 15 | K12Student,Disability,IDEA Indicator,iep,Yes - Yes; No - No,1 - Yes;0 - No 16 | K12Student,Identity,Student Identifier,sid,xxxx 17 | K12Student,Demographic,Sex,sex,Male - Male;Female - Female;NotSelected - Not Selected 18 | K12Student,Demographic,,age,, 19 | K12Student,Enrollment,Entry Grade Level,grade_level,IT - Infant/toddler;PR - Preschool;PK - Prekindergarten;TK - Transitional Kindergarten;KG - Kindergarten;01 - First grade;02 - Second grade;03 - Third grade;04 - Fourth grade;05 - Fifth grade;06 - Sixth grade;07 - Seventh grade;08 - Eigth grade;09 - Ninth grade;10 - Tenth grade;11 - Eleventh grade;12 - Twelfth grade;13 - Grade 13;PS - Postsecondary;UG - Ungraded;Other - Other,-9 - Ungraded;-1 - Prekindergarten;-1 - Transitional Kindergarten;-1 - Preschool;0 - Kindergarten;1 - First grade;2 - Second grade;3 - Third grade;4 - Fourth grade;5 - Fifth grade;6 - Sixth grade;7 - Seventh grade;8 - Eigth grade;9 - Ninth grade;10 - Tenth grade;11 - Eleventh grade;12 - Twelfth grade;13 - Grade 13;13 - Postsecondary 20 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | #' @importFrom stats rbinom runif pnorm setNames coef rnbinom vcov simulate update 2 | #' rnorm rpois sd sigma terms time reshape 3 | #' @importFrom stats na.omit 4 | #' @importFrom utils data head tail write.csv zip 5 | #' @importFrom methods new 6 | 7 | .onAttach = function(...) { 8 | if (!interactive()) return() 9 | msg = "Welcome to OpenSDP." # nocov 10 | packageStartupMessage(paste(strwrap(msg), collapse = "\n")) # nocov 11 | } 12 | 13 | 14 | utils::globalVariables(c("Sex", "iep", "Race", "frpl", "age")) 15 | 16 | 17 | .onLoad <- function(libname = find.package("OpenSDPsynthR"), pkgname = "OpenSDPsynthR"){ 18 | # CRAN Note avoidance 19 | if(getRversion() >= "2.15.1") 20 | utils::globalVariables( 21 | # from simpop and subset function calls 22 | c("sid", "grad", "first_flag", "schools_race_prob", "White", 23 | "initschid", "Lodgment_method", "PHI_Ind", "Sw_amt", "Alow_ben_amt", 24 | "ontrack_yr1", "grade", "math_ss", "rdg_ss", 25 | "ontrack_yr4", "cum_credits_yr1", "cum_credits_yr4", "cum_credits_yr1_ela", 26 | "cum_credits_yr4_ela", "cum_credits_yr1_math", "cum_credits_yr4_math", 27 | "cum_gpa_yr1", "cum_gpa_yr4", "yr_seq", 28 | "scale_gpa", "grad_prob", "ps_prob", "ps", 29 | "cum_credits", "credits_earned", "credits_attempted", "grad", 30 | "hs_status", "status_after", "event", 31 | "ps_transfer", "opeid", "ps_change_ind", "short_name", 32 | "type", 33 | # we use the magrittr pipe 34 | ".", 35 | # CLEANER 36 | "grade", "sid", "schid", "gifted", "chrt_ninth", "name", "keep", "math_ss", 37 | "rdg_ss", "test_math_8_std", "test_ela_8_std", "test_composite_8", 38 | "yr_seq", "ontrack", "cum_credits", "cum_credits_ela", "cum_credits_math", 39 | "cum_gpa", "status_after", "scale_gpa", "gpa", "grad_prob", "grad", 40 | "hs_status", "ps_prob", "ps", "diploma_type", "class_rank", "opeid", 41 | "first_ps", "last_ps", "chrt_grad", "variable", "value", "temp", 42 | "1_enrl_1oct_grad", "2_enrl_1oct_grad", "3_enrl_1oct_grad", "4_enrl_1oct_grad", 43 | "5_enrl_1oct_grad", "1_enrl_1oct_ninth", "2_enrl_1oct_ninth", "3_enrl_1oct_ninth", 44 | "4_enrl_1oct_ninth", "5_enrl_1oct_ninth", "enrl_ever_w2_grad_any", 45 | "enrl_ever_w2_ninth_any", "ps_type", "enrl_1oct_grad", "enrl_1oct_ninth", 46 | "enrl_ever_w2_ninth", "enrl_ever_w2_grad", "enrl_ever_w2_grad_2yr", 47 | "enrl_ever_w2_grad_4yr", "enrl_ever_w2_ninth_2yr", "enrl_ever_w2_ninth_4yr", 48 | "term", "ps_short_name", "enroll_count", "all4", "persist", "chrt", 49 | "observed", 50 | # control parameters 51 | "race_list", "frl_list", "school_list", "gifted_list", 52 | # more simpop 53 | "flag", "grade_diff", "cohort_year", "subject", "assess_id", "score", 54 | "ntests", "schid", "cohort_grad_year" 55 | ) 56 | ) 57 | invisible() 58 | } 59 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(age_calc) 4 | export(assign_baseline) 5 | export(assign_grade) 6 | export(assign_hs_outcomes) 7 | export(assign_schools) 8 | export(better_sim.lm) 9 | export(ceds_cleaner) 10 | export(cond_prob) 11 | export(diag_offset) 12 | export(expand_grid_df) 13 | export(fit_series) 14 | export(gamma_GK) 15 | export(gen_annual_gpa) 16 | export(gen_annual_status) 17 | export(gen_assess) 18 | export(gen_credits) 19 | export(gen_gpa) 20 | export(gen_grad) 21 | export(gen_hs_annual) 22 | export(gen_initial_status) 23 | export(gen_nsc) 24 | export(gen_ontrack) 25 | export(gen_outcome_model) 26 | export(gen_ps) 27 | export(gen_ps_enrollment) 28 | export(gen_schools) 29 | export(gen_student_years) 30 | export(gen_students) 31 | export(get_baseline) 32 | export(get_code_values) 33 | export(get_sim_groupvars) 34 | export(grade_transitions) 35 | export(group_rescale) 36 | export(make_binary_series) 37 | export(make_inds) 38 | export(make_markov_series) 39 | export(map_CEDS) 40 | export(markov_cond_list) 41 | export(num_clip) 42 | export(num_grade) 43 | export(rand_vect_cont) 44 | export(recode_credits) 45 | export(recode_options) 46 | export(rescale) 47 | export(rescale_gpa) 48 | export(school_transitions) 49 | export(sdp_cleaner) 50 | export(sim_control) 51 | export(simpop) 52 | export(tidy_sequence) 53 | export(tm_convert) 54 | export(unscale) 55 | export(validate_probability_list) 56 | export(validate_sim_parameter) 57 | export(zeroNA) 58 | import(dplyr) 59 | import(lme4) 60 | importFrom(dplyr,left_join) 61 | importFrom(lazyeval,interp) 62 | importFrom(lubridate,year) 63 | importFrom(magrittr,"%<>%") 64 | importFrom(magrittr,"%>%") 65 | importFrom(markovchain,createSequenceMatrix) 66 | importFrom(markovchain,markovchainFit) 67 | importFrom(markovchain,markovchainSequence) 68 | importFrom(methods,new) 69 | importFrom(mvtnorm,rmvnorm) 70 | importFrom(purrr,map) 71 | importFrom(simglm,sim_glm) 72 | importFrom(simglm,sim_reg) 73 | importFrom(stats,coef) 74 | importFrom(stats,na.omit) 75 | importFrom(stats,pnorm) 76 | importFrom(stats,rbinom) 77 | importFrom(stats,reshape) 78 | importFrom(stats,rlnorm) 79 | importFrom(stats,rnbinom) 80 | importFrom(stats,rnorm) 81 | importFrom(stats,rpois) 82 | importFrom(stats,runif) 83 | importFrom(stats,sd) 84 | importFrom(stats,setNames) 85 | importFrom(stats,sigma) 86 | importFrom(stats,simulate) 87 | importFrom(stats,terms) 88 | importFrom(stats,time) 89 | importFrom(stats,update) 90 | importFrom(stats,vcov) 91 | importFrom(stringr,str_trunc) 92 | importFrom(tidyr,crossing) 93 | importFrom(tidyr,gather) 94 | importFrom(utils,data) 95 | importFrom(utils,head) 96 | importFrom(utils,tail) 97 | importFrom(utils,write.csv) 98 | importFrom(utils,zip) 99 | importFrom(wakefield,dob) 100 | importFrom(wakefield,level) 101 | importFrom(wakefield,race) 102 | importFrom(wakefield,sex) 103 | -------------------------------------------------------------------------------- /tools/example_collab.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Example Collaboration" 3 | author: "Jared Knowles" 4 | date: "April 14, 2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, echo=TRUE, results='hide', message=FALSE, warning=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(OpenSDPsynthR) 11 | simouts <- simpop(nstu = 1250L, seed = 488234, 12 | control = sim_control(nschls = 9L, 13 | minyear=1997, maxyear=2013)) 14 | 15 | ``` 16 | 17 | ## Identify 18 | 19 | ```{r identifyElements} 20 | names(simouts$assessment) 21 | names(simouts$demog_master) 22 | names(simouts$stu_year) 23 | ``` 24 | 25 | ## Clean 26 | 27 | TBD 28 | 29 | ## Connect 30 | 31 | ```{r kwikconnect} 32 | analyze_table <- left_join(simouts$demog_master, simouts$assessment, 33 | by = "sid") 34 | analyze_table <- left_join(analyze_table, simouts$stu_year, 35 | by = c("sid", "year")) 36 | 37 | analyze_table %>% group_by(sid) %>% 38 | summarize(nobs = n()) %>% select(-sid) %>% summary 39 | 40 | analyze_table <- analyze_table %>% filter(grade.x %in% 41 | c("3", "4", "5", "6", "7")) 42 | 43 | ``` 44 | 45 | 46 | ## Analyze 47 | 48 | ```{r quickmodels} 49 | 50 | # simple 51 | 52 | simp_mod <- lm(math_ss ~ Race + grade.x + frpl + ell + iep + gifted, 53 | data = analyze_table) 54 | 55 | summary(simp_mod) 56 | 57 | # school effect 58 | 59 | sch_mod <- lm(math_ss ~ Race + grade.x + frpl + ell + iep + gifted + 60 | schid.x, 61 | data = analyze_table) 62 | 63 | summary(sch_mod) 64 | 65 | ``` 66 | 67 | 68 | 69 | ## Report 70 | 71 | ```{r} 72 | library(broom) 73 | library(ggplot2) 74 | 75 | plot_mod <- tidy(sch_mod) 76 | plot_mod$model <- "school" 77 | tmp <- tidy(simp_mod) 78 | tmp$model <- "simple" 79 | plot_mod <- bind_rows(plot_mod, tmp); rm(tmp) 80 | 81 | 82 | ggplot(plot_mod[plot_mod$term %in% c("frpl1", "ell1", "iep1"), ], 83 | aes(x = term, y = estimate, ymin = estimate-std.error, 84 | ymax = estimate+std.error, group = model, color = model)) + 85 | geom_linerange(position = position_dodge(width = 1), size = 2) + 86 | theme_bw() + geom_hline(yintercept = 0, color = I("red")) + 87 | theme(legend.position = "bottom") 88 | 89 | 90 | 91 | 92 | ``` 93 | 94 | 95 | ```{r schoolEffects} 96 | ggplot(plot_mod[plot_mod$term %in% c("schid.x1", "schid.x2", 97 | "schid.x3", "schid.x4", 98 | "schid.x5", "schid.x6", 99 | "schid.x7", "schid.x8", 100 | "schid.x9"), ], 101 | aes(x = term, y = estimate, ymin = estimate-std.error, 102 | ymax = estimate+std.error)) + 103 | geom_linerange(size = 2) + 104 | theme_bw() + geom_hline(yintercept = 0, color = I("red")) + 105 | theme(legend.position = "bottom") 106 | 107 | ``` 108 | 109 | -------------------------------------------------------------------------------- /R/gamma_gk.R: -------------------------------------------------------------------------------- 1 | ##' Estimate the Goodman and Kruskal gamma statistic 2 | ##' 3 | ##' Estimate the correlation between two unordered factor variables using the Goodman and Kruskal gamma statistic 4 | ##' 5 | ##' @param x an unordered factor variable 6 | ##' @param y an unordered factor variable 7 | ##' @param print a logical vector indicating whether results should be printed to the console 8 | ##' @return A named list with gamma, standard error of gamma, p-value of gamma, and statistical significance 9 | ##' @note Yadda yadda yadda 10 | ##' @export 11 | ##' @author Jared E. Knowles 12 | ##' @references Adapted from Simon Jackman from: \url{http://jackman.stanford.edu/classes/151B/06/class0517.r} 13 | gamma_GK <- function(x, y = NULL, print = FALSE){ 14 | concordant <- function(x){ 15 | ## get sum(matrix values > r AND > c) 16 | ## for each matrix[r, c] 17 | mat.lr <- function(r,c){ 18 | lr <- x[(r.x > r) & (c.x > c)] 19 | sum(lr) 20 | } 21 | 22 | ## get row and column index for each 23 | ## matrix element 24 | r.x <- row(x) 25 | c.x <- col(x) 26 | 27 | ## return the sum of each matrix[r, c] * sums 28 | ## using mapply to sequence thru each matrix[r, c] 29 | sum(x * mapply(mat.lr, r = r.x, c = c.x)) 30 | } 31 | 32 | discordant <- function(x){ 33 | ## get sum(matrix values > r AND < c) 34 | ## for each matrix[r, c] 35 | mat.ll <- function(r,c){ 36 | ll <- x[(r.x > r) & (c.x < c)] 37 | sum(ll) 38 | } 39 | 40 | ## get row and column index for each 41 | ## matrix element 42 | r.x <- row(x) 43 | c.x <- col(x) 44 | 45 | ## return the sum of each matrix[r, c] * sums 46 | ## using mapply to sequence thru each matrix[r, c] 47 | sum(x * mapply(mat.ll, r = r.x, c = c.x)) 48 | } 49 | 50 | if(is.table(x) | is.matrix(x)){ 51 | c <- concordant(x) 52 | d <- discordant(x) 53 | n <- sum(x) 54 | } 55 | else{ 56 | tab <- table(x,y) 57 | c <- concordant(tab) 58 | d <- discordant(tab) 59 | n <- sum(tab) 60 | } 61 | gamma <- (c - d) / (c + d) 62 | 63 | arg <- (c+d)/(n*(1-(gamma^2))) 64 | stdError <- 1/sqrt(arg) 65 | z <- gamma/stdError 66 | if(print==TRUE){ 67 | cat("Goodman-Kruskal gamma statistic:\n") 68 | cat(paste("Concordant Pairs",c,"\n")) 69 | cat(paste("Discordant Pairs",d,"\n\n")) 70 | cat(paste("Estimate of gamma:", 71 | signif(gamma,.Options$digits), 72 | "Standard error:", 73 | signif(stdError,.Options$digits), 74 | "\n\n")) 75 | 76 | cat(paste("H0: gamma = 0 vs HA: two-sided\n")) 77 | cat(paste("z:", 78 | signif(z, .Options$digits), 79 | "p-value:", 80 | signif(2*(1-pnorm(abs(z))), .Options$digits), 81 | "\n\n")) 82 | if(c<51 | d<51){ 83 | cat("Warning: p-values are based on a normal approximation to the\n") 84 | cat("sampling distribution of the z test statistic, which is commonly\n") 85 | cat("considered to be good only if C and D are both > 50.\n") 86 | } 87 | } 88 | return(list(gamma = signif(gamma,.Options$digits), 89 | se = signif(stdError,.Options$digits), 90 | z =signif(z, .Options$digits), 91 | sig = signif(2*(1-pnorm(abs(z))), .Options$digits))) 92 | invisible(NULL) 93 | } 94 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at opensdp@gse.harvard.edu. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /R/diagnostics.R: -------------------------------------------------------------------------------- 1 | # # # # # # Simulation diagnostics 2 | # # # # # 3 | # # # # # # Ever FRL percentage 4 | # # # 5 | # # simouts$stu_year %>% group_by(sid) %>% 6 | # # summarize(everFRL = ifelse(any(frpl == "1"), 1, 0)) %>% 7 | # # select(everFRL) %>% unlist %>% table 8 | # # 9 | # # simouts$stu_year %>% group_by(sid) %>% 10 | # # summarize(everELL = ifelse(any(ell == "1"), 1, 0)) %>% 11 | # # select(everELL) %>% unlist %>% table 12 | # # 13 | # # simouts$stu_year %>% group_by(sid) %>% 14 | # # summarize(everIEP = ifelse(any(iep == "1"), 1, 0)) %>% 15 | # # select(everIEP) %>% unlist %>% table 16 | # # 17 | # # 18 | # # simouts$stu_year %>% group_by(sid) %>% 19 | # # summarize(everGifted = ifelse(any(gifted == "1"), 1, 0)) %>% 20 | # # select(everGifted) %>% unlist %>% table 21 | # # 22 | # # table(simouts$stu_year$enrollment_status) 23 | # # table(simouts$stu_year$grade_advance) 24 | # 25 | # # ggplot(assess, aes(x = math_ss, y = math_ssb, group = frpl, color = frpl)) + 26 | # # geom_point(alpha = I(0.3)) + geom_smooth(se = FALSE) + 27 | # # geom_abline(slope = 1, intercept= 0) 28 | # # ggplot(assess, aes(x = math_ssb, group = frpl, color = frpl)) + 29 | # # geom_density(alpha = I(0.3)) + facet_wrap(~age) 30 | # # assess$math_ssc <- mapply(perturb_race, assess$math_ssb, assess$Race, assess$math_sd) 31 | # 32 | # # 33 | # simoutsA <- simpop(nstu = 1000L, seed = 488234, 34 | # control = sim_control(nschls = 3L, minyear=1990, 35 | # maxyear=2010)) 36 | # 37 | # # Document that these race codes need to be replace din perturb functions and the like 38 | # simoutsB <- simpop(nstu = 1000L, seed = 488234, 39 | # control = sim_control(nschls = 3L, race_groups = c("Black", "White", "Hispanic"), 40 | # race_prob = c(0.3, 0.6, 0.1))) 41 | 42 | # mod_sim <- do.call(gen_outcome_model, sim_control()$ps_sim_parameters) 43 | # mod_sim$sim_model@resp$family$linkinv(unlist(ranef(mod_sim$sim_model)$clustID)) 44 | 45 | # 46 | # simoutsC <- simpop(nstu = 1000L, seed = 488234, 47 | # control = sim_control(nschls = 3L, n_cohorts = 5L)) 48 | 49 | # tm <- matrix( 50 | # c(800, 20, 5, 800), 51 | # nrow = 2, 52 | # byrow = TRUE, 53 | # dimnames = list(c("1", "0"), c("1", "0")) 54 | # ) 55 | # mytm <- tm_convert(tm) 56 | # myGL <- list(GROUPVARS = "ALL", 57 | # "ALL" = list(f = make_markov_series, 58 | # pars = list(tm = mytm)) 59 | # ) 60 | # 61 | # 62 | # simoutsD <- simpop(nstu = 1000L, seed = 488234, 63 | # control = sim_control(nschls = 3L, 64 | # gifted_list = myGL)) 65 | # 66 | # 67 | # 68 | # 69 | # myGradSim <- list( 70 | # fixed = ~ 1 + math_ss + scale_gpa + gifted + iep + frpl + ell + male, 71 | # random_var = 0.8948, 72 | # cov_param = list( 73 | # dist_fun = c("rnorm", "rnorm", rep("rbinom", 5)), 74 | # var_type = rep("lvl1", 7), 75 | # opts = list( 76 | # list(mean = 0, sd = 1), 77 | # list(mean = 0, sd = 1), 78 | # list(size = 1, prob = 0.1), 79 | # list(size = 1, prob = 0.2), 80 | # list(size = 1, prob = 0.45), 81 | # list(size = 1, prob = 0.1), 82 | # list(size = 1, prob = 0.47) 83 | # ) 84 | # ), 85 | # cor_vars = c( 86 | # 0.5136, 0.453, -0.276, -0.309, -0.046, -0.033, 87 | # 0.2890, -0.1404, -0.2674, -0.0352,-0.1992, 88 | # -0.1354, -0.2096, -0.0305, -0.0290, 89 | # 0.1433, -0.0031, 0.1269, 90 | # 0.0601, 0.0066, 91 | # 0.0009 92 | # ), 93 | # fixed_param = c( 94 | # 1.6816, 0.30764, 1.05872, -0.07352, -0.07959, 95 | # -0.331647,-0.22318254, 0.0590 96 | # ), 97 | # ngrps = nschls + 5, 98 | # unbalanceRange = c(100, 1500) 99 | # ) 100 | # 101 | # simouts <- simpop(nstu = 4000L, seed = 53232, 102 | # control = sim_control(nschls = 12L, 103 | # grad_sim_parameters = )) 104 | -------------------------------------------------------------------------------- /R/markov_transitions.R: -------------------------------------------------------------------------------- 1 | #' Create a series of binary variables based on a transition matrix 2 | #' 3 | #' @param n an integer representing the length of the series 4 | #' @param tm a Transition Matrix that describes the transition matrix 5 | #' @param burnin integer, number of values to simulate before drawing from the series 6 | #' @param ... additional arguments to apss to \code{\link{markovchainSequence}} 7 | #' @return a vector of length \code{n} with a random series of 0 and 1 generated 8 | #' from the \code{tm} 9 | #' @description Generate a 10 | #' @details A Transition Matrix is a 2x2 matrix where: 11 | #' \itemize{ 12 | #' \item Element 1, 1 is the probability of moving to state 0 conditional on being in state 0 13 | #' \item Element 2, 1 is the probability of moving to state 0 conditional on being in state 1 14 | #' \item Element 1, 2 is the probability of moving to state 1 conditional on being in state 0 15 | #' \item Element 2, 2 is the probability of moving to state 1 conditional on being in state 1 16 | #' } 17 | #' @source \itemize{ 18 | #' \item \url{http://stats.stackexchange.com/questions/14175/how-to-generate-random-auto-correlated-binary-time-series-data} 19 | #' \item \url{https://en.wikipedia.org/wiki/Examples_of_Markov_chains} 20 | #' } 21 | #' @importFrom markovchain markovchainSequence 22 | #' @export 23 | #' @examples 24 | #' make_markov_series(10, matrix(c(0.6,0.4,0.9,0.1), nrow=2, byrow=TRUE)) 25 | make_markov_series <- function(n, tm, burnin = NULL, ...){ 26 | stopifnot(is.matrix(tm)) 27 | stopifnot(n > 0) 28 | if(any(tm > 1)){ 29 | warning("TM elements exceed 1, adjusting by dividing by rowSums") 30 | tm <- tm / rowSums(tm) 31 | } 32 | mc <- new("markovchain", transitionMatrix = tm) 33 | if(!is.null(burnin)){ 34 | series <- markovchainSequence(n+burnin, mc, ...) 35 | series <- series[burnin:(n+burnin)] 36 | } else{ 37 | series <- markovchainSequence(n, mc, ...) 38 | } 39 | return(series) 40 | } 41 | 42 | #' Create an autocorrelated binary series 43 | #' 44 | #' @param n integer, length of series to generate 45 | #' @param mean numeric between 0 and 1, proportion of cases with value 1 46 | #' @param corr numeric between -1 and 1, how correlated should series be? 47 | #' @description Generate a binary series that is autocorrelated using the 48 | #' Markov method from \code{\link{make_markov_series}} 49 | #' 50 | #' @return A binary series 51 | #' @export 52 | #' @examples 53 | #' make_binary_series(n=12,mean=0.5,corr=0.9) 54 | #' make_binary_series(n=100,mean=0.5,corr=0.1) 55 | make_binary_series <- function(n = 100, mean = 0.5, corr = 0){ 56 | p01 <- corr * (1 - mean) / mean 57 | tm <- matrix(c(1 - p01, p01, corr, 1 - corr), nrow=2, byrow=T) 58 | tm <- matrix(c(0.2, 1.24, 1.3, -0.4), nrow=2, byrow=T) 59 | if(any(tm > 1 | tm < 0)){ 60 | tm[1, ] <- prop.table(sqrt(tm[1, ]^2)) 61 | tm[2, ] <- prop.table(sqrt(tm[1, ]^2)) 62 | } 63 | tm <- t(apply(tm, 1, function(x)(x-min(x))/(max(x)-min(x)))) 64 | make_markov_series(n, tm) 65 | } 66 | 67 | 68 | #' Identify the parameters that define a series of binary outcomes 69 | #' 70 | #' @param series a vector of 0 and 1 values 71 | #' @param return a character with two options, matrix returns a transition 72 | #' matrix, "fit" returns a \code{\link{markovchain}} object 73 | #' @param ... additional arguments to pass to \code{\link{markovchainFit}} 74 | #' @return Either a transition matrix or a list with parameters mean and cor 75 | #' defining the transitions in the vector 76 | #' @importFrom markovchain markovchainFit 77 | #' @export 78 | #' @examples 79 | #' series <- make_markov_series(10, matrix(c(0.3, 0.7, 0.25, 0.75), 80 | #' nrow = 2, byrow =TRUE)) 81 | #' fit_series(series) 82 | fit_series <- function(series, return = c("matrix", "fit"), ...){ 83 | if(missing(return)){ 84 | return <- "matrix" 85 | } 86 | # TODO check to ensure this coerces into a true transition matrix 87 | # seqMat <- createSequenceMatrix(series) 88 | out <- markovchainFit(series, ...) 89 | if(return == "matrix"){ 90 | return(out$estimate) 91 | } else if(return == "fit") { 92 | return(out) 93 | } 94 | } 95 | 96 | 97 | -------------------------------------------------------------------------------- /tests/testthat/test-markov.R: -------------------------------------------------------------------------------- 1 | 2 | # make_markov_series 3 | 4 | context("Test Markov Generator") 5 | 6 | test_that("Markov Chain works", { 7 | statesNames <- c("No", "Yes") 8 | tm <- matrix(c(800, 10, 200, 200), nrow = 2, byrow = TRUE, 9 | dimnames = list(statesNames, statesNames)) 10 | expect_warning(make_markov_series(10, tm = tm)) 11 | expect_error(make_markov_series(0, tm = tm)) 12 | tm <- tm / rowSums(tm) 13 | expect_equal(length(make_markov_series(10, tm = tm)), 10) 14 | expect_equal(length(make_markov_series(10, tm = tm, t0 = "Yes")), 10) 15 | expect_equal(length(make_markov_series(10, tm = tm, t0 = "Yes", 16 | include.t0 = TRUE)), 11) 17 | expect_equal(length(make_markov_series(10, tm = tm, t0 = "No", 18 | include.t0 = TRUE)), 11) 19 | expect_true(all(make_markov_series(10, tm = tm, 20 | t0 = "Yes", include.t0 = TRUE) %in% statesNames)) 21 | }) 22 | 23 | 24 | # Test accuracy 25 | 26 | set.seed(458397) 27 | 28 | test_fun <- function(basetm, runLength, ...){ 29 | zzz <- make_markov_series(runLength, tm = basetm) 30 | out <- fit_series(zzz, return = "fit", ...) 31 | return(all(c(all(out$lowerEndpointMatrix < basetm), 32 | all(basetm < out$upperEndpointMatrix)))) 33 | } 34 | 35 | test_that("Markov series are correct", { 36 | dumbTM <- structure(c(0.5, 0.5, 0.5, 0.5), .Dim = c(2L, 2L), .Dimnames = list( 37 | c("No", "Yes"), c("No", "Yes"))) 38 | zzz <- replicate(500, test_fun(basetm = dumbTM, runLength = 75)) 39 | per_true <- length(zzz[zzz == TRUE]) / length(zzz) 40 | expect_true(0.87 < per_true) 41 | expect_true(0.92 > per_true) 42 | statesNames <- c("No", "Yes") 43 | dumbTM2 <- matrix(c(0.5, 0.5, 0.5, 0.5), nrow = 2, byrow = TRUE, 44 | dimnames = list(statesNames, statesNames)) 45 | zzy <- replicate(500, test_fun(basetm = dumbTM2, runLength = 10, 46 | possibleStates = statesNames)) 47 | per_true2 <- length(zzy[zzy == TRUE]) / length(zzy) 48 | expect_true(per_true2 < per_true) 49 | }) 50 | 51 | test_that("fit series respects user input", { 52 | dumbTM <- structure(c(0.5, 0.5, 0.5, 0.5), .Dim = c(2L, 2L), .Dimnames = list( 53 | c("No", "Yes"), c("No", "Yes"))) 54 | zzz <- make_markov_series(20, tm = dumbTM) 55 | out <- fit_series(zzz, return = "fit") 56 | expect_is(out$estimate, "markovchain") 57 | expect_is(out, "list") 58 | expect_equal(out$confidenceLevel, 0.95) 59 | out <- fit_series(zzz, return = "fit", confidencelevel = 0.9) 60 | expect_equal(out$confidenceLevel, 0.9) 61 | out <- fit_series(zzz, return = "matrix") 62 | expect_is(out, "markovchain") 63 | }) 64 | 65 | # fit_series 66 | 67 | # outList <- replicate(25, createAutocorBinSeries(n=10,mean=0.4,corr=0.8), 68 | # simplify = "array") 69 | # 70 | # outList <- apply(outList, 2, function(x) table(paste0(head(x, -1), tail(x,-1)))) 71 | # 72 | # map(outList, function(x) sum(x[names(x) == "00"])) %>% reduce_right(sum) 73 | # map(outList, function(x) sum(x[names(x) == "10"])) %>% reduce_right(sum) 74 | # map(outList, function(x) sum(x[names(x) == "01"])) %>% reduce_right(sum) 75 | # map(outList, function(x) sum(x[names(x) == "11"])) %>% reduce_right(sum) 76 | # 77 | # apply(outList, 2, findTransitions) 78 | 79 | # 80 | # out <- findTransitions(series) 81 | # 82 | # p01=corr*(1-mean)/mean 83 | # createSeries(n,matrix(c(1-p01,p01,corr,1-corr),nrow=2,byrow=T)) 84 | # 85 | # series <- createAutocorBinSeries(n=5000, mean=0.1, corr=0.8) 86 | # out <- findTransitions(series) 87 | # out <- findTransitions(series, return = "simple") 88 | # mapply(function(x, y) (x*(1-y) / y), seq(0, 1, 0.1), seq(0, 1, 0.1)) 89 | # mapply(function(x, y) (x*(1-y) / y), seq(1, 0, -0.1), seq(1, 0, -0.1)) 90 | # mapply(function(x, y) (x*(1-y) / y), seq(0, 1, 0.1), seq(1, 0, -0.1)) 91 | # mapply(function(x, y) (x*(1-y) / y), seq(1, 0, -0.1), seq(0, 1, 0.1)) 92 | # p01=corr*(1-mean)/mean 93 | # 94 | # testdf <- expand.grid(mean = seq(0.01, 0.99, 0.1), corr = seq(0.01, 0.99, 0.1)) 95 | # testdf$p01 <- mapply(function(x, y) (x*(1-y) / y), testdf$corr, testdf$mean) 96 | # 97 | # createSeries(100 , matrix(c(1-90, 90, 0.91, .09),nrow=2,byrow=T)) 98 | -------------------------------------------------------------------------------- /tests/testthat/test-mapCEDS.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # map_CEDS 4 | # recode_options 5 | 6 | # Gen fake data 7 | 8 | context("Test CEDS variable mapping") 9 | 10 | test_that("Error is appropriate when supplying dataframe...", { 11 | SDP <- data.frame("sid" = NA, "ell" = NA, "iep" = NA, "frpl" = NA) 12 | expect_error(map_CEDS(SDP), "passing or reassigning names(object)", fixed=TRUE) 13 | SDP <- as.tbl(SDP) 14 | expect_error(map_CEDS(SDP), "Please supply a character vector of names to be converted.", fixed=TRUE) 15 | }) 16 | 17 | test_that("Check names return dimensions", { 18 | allSDPnames <- c("dob", "white", "asian", "amerind", "black", "hispanic", 19 | "multiracial", "hawaiian_pi", "other", "ell", "disab_type", 20 | "gifted", "frpl", "iep", "sid", "sex", "age") 21 | cleaned <- map_CEDS(allSDPnames) 22 | expect_equal(length(allSDPnames), length(cleaned)) 23 | allCEDSnames <- c("Birthdate", "White", "Asian", "American Indian or Alaska Native", 24 | "Black or African American", "Hispanic or Latino Ethnicity", 25 | "Demographic Race Two or More Races", 26 | "Native Hawaiian or Other Pacific Islander", 27 | "", "Limited English Proficiency Status", 28 | "Primary Disability Type", 29 | "Gifted and Talented Indicator", "Economic Disadvantage Status", 30 | "IDEA Indicator", "Student Identifier", "Sex") 31 | cleaned <- map_CEDS(allCEDSnames) 32 | expect_equal(length(allCEDSnames), length(cleaned)) 33 | }) 34 | 35 | test_that("Errors make sense when column names are mixed", { 36 | mixNames <- c("Birthdate", "white", "asian", "black", "hispanic", 37 | "Primary Disability Type") 38 | expect_message(map_CEDS(mixNames), "Not all names successfully matched.", fixed = TRUE) 39 | expect_message(map_CEDS(mixNames), "Returning NA for those not matched.", fixed = TRUE) 40 | expect_message(map_CEDS(mixNames), "Majority of names match CEDS names", fixed = TRUE) 41 | expect_message(map_CEDS(mixNames), "returning CEDS names", fixed = TRUE) 42 | expect_message(map_CEDS(mixNames), "Both SDP and CEDS names detected.", fixed = TRUE) 43 | cleaned <- map_CEDS(mixNames) 44 | expect_equal(length(cleaned[is.na(cleaned)]), 2) 45 | expect_equal(length(cleaned[!is.na(cleaned)]), 4) 46 | expect_identical(cleaned[!is.na(cleaned)], c("White", "Asian", "Black or African American", 47 | "Hispanic or Latino Ethnicity")) 48 | mixNames <- c("Birthdate", "White", "Asian", "black", "hispanic", 49 | "Primary Disability Type") 50 | expect_message(map_CEDS(mixNames), "Not all names successfully matched.", fixed = TRUE) 51 | expect_message(map_CEDS(mixNames), "Returning NA for those not matched.", fixed = TRUE) 52 | expect_message(map_CEDS(mixNames), "Majority of names match SDP names", fixed = TRUE) 53 | expect_message(map_CEDS(mixNames), "returning SDP names", fixed = TRUE) 54 | expect_message(map_CEDS(mixNames), "Both SDP and CEDS names detected.", fixed = TRUE) 55 | cleaned <- map_CEDS(mixNames) 56 | expect_equal(length(cleaned[is.na(cleaned)]), 2) 57 | expect_equal(length(cleaned[!is.na(cleaned)]), 4) 58 | expect_identical(cleaned[!is.na(cleaned)], c("dob", "white", "asian", "disab_type")) 59 | }) 60 | 61 | test_that("get_code_values extracts values", { 62 | test_string <- c("Yes - Yes;No - No;NotSelected - Not Selected") 63 | codes <- get_code_values(test_string) 64 | expect_equal(length(codes), 2) 65 | expect_identical(names(codes), c("values", "labels")) 66 | expect_null(names(codes$values)) 67 | expect_null(names(codes$labels)) 68 | expect_equal(length(codes$values), 3) 69 | expect_equal(length(codes$labels), 3) 70 | expect_identical(codes$labels, c("Yes", "No", "Not Selected")) 71 | expect_identical(codes$values, c("Yes", "No", "NotSelected")) 72 | bad_string <- c("Yes - Yes,No - No,NotSelected - Not Selected") 73 | codes <- get_code_values(bad_string) 74 | 75 | }) 76 | 77 | test_that("make_inds makes indicator variables", { 78 | testDat <- data.frame(id = 1:30, 79 | group = sample(letters[1:5], 30, replace=TRUE)) 80 | outDat <- make_inds(testDat, col = "group") 81 | expect_true(ncol(outDat) > ncol(testDat)) 82 | expect_equal(ncol(outDat), 7) 83 | expect_equal(nrow(testDat), nrow(outDat)) 84 | expect_identical(names(outDat)[3:7], letters[1:5]) 85 | expect_identical(names(outDat)[1:2], names(testDat)[1:2]) 86 | }) 87 | # 88 | # test_that("recode_options successfully recodes", { 89 | # 90 | # 91 | # 92 | # }) 93 | 94 | # recode_options 95 | -------------------------------------------------------------------------------- /man/sim_control.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sim_control.R 3 | \name{sim_control} 4 | \alias{sim_control} 5 | \title{Set control parameters for simulated data} 6 | \usage{ 7 | sim_control(nschls = 2L, best_school = NULL, race_groups = NULL, 8 | race_prob = NULL, ses_list = NULL, minyear = 2002, 9 | maxyear = 2017, n_cohorts = 8L, gifted_list = NULL, 10 | iep_list = NULL, ell_list = NULL, ps_transfer_list = NULL, 11 | grade_levels = NULL, school_means = NULL, school_cov_mat = NULL, 12 | school_names = NULL, postsec_names = NULL, 13 | gpa_sim_parameters = NULL, grad_sim_parameters = NULL, 14 | ps_sim_parameters = NULL, assess_sim_par = NULL, 15 | assessment_adjustment = NULL, grad_adjustment = NULL, 16 | ps_adjustment = NULL, gpa_adjustment = NULL, assess_grades = NULL, 17 | n_postsec = 35L, postsec_method = "scorecard") 18 | } 19 | \arguments{ 20 | \item{nschls}{integer- number of schools to create, default is 2} 21 | 22 | \item{best_school}{character, format is a number, padded with a leading 0, 23 | indicates the school that will be the highest performing} 24 | 25 | \item{race_groups}{character - vector of labels for race groups} 26 | 27 | \item{race_prob}{numerics - same length as \code{race_groups}} 28 | 29 | \item{ses_list}{a probability list defining probabilities of being low 30 | socioeconomic status, see Details} 31 | 32 | \item{minyear}{integer - the first year student records are observed} 33 | 34 | \item{maxyear}{integer - the last year student records are observed} 35 | 36 | \item{n_cohorts}{integer - the number of birth-year cohorts to produce} 37 | 38 | \item{gifted_list}{a probability list defining the probability of being in 39 | a gifted and talented program} 40 | 41 | \item{iep_list}{a probability list defining probabilities of being on an 42 | individualized education plan} 43 | 44 | \item{ell_list}{a probability list defining probabilities for being an English 45 | language learner} 46 | 47 | \item{ps_transfer_list}{a probability list for transferring postsecondary 48 | institutions after enrolling} 49 | 50 | \item{grade_levels}{a probability list for grade promotion and retention} 51 | 52 | \item{school_means}{numeric - a named vector of means for school level attributes} 53 | 54 | \item{school_cov_mat}{matrix - a covariance matrix for the school level attributes} 55 | 56 | \item{school_names}{character - a vector to draw school names from} 57 | 58 | \item{postsec_names}{character - a vector to draw postsecondary institution 59 | names from} 60 | 61 | \item{gpa_sim_parameters}{list - parameters to pass to \code{gen_outcome_model}} 62 | 63 | \item{grad_sim_parameters}{list - parameters to pass to \code{gen_outcome_model}} 64 | 65 | \item{ps_sim_parameters}{list - parameters to pass to \code{gen_outcome_model}} 66 | 67 | \item{assess_sim_par}{list - parameters to pass to \code{gen_outcome_model}} 68 | 69 | \item{assessment_adjustment}{list - parameters to adjust assessment scores by 70 | for bias} 71 | 72 | \item{grad_adjustment}{list - parameters to adjust graduation probabilities by 73 | for bias} 74 | 75 | \item{ps_adjustment}{list - parameters to adjust postsecondary enrollment 76 | probabilities by for bias} 77 | 78 | \item{gpa_adjustment}{list - parameters to adjust gpa for bias} 79 | 80 | \item{assess_grades}{character - grade levels to generate assessment scores for} 81 | 82 | \item{n_postsec}{numeric - number of postsecondary schools to assign to} 83 | 84 | \item{postsec_method}{character - options "scorecard" or NULL} 85 | } 86 | \value{ 87 | a named list 88 | } 89 | \description{ 90 | Set control parameters for simulated data 91 | } 92 | \details{ 93 | This function has a full set of default values that are designed to 94 | produce realistic data. These defaults can be overridden by specifying any 95 | of the arguments to be overridden as an option to the function call. 96 | 97 | There are two unique data structures that are used to set options for simulations. 98 | The first is a \code{probability_list}, a list which defines a grouping factor, and 99 | for each level of the grouping factor a function and parameters to generate 100 | probability distribution from. 101 | 102 | The \code{sim_parameters} data structure defines the parameters for the outcome 103 | simulation. Outcomes are simulated using a simulated multilevel model structure 104 | and this data structure contains the parameters that describe the model and the 105 | error structure of data generated from that model. 106 | 107 | To modify either of these elements, use the \code{\link{validate_probability_list}} 108 | or \code{\link{validate_sim_parameter}} helper functions to ensure that all 109 | of the parameters are defined with valid values. 110 | } 111 | -------------------------------------------------------------------------------- /R/CEDS.R: -------------------------------------------------------------------------------- 1 | #' Map to and from CEDS column names 2 | #' 3 | #' @param user character vector of variable names to match 4 | #' @param category character, a category of CEDS data to match 5 | #' @param CEDS optional 6 | #' 7 | #' @return mapped CEDS names 8 | #' @export 9 | map_CEDS <- function(user, category = NULL, CEDS = NULL){ 10 | if(!all(class(user) %in% c("factor", "character"))){ # is all right conditional? 11 | msg <- "Please supply a character vector of names to be converted. Consider 12 | passing or reassigning names(object)" 13 | stop(msg) 14 | } 15 | # TODO: Create a crosswalk from CEDS to SDP and SDP to CEDS with clearer lookup 16 | sdp <- xwalk$sdp_name[match(user, xwalk$CEDS_name)] 17 | ceds <- xwalk$CEDS_name[match(user, xwalk$sdp_name)] 18 | checkLength <- function(x, y){ 19 | x_l <- length(x[!is.na(x)]) 20 | y_l <- length(y[!is.na(y)]) 21 | if(x_l > 0 & y_l > 0){ 22 | msg <- ("Both SDP and CEDS names detected.") 23 | } else{ 24 | msg <- " " # empty space to allow compound messages below 25 | } 26 | if(x_l > y_l){ 27 | message("Majority of names match SDP names, returning SDP names. ", msg) 28 | return(x) 29 | } else if(y_l > x_l){ 30 | message("Majority of names match CEDS names, returning CEDS names. ", msg) 31 | return(y) 32 | } else { 33 | stop("Equal number of CEDS and SDP names identified, check names.") 34 | } 35 | } 36 | out <- checkLength(sdp, ceds) 37 | if(any(is.na(out))){ 38 | message("Not all names successfully matched. Returning NA for those not matched.") 39 | } 40 | if(all(is.na(out))){ 41 | warning("No names could be matched, please check names for typos.") 42 | out <- NULL 43 | } else{ 44 | return(out) 45 | } 46 | } 47 | 48 | #' Get values of codes from the CEDS Crosswalk List 49 | #' 50 | #' @param x the data.frame character element that contains the codes 51 | #' @importFrom purrr map 52 | #' @importFrom magrittr %>% 53 | #' @export 54 | #' @return a list with the labels and levels properly formatted 55 | get_code_values <- function(x){ 56 | tmp <- unlist(strsplit(x, split = ";")) 57 | tmp <- sapply(tmp, strsplit, split = "[[:punct:]]") 58 | values <- map(tmp, 1) %>% unlist 59 | labels <- map(tmp, 2) %>% unlist 60 | labels <- trimws(labels) 61 | names(labels) <- NULL 62 | values <- trimws(values) 63 | names(values) <- NULL 64 | return(list(values = values, labels = labels)) 65 | } 66 | 67 | # TODO: This should always append 68 | #' Append indicator variables to a data frame based on a single factor variable 69 | #' 70 | #' @param data a data frame 71 | #' @param col character, name of the factor column to generate indicators for 72 | #' @source \url{http://stackoverflow.com/questions/35943455/creating-indicator-variable-columns-in-dplyr-chain} 73 | #' @return a data frame with the factor levels appended as columns 74 | #' @export 75 | make_inds <- function(data, col) { 76 | for(i in col) { 77 | idx <- which(names(data)==i) 78 | v <- data[[idx]] 79 | if(class(v) != "factor"){ 80 | warning("Factor not supplied, coercing...") 81 | v <- factor(v) 82 | } 83 | # stopifnot(class(v)=="factor") 84 | m <- matrix(0, nrow=nrow(data), ncol=nlevels(v)) 85 | m[cbind(seq_along(v), as.integer(v))]<-1 86 | colnames(m) <- paste(levels(v)) 87 | r <- data.frame(m) 88 | # Only need this if you want to drop original column 89 | # if (idx > 1) { 90 | # r <- cbind(data[1:(idx-1)],r) 91 | # } 92 | # if (idx < ncol(data)) { 93 | # r <- cbind(r, data[(idx+1):ncol(data)]) 94 | # } 95 | data <- cbind(data, r) 96 | } 97 | data 98 | } 99 | 100 | #' Recode options 101 | #' 102 | #' @param data a data.frame to recode the variables to CEDS from 103 | #' @param from the data definitions you are recoding from 104 | #' 105 | #' @return the data object, but with all values matching CEDS specification 106 | #' recoded to meet the CEDS specification 107 | #' @export 108 | recode_options <- function(data, from = c("SDP", "CEDS")){ 109 | if(missing(from)){ 110 | from <- "CEDS" 111 | } 112 | stopifnot(all(names(data) %in% xwalk$CEDS_name) | 113 | all(names(data) %in% xwalk$sdp_name)) 114 | recode_ceds_value <- function(var, options){ 115 | var <- options$labels[match(var, options$values)] 116 | return(var) 117 | } 118 | for(i in names(data)){ 119 | if(from == "SDP"){ 120 | codes <- get_code_values(xwalk$SDP_option_match[xwalk$sdp_name == i]) 121 | } else{ 122 | CEDS <- xwalk$schema[xwalk$CEDS_name == i][[1]] 123 | } 124 | data[, i] <- recode_ceds_value(data[, i], codes) 125 | } 126 | return(data) 127 | } 128 | 129 | # # x and y are vectors of labels 130 | # reconcile_labels <- function(x, y){ 131 | # list(x, y[pmatch(tolower(x), tolower(y))]) 132 | # } 133 | # 134 | # try_label <- function(x, y, value){ 135 | # lookup <- reconcile_labels(x, y) 136 | # if(is.na(y[value])){ 137 | # out <- lookup[[2]][which(lookup[[1]] == value)] 138 | # if(length(out) != 0){ 139 | # return(out) 140 | # } else{ 141 | # return(NA) 142 | # } 143 | # } else{ 144 | # return(y[value]) 145 | # } 146 | # } 147 | -------------------------------------------------------------------------------- /tests/testthat/test-baseline.R: -------------------------------------------------------------------------------- 1 | # Test baselines 2 | 3 | context("Test baseline extraction") 4 | 5 | test_that("ELL baseline works", { 6 | out <- get_baseline("ell") 7 | expect_is(out, "list") 8 | expect_equal(length(out), 3) 9 | expect_identical(names(out), c("data", "keys", "fun")) 10 | expect_equal(out$fun(1), 1) 11 | expect_is(out$data, "data.frame") 12 | expect_true(all(out$keys %in% names(out$data))) 13 | }) 14 | 15 | test_that("Error messages are informative", { 16 | expect_error(get_baseline("iep"), "Baseline not currently defined. Maybe you can write your own?") 17 | }) 18 | 19 | 20 | context("Test assigning baseline values") 21 | 22 | ex_data <- data.frame(id = 100:116, age = c(3:19), 23 | race = c(rep(c("amerind", "asian", "black", "hispanic", 24 | "multiracial", "white", "hawaiian_pi"), 2), 25 | "other", "bad", "error")) 26 | 27 | ex_data$ell <- assign_baseline(baseline = "ell", data = ex_data) 28 | 29 | test_that("Baseline returns the proper object type", { 30 | zzz <- assign_baseline(baseline = "ell", data = ex_data) 31 | expect_is(zzz, "integer") 32 | }) 33 | 34 | 35 | test_that("Baseline assignment works for ELL example", { 36 | expect_true(table(is.na(ex_data$ell))[[2]] == 3) 37 | expect_true(table(is.na(ex_data$ell))[[1]] == 14) 38 | expect_true(all(ex_data$ell %in% c(0L, 1L, NA))) 39 | }) 40 | 41 | # Check for error reporting 42 | 43 | 44 | ex_data <- data.frame(id = 100:116, age = c(3:19), 45 | race = c(rep(c("amerind", "asian", "black", "hispanic", 46 | "multiracial", "white", "hawaiian_pi"), 2), 47 | "other", "bad", "error")) 48 | 49 | test_that("Baseline returns the proper object type for program", { 50 | zzz <- assign_baseline(baseline = "program", data = ex_data) 51 | expect_is(zzz, "data.frame") 52 | expect_identical(names(zzz), 53 | c("id", "age", "race", "ell", "iep", "frpl")) 54 | }) 55 | 56 | test_that("Baseline assignment works for program example", { 57 | zzz <- assign_baseline(baseline = "program", data = ex_data) 58 | expect_true(all(zzz$ell %in% c(0L, 1L))) 59 | expect_true(all(zzz$iep %in% c(0L, 1L))) 60 | expect_true(all(zzz$frpl %in% c(0L, 1L))) 61 | }) 62 | 63 | test_that("Baseline for program is well defined", { 64 | bl <- get_baseline("program") 65 | expect_is(bl, "list") 66 | expect_identical(names(bl), c("data", "keys", "fun")) 67 | expect_is(bl$data, "data.frame") 68 | expect_null(bl$keys) 69 | expect_is(bl$fun, "function") 70 | out <- bl$fun() 71 | expect_is(out, "data.frame") 72 | expect_identical(names(out), c("ell", "iep", "frpl")) 73 | }) 74 | 75 | # grade baseline 76 | test_that("Baseline returns the proper object type for grade", { 77 | zzz <- assign_baseline("grade", ex_data) 78 | expect_is(zzz, "data.frame") 79 | expect_identical(names(zzz), 80 | c("id", "age", "race", "grade")) 81 | }) 82 | 83 | test_that("Baseline assignment works for program example", { 84 | zzz <- assign_baseline("grade", ex_data) 85 | expect_is(zzz$grade, "character") 86 | expect_true(all(unique(zzz$grade) %in% 87 | c("1", "10", "11", "12", "2", "3", "4", "5", "6", 88 | "7", "8", "9", "KG", "PK"))) 89 | }) 90 | 91 | test_that("Baseline for grade is well defined", { 92 | bl <- get_baseline("grade") 93 | expect_is(bl, "list") 94 | expect_identical(names(bl), c("data", "keys", "fun")) 95 | expect_is(bl$data, "data.frame") 96 | expect_identical(bl$keys, "age") 97 | expect_is(bl$fun, "function") 98 | out <- bl$fun(2) 99 | expect_is(out, "character") 100 | }) 101 | 102 | # cond_prob 103 | ses_list <- list( 104 | "white" = list(f = runif, 105 | pars = list(min = 10, max = 20)), 106 | "hispanic" = list(f = runif, 107 | pars = list(min = 25, max = 30)), 108 | "black" = list(f = runif, 109 | pars = list(min = 35, max = 40)), 110 | "asian" = list(f = runif, 111 | pars = list(min = 45, max = 50)), 112 | "multiracial" = list(f = runif, 113 | pars = list(min = 100, max = 105)), 114 | "amerind" = list(f = runif, 115 | pars = list(min = 0, max = 5)), 116 | "other" = list(f = runif, 117 | pars = list(min = -10, max = -5)), 118 | "hawaiian_pi" = list(f = runif, 119 | pars = list(min = -20, max = -15)) 120 | ) 121 | 122 | 123 | context("Test conditional probability") 124 | 125 | test_that("warnings but functioning", { 126 | test_1 <- ex_data[1:5, ] 127 | expect_warning(cond_prob(test_1, factor = "race", 128 | newvar = "ses", prob_list = ses_list)) 129 | zzz <- cond_prob(test_1, factor = "race", 130 | newvar = "ses", prob_list = ses_list) 131 | expect_is(zzz, "data.frame") 132 | expect_identical(names(zzz), c("id", "age", "race", "ses")) 133 | expect_identical(zzz[, 1:3], ex_data[1:5, ]) 134 | expect_gt(zzz$ses[zzz$race == "amerind"], 0) 135 | expect_lt(zzz$ses[zzz$race == "amerind"], 5) 136 | expect_gt(zzz$ses[zzz$race == "asian"], 45) 137 | expect_lt(zzz$ses[zzz$race == "asian"], 50) 138 | expect_gt(zzz$ses[zzz$race == "black"], 35) 139 | expect_lt(zzz$ses[zzz$race == "black"], 40) 140 | expect_gt(zzz$ses[zzz$race == "hispanic"], 25) 141 | expect_lt(zzz$ses[zzz$race == "hispanic"], 30) 142 | expect_gt(zzz$ses[zzz$race == "multiracial"], 100) 143 | expect_lt(zzz$ses[zzz$race == "multiracial"], 105) 144 | }) 145 | 146 | 147 | ## Test ses baseline 148 | ## Test baseline assignment and conditional probability 149 | ## Test school assignment 150 | ## Test school transition assignment 151 | 152 | 153 | -------------------------------------------------------------------------------- /vignettes/sim_control.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Controlling the Simulation in OpenSDPsynthR" 3 | author: "Jared E. Knowles" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Controlling the Data Simulation} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ## Introduction 13 | 14 | The `OpenSDPsynthR` package provides a set of functions to allow users to 15 | generate synthetic, but authentic, student-level data. Synthetic data is 16 | intended to make it easy to collaborate with analysts across the country 17 | tackling similar problems but using a shared vocabulary. The synthetic data 18 | will allow users to collaborate directly on code and analyses and verify that 19 | their analysis is working on synthetic data before translating it to live 20 | local data. 21 | 22 | This vignette explains how the package is structured so that it can be modified 23 | to meet the needs of users. 24 | 25 | ```{r} 26 | library(OpenSDPsynthR) 27 | default_sim <- sim_control() 28 | ``` 29 | 30 | ```{r} 31 | names(default_sim) 32 | ``` 33 | 34 | There are over 25 user parameters that can be modified to control the simulation. 35 | The goal of these parameters is to allow the simulated student population to 36 | reflect a variety of possible educational environments ranging from small 37 | rural communities to large urban school districts. 38 | 39 | These parameters can have complex structures to allow for conditional and random 40 | generation of data. Parameters fall into four categories: 41 | 42 | - **vectors:** a single list of parameters like school names, category names, or 43 | school IDs 44 | - **conditional probability list:** an R list that contains a variable to group by, 45 | a function to generate data with, and a list of parameters for that function for 46 | each group in the grouping variable 47 | - **outcome simulation parameters:** an R list of arguments to pass to the `simglm` 48 | function 49 | - **outcome adjustments:** an R list of lists, with functions that modify a variable 50 | in an existing data set 51 | 52 | ### Vectors 53 | 54 | The following vectors can be modified by the users: 55 | 56 | - `nschls`: integer, number of high schools to assign students to 57 | - `best_schl`: character, length 1, school ID for the highest performing school, 58 | e.g. ("01") 59 | - `race_groups`: character, length ?, names of racial subgroups to create in the 60 | simulation, defaults to US Census Groups 61 | - `race_prob`: numeric, length = length(race_groups), proportion of population 62 | in each racial group 63 | - `minyear`: integer, length 1, the first year of student data available 64 | - `maxyear`: integer, length 1, the last year of student data available 65 | - `n_cohorts`: integer, length 1, the number of graduation cohorts to create 66 | - `school_names`: character, length = `nschls`, names of schools 67 | - `assess_grades`: character, grade levels to simulate assessment scores for 68 | - `postsec_names`: character, length = `n_postsec`, names of postsecondary schools 69 | - `postsec_method`: character, length = 1, name of method to draw postsecondary 70 | schools from 71 | 72 | ### Conditional Probability List 73 | 74 | A conditional probability list is a list of lists in R. The `GROUPVARS` element 75 | specifies the grouping variable to conditionally assign probabilities. For 76 | example, if students are assigned gifted and talented status differently based 77 | on their sex, then this would specify `Sex`. The other elements of the list 78 | will be a separate list for each valid value of `Sex` -- in this case `Male` 79 | and `Female`. 80 | 81 | `Male` and `Female` are both lists that have two elements: `f` and `pars`. `f` 82 | defines a function that is used to generate the variable, and `pars` contains 83 | all of the parameters for that function. 84 | 85 | ```{r} 86 | str(default_sim$gifted_list) 87 | ``` 88 | 89 | - `gifted_list`: a list defining how students are assigned to gifted and talented 90 | programs 91 | - `iep_list`: a list defining how students are assigned to special education 92 | programs 93 | - `ses_list`: a list deifning how students are assigned to free and reduced 94 | price lunch status 95 | - `ell_list`: a list defining how students are assigned to English Language 96 | Learner status 97 | - `ps_transfer_list`: a list defining the likelihood a student transfers 98 | postsecondary institutions 99 | 100 | 101 | ### Outcome Simulation Controls 102 | 103 | Outcome simulation controls are lists with parameters to pass to the `simreg` 104 | function in the `simglm` package, which simulates hierarchical data and 105 | outcomes. 106 | 107 | Each of these simulations requires the user to specify: 108 | 109 | - `fixed`: a RHS formula of the format `~ 1 + var1 + var2` defining the level 110 | 1 variables for the simulation 111 | - `random_var`: a numeric, length 1, specifying the variance in the second level 112 | - `cov_param`: a list, length of variables in `fixed` + 1 for the intercept, 113 | defines the function and parameters to generate the X values 114 | - `cor_vars`: a matrix of the variance between the X variables in `fixed` 115 | - `fixed_param`: a vector of numerics, the lenth of `fixed` + 1, represent the 116 | beta coefficients 117 | - `ngrps`: numeric, length of 1, number of second-level grouping terms 118 | - `unbalanceRange`: numeric, length of 2, representing the minimum and maximum 119 | number of observations in each second-level cluster 120 | - `type`: character, either "linear" or NULL 121 | 122 | There are several of these parameters: 123 | 124 | - `gpa_sim_parameters`: simulation parameters for the GPA simulation 125 | - `grad_sim_parameters`: simulation parameters for high school graduation 126 | - `ps_sim_parameters`: simulation parameters for postsecondary enrollment 127 | - `assess_sim_par`: simulation parameters for student assessment data 128 | 129 | ### Outcome Simulation Adjustments 130 | 131 | If we only rely on the simulation controls above, the data will be too predictable 132 | to be realistic, and structural inequalities along economic, racial, and gender 133 | lines will be underrepresented. To address this, it is possible to do post-simulation 134 | adjustments to introduce more variance to the outcomes. 135 | 136 | - `race_list`: 137 | - `perturb_race`: function, 138 | - `frl_list`: 139 | - `perturb_frl`: function, 140 | 141 | - `assessment_adjustment`: adjustments to the assessment score 142 | - `grad_adjustment`: adjustments to the graduation probability 143 | - `ps_adjustment`: adjustments to the postsecondary probability 144 | - `gpa_adjustment`: adjustments to the grade point average 145 | 146 | 147 | ### Baselines 148 | 149 | Currently there are two special parameters that are set based on baseline data 150 | built into the package. These are the initial grade distribution of students, 151 | and the initial program participation of students in `ell`, `iep`, and `frpl` 152 | programs. 153 | 154 | These set some of the simulation requirements, but others are set using the 155 | `baseline` function family. 156 | 157 | ```{r, eval=FALSE} 158 | get_baseline("program") 159 | get_baseline("grade") 160 | ``` 161 | 162 | Currently, baseline values cannot be modified by the user, but this will come in 163 | a future release. 164 | 165 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "tools/figs/README-", 12 | message = FALSE, 13 | warning = FALSE 14 | ) 15 | ``` 16 | 17 | # OpenSDPsynthR 18 | 19 | ![](tools/figs/open_sdp_logo_red.png) 20 | 21 | 22 | A project to generate realistic synthetic unit-level longitudinal education data 23 | to empower collaboration in education analytics. 24 | 25 | ## Design Goals 26 | 27 | 1. Generate synthetic education data that is realistic for use by analysts 28 | across the education sector. Realistic means messy, and reflective of the 29 | general pattern of relationships found in the U.S. education sector. 30 | 2. Synthetic data should be able to be generated on-demand and responsive to 31 | inputs from the user. These inputs should allow the user to configure the 32 | process to produce data that resembles the patterns of data in their agency. 33 | 3. The package should be modular and extendable allowing new data topics to be 34 | generated as needed so synthetic data coverage can grow. 35 | 36 | ## Structure 37 | 38 | The package is organized into the following functions: 39 | 40 | - `simpop()` is the overall function that runs the simulation, this function calls 41 | many subfunctions to simulate different elements of the student data 42 | - `cleaners` are functions which take the output from the `simpop` function and 43 | reshape it into data formats for different analyses. Currently only two cleaners 44 | are supported -- `CEDS` and `sdp_cleaner()` which prepare the data into a CEDS 45 | like format and into the Strategic Data Project college-going analysis file 46 | specification respectively. 47 | - `sim_control()` -- a function that controls all of the parameters of the `simpop` 48 | simulation. The details of this function are covered in the vignettes. 49 | 50 | # Get Started 51 | 52 | To use `OpenSDPsynthR`, follow the instructions below: 53 | 54 | ## Install Package 55 | 56 | The development version of the package is able to be installed using the 57 | `install_github()`. To use this command you will need to install the `devtools` 58 | package. 59 | 60 | ```{r eval=FALSE} 61 | devtools::install_github("opensdp/OpenSDPsynthR") 62 | ``` 63 | 64 | ## Make some data 65 | 66 | Load the package 67 | 68 | ```{r, message=TRUE} 69 | library(OpenSDPsynthR) 70 | ``` 71 | 72 | The main function of the package is `simpop` which generates a list of data 73 | elements corresponding to simulated educational careers, K-20, for a user 74 | specified number of students. In R, a list is a data structure that can contain 75 | multiple data elements of different structures. This can be used to emulate 76 | the multiple tables of a Student Information System (SIS). 77 | 78 | 79 | 80 | ```{r, message=TRUE} 81 | out <- simpop(nstu = 500, seed = 213, control = sim_control(nschls = 3)) 82 | ``` 83 | 84 | Currently ten tables are produced: 85 | 86 | ```{r} 87 | names(out) 88 | ``` 89 | 90 | 91 | Data elements produced include: 92 | 93 | - **Student demographics:** age, race, and sex 94 | - **Student participation:** grade advancement, ELL status, IEP status, 95 | FRPL status, gifted and talented status, attendance 96 | - **Student enrollment status:** exit type, enrollment type, transfer, graduation, 97 | dropout, etc. 98 | - **School attributes:** name, school category, school size, Title I and Title III status, etc. 99 | - **Student assessment:** math assessment, reading assessment, grade level assessed 100 | - **High school outcomes:** graduation, cumulative GPA, graduation type, cohort, 101 | class rank, postsecondary enrollment 102 | - **High school progression:** annual class rank, cumulative credits earned, credits 103 | earned, credits by English Language Arts and by Mathematics, credits attempted, 104 | ontrack status 105 | - **Postsecondary enrollment:** year of enrollment, transfer indicator, name and ID of 106 | postsecondary institution, type of institution 107 | - **Postsecondary institution:** name, city, state, online only, average net price, 108 | Pell grant rate, retention four year full time, share of part time enrollment, 109 | enrollment by race, SAT and ACT score distribution for admitted students 110 | 111 | There are two tables of metadata about the assessment data above to be used in 112 | cases where multiple types of student assessment are analyzed together. 113 | 114 | - **Assessment information:** grade, subject, ID, type, and name of assessment 115 | - **Proficiency information:** mean score, error of score, number of students tested 116 | 117 | 118 | ```{r, echo=FALSE, message=FALSE, warning=FALSE, include=FALSE} 119 | table_names <- data.frame(table = NULL, column = NULL) 120 | for(i in seq_along(out)){ 121 | table_name <- names(out)[[i]] 122 | columns <- names(out[[i]]) 123 | tmp <- data.frame(table = table_name, column = columns, 124 | stringsAsFactors = FALSE) 125 | table_names <- bind_rows(table_names, tmp) 126 | } 127 | 128 | ``` 129 | 130 | 131 | ```{r, inclue=FALSE} 132 | head(out$demog_master %>% arrange(sid) %>% select(1:4)) 133 | head(out$stu_year, 10) 134 | ``` 135 | 136 | ## Cleaners 137 | 138 | You can reformat the synthetic data for use in specific types of projects. 139 | Currently two functions exist to format the simulated data into an analysis 140 | file matching the SDP College-going data specification and a CEDS-like 141 | data specification. More of these functions are planned in the future. 142 | 143 | ```{r eval=FALSE} 144 | cgdata <- sdp_cleaner(out) 145 | ceds <- ceds_cleaner(out) 146 | ``` 147 | 148 | 149 | ## Control Parameters 150 | 151 | By default, you only need to specify the number of students to simulate to the 152 | `simpop` command. The package has default simulation parameters that will result 153 | in creating a small school district with two schools. 154 | 155 | 156 | ```{r demonstrateOptionList} 157 | names(sim_control()) 158 | ``` 159 | 160 | These parameters can have complex structures to allow for conditional and random 161 | generation of data. Parameters fall into four categories: 162 | 163 | - **vectors:** a single list of parameters like school names, category names, or 164 | school IDs 165 | - **conditional probability list:** an R list that contains a variable to group by, 166 | a function to generate data with, and a list of parameters for that function for 167 | each group in the grouping variable 168 | - **outcome simulation parameters:** an R list of arguments to pass to the `simglm` 169 | function 170 | - **outcome adjustments:** an R list of lists, with functions that modify a variable 171 | in an existing data set 172 | 173 | For more details, see the simulation control vignette. 174 | 175 | ```{r, eval=FALSE} 176 | vignette("Controlling the Data Simulation", package = "OpenSDPsynthR") 177 | ``` 178 | 179 | 180 | ## Package Dependencies 181 | 182 | - `dplyr` 183 | - `lubridate` 184 | - [wakefield](https://www.github.com/trinker/wakefield) 185 | - [simglm](https://www.github.com/lebebr01/simglm) 186 | 187 | ## OpenSDP 188 | 189 | `OpenSDPsynthR` is part of the OpenSDP project. 190 | 191 | [OpenSDP](https://opensdp.github.io) is an online, public repository of analytic 192 | code, tools, and training intended to foster collaboration among education 193 | analysts and researchers in order to accelerate the improvement of our school 194 | systems. The community is hosted by the 195 | [Strategic Data Project](https://sdp.cepr.harvard.edu), an initiative of the 196 | [Center for Education Policy Research at Harvard University](https://cepr.harvard.edu). 197 | We welcome contributions and feedback. 198 | 199 | These materials were originally authored by the Strategic Data Project. 200 | -------------------------------------------------------------------------------- /R/baselines.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | #' Function to load in SDP default baseline data 4 | #' 5 | #' @param bl a character naming the type of baseline available 6 | #' 7 | #' @return the restored baseline object 8 | #' @export 9 | get_baseline <- function(bl){ 10 | if(bl == "ell"){ 11 | #data(sysdata, envir=environment()) 12 | data <- ell 13 | keys <- c("race", "age") 14 | fun <- function(x) rbinom(1, 1, x) 15 | } else if(bl == "ses"){ 16 | data <- ses 17 | keys <- c("race") 18 | fun <- function(x) rbinom(1, 1, x) 19 | } else if(bl == "program"){ 20 | data <- prog_baseline 21 | keys <- NULL 22 | fun <- function() {prog_baseline[sample(rownames(prog_baseline), 1, 23 | prob = prog_baseline$prob), 1:3]} 24 | } else if(bl == "grade"){ 25 | data <- age_grade 26 | keys <- "age" 27 | fun <- function(x){ 28 | if(x %in% age_grade$age){ 29 | probs <- age_grade[which(age_grade$age == x),][-1] 30 | out <-sample(names(age_grade)[-1], 1, prob = probs) 31 | out <- convert_grade(out) 32 | return(out) 33 | } else{ 34 | return(NA) 35 | } 36 | } 37 | } else{ 38 | stop("Baseline not currently defined. Maybe you can write your own?") 39 | } 40 | return(list(data = data, keys = keys, fun = fun)) 41 | } 42 | 43 | #' Append baseline data to initial data 44 | #' 45 | #' @param baseline character value of the default baseline to assign 46 | #' @param data a data.frame to append the baseline to 47 | #' 48 | #' @return the data.frame passed by the user with an additional variable appended 49 | #' @importFrom dplyr left_join 50 | #' @export 51 | assign_baseline <- function(baseline = NULL, data){ 52 | bl_data <- get_baseline(baseline) 53 | if(baseline == "grade"){ 54 | data <- as.data.frame(data) 55 | out <- sapply(data[, bl_data$keys], bl_data$fun) 56 | out <- as.character(out) 57 | data <- cbind(data, out) 58 | names(data)[ncol(data)] <- baseline 59 | data[,ncol(data)] <- as.character(data[,ncol(data)]) 60 | return(data) 61 | } 62 | if(is.null(bl_data$keys)){ 63 | out <- replicate(nrow(data), bl_data$fun(), simplify=FALSE) %>% 64 | Reduce("rbind", .) 65 | out <- cbind(data, out) 66 | } else { 67 | if(any(!bl_data$keys %in% names(data))){ 68 | msg <- paste("Data supplied does not have right keys to merge. Please use columns:", 69 | paste(bl_data$keys, collapse = ", "), sep = " \n ") 70 | stop(msg) 71 | } 72 | data <- as.data.frame(left_join(data, bl_data$data, by = bl_data$keys)) 73 | var <- names(bl_data$data)[!names(bl_data$data) %in% bl_data$keys] 74 | if(length(var) > 1){ 75 | stop("Variables are labeled wrong in data.") 76 | } 77 | # Avoid binomial NA warning in ELL baseline 78 | suppressWarnings({ 79 | out <- sapply(data[, var], bl_data$fun) 80 | }) 81 | } 82 | return(out) 83 | } 84 | 85 | # Function to generate conditional probabilities and append them to data 86 | # Input is data frame, output is a data frame 87 | # prob_list needs to have the levels of the factor variable and be the same length 88 | 89 | # TODO: UNIT TESTS 90 | # Document prob_list structure and verify 91 | 92 | #' Generate conditional probabilities by group 93 | #' 94 | #' @param data dataframe to add variable to 95 | #' @param factor grouping variable that probability of \code{newvar} is conditional on 96 | #' @param newvar name, character, of new variable defined by \code{prob_list} 97 | #' @param prob_list a list, defining the way \code{newvar} should be generated 98 | #' 99 | #' @return data.frame with \code{newvar} appended to dataframe 100 | #' @export 101 | cond_prob <- function(data, factor, newvar, prob_list){ 102 | # Work around tbls 103 | if(any(class(data) != "data.frame")){ 104 | data <- as.data.frame(data) 105 | } 106 | if(!factor %in% names(data)){ 107 | stop("Factor not found in data. Did you forget to create it?") 108 | } 109 | data[, newvar] <- NA 110 | # Error checking, if not all factors are defined in prob_list, issue error 111 | if(!all(unique(data[, factor]) %in% names(prob_list))){ 112 | missingLevels <- 113 | as.character(unique(data[, factor])[!unique(data[, factor]) %in% 114 | names(prob_list)]) 115 | msg <- paste0("Probability list does not specify all possible factor levels.", 116 | "\n", "Missing levels: \n", 117 | paste0(missingLevels, collapse = ",\n")) 118 | stop(msg) 119 | } 120 | # If not all prob_list elements are listed in the data, issue a warning 121 | if(!all(names(prob_list) %in% unique(data[, factor]))){ 122 | missingLevels <- 123 | as.character(names(prob_list)[!names(prob_list) %in% 124 | unique(data[, factor])]) 125 | msg <- paste0("Probability list elements not found in data.", 126 | "\n", "Missing levels: \n", 127 | paste0(missingLevels, collapse = ",\n")) 128 | warning(msg) 129 | } 130 | 131 | for(i in unique(data[, factor])){ 132 | N <- nrow(data[data[, factor] == i, ]) 133 | data[data[, factor] == i, newvar] <- do.call(prob_list[[i]]$f, 134 | c(list(n = N), 135 | prob_list[[i]]$pars)) 136 | 137 | } 138 | return(data) 139 | } 140 | 141 | #' Assign student a grade 142 | #' 143 | #' @param age age of the student in years 144 | #' @param ability a modifier that signifies student ability? 145 | #' 146 | #' @return a vector of grade levels 147 | #' @importFrom wakefield level 148 | #' @export 149 | #' 150 | #' @examples 151 | #' age <- c(9, 10, 11, 12) 152 | #' assign_grade(age = age) 153 | assign_grade <- function(age, ability){ 154 | baseGrade <- floor(age - 7) 155 | maxGrade <- floor(age - 4) 156 | out <- wakefield::level(1, x = baseGrade:maxGrade, prob = c(0.01, 0.03, 0.9, 0.06)) 157 | return(out) 158 | } 159 | 160 | 161 | 162 | #' Generate a random transition matrix for school enrollments 163 | #' 164 | #' @param nschls integer, the number of schools available, default = 15 165 | #' @param diag_limit a numeric between 0 and 1 that sets the minimum probability that a student 166 | #' will stay schools 167 | #' 168 | #' @return a transition matrix nschls X nschls of transition probabilities 169 | #' @export 170 | #' 171 | #' @examples 172 | #' out <- school_transitions(12) 173 | #' out 174 | school_transitions <- function(nschls = 15L, diag_limit = 0.9){ 175 | stopifnot(class(nschls) %in% c("numeric", "integer")) 176 | stopifnot(diag_limit > 0 & diag_limit < 1) 177 | empty <- matrix(rep(0, nschls^2), nrow = nschls, ncol = nschls) 178 | diag(empty) <- 1 179 | for(i in 1:nschls){ 180 | diag(empty)[i] <- runif(1, diag_limit, 1) 181 | } 182 | for(i in 1:nschls){ 183 | empty[i, ][empty[i,] ==0] <- rand_vect_cont(nschls-1, 1-diag(empty)[i]) 184 | } 185 | school_tm <- empty 186 | if(nschls < 100){ 187 | sch_dims <- sprintf("%02d", 1:nschls) 188 | } else{ 189 | sch_dims <- sprintf("%03d", 1:nschls) 190 | } 191 | dimnames(school_tm) <- list(sch_dims, sch_dims) 192 | return(school_tm) 193 | } 194 | 195 | #' Generate a grade advancement transition matrix 196 | #' 197 | #' @param ngrades integer, the number of grade levels to simulate 198 | #' @param diag_limit the minimum probability of a student advancing to the next grade 199 | #' 200 | #' @return a matrix, ngrades x ngrades of grade transition probabilities 201 | #' @export 202 | grade_transitions <- function(ngrades=15L, diag_limit = 0.975){ 203 | stopifnot(class(ngrades) %in% c("numeric", "integer")) 204 | stopifnot(diag_limit > 0 & diag_limit < 1) 205 | empty <- matrix(rep(0, ngrades^2), nrow = ngrades, ncol = ngrades) 206 | pool <- sort(runif(ngrades, diag_limit, 1)) 207 | if(ngrades > 9){ 208 | diag(empty) <- pool[c(1:3, 8:(ngrades-1), 4, 5, 6, 7, ngrades)] 209 | } else{ 210 | diag(empty) <- pool 211 | } 212 | empty <- diag_offset(empty, 1L) 213 | for(g in 1:ngrades){ 214 | i <- g + 1 215 | elem <- (i-2):(i+2) 216 | if(any(elem < 1) | any(elem > ngrades)){ 217 | elem <- elem[elem >= 1] 218 | elem <- elem[elem <= ngrades] 219 | } 220 | elem <- elem[-which(elem == i)] 221 | if(i > ngrades){ 222 | fill <- round(rand_vect_cont(N = length(elem), runif(1, diag_limit, 1), 223 | sd = 4), 4) 224 | } else{ 225 | fill <- round(rand_vect_cont(N = length(elem), 1-empty[g, i], sd = 4), 4) 226 | } 227 | fill <- sort(fill, decreasing = TRUE) 228 | fill <- na.omit(fill[c(2, 1, 4, 3)]) 229 | attributes(fill) <- NULL # strip NA info 230 | empty[g, ][elem] <- fill 231 | } 232 | empty[ngrades, ngrades] <- 1 233 | grades_tm <- empty 234 | dimms <- convert_grade(paste0("g", -1:(ngrades-2))) 235 | dimnames(grades_tm) <- list(dimms, dimms) 236 | grades_tm <- tm_convert(grades_tm) 237 | return(grades_tm) 238 | } 239 | 240 | 241 | 242 | #' Offset the diagonal values of a matrix 243 | #' 244 | #' @param matrix a square matrix 245 | #' @param offset an integer value to offset the diagonal by 246 | #' 247 | #' @return a square matrix 248 | #' @export 249 | diag_offset <- function(matrix, offset = 1L){ 250 | for(i in 1:nrow(matrix)){ 251 | matrix[i, ] <- c(rep(0, offset), matrix[i, ][1:(length(matrix[i, ])-offset)]) 252 | } 253 | return(matrix) 254 | } 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | OpenSDPsynthR 4 | ============= 5 | 6 | ![](tools/figs/open_sdp_logo_red.png) 7 | 8 | A project to generate realistic synthetic unit-level longitudinal education data to empower collaboration in education analytics. 9 | 10 | Design Goals 11 | ------------ 12 | 13 | 1. Generate synthetic education data that is realistic for use by analysts across the education sector. Realistic means messy, and reflective of the general pattern of relationships found in the U.S. education sector. 14 | 2. Synthetic data should be able to be generated on-demand and responsive to inputs from the user. These inputs should allow the user to configure the process to produce data that resembles the patterns of data in their agency. 15 | 3. The package should be modular and extendable allowing new data topics to be generated as needed so synthetic data coverage can grow. 16 | 17 | Structure 18 | --------- 19 | 20 | The package is organized into the following functions: 21 | 22 | - `simpop()` is the overall function that runs the simulation, this function calls many subfunctions to simulate different elements of the student data 23 | - `cleaners` are functions which take the output from the `simpop` function and reshape it into data formats for different analyses. Currently only two cleaners are supported -- `CEDS` and `sdp_cleaner()` which prepare the data into a CEDS like format and into the Strategic Data Project college-going analysis file specification respectively. 24 | - `sim_control()` -- a function that controls all of the parameters of the `simpop` simulation. The details of this function are covered in the vignettes. 25 | 26 | Get Started 27 | =========== 28 | 29 | To use `OpenSDPsynthR`, follow the instructions below: 30 | 31 | Install Package 32 | --------------- 33 | 34 | The development version of the package is able to be installed using the `install_github()`. To use this command you will need to install the `devtools` package. 35 | 36 | ``` r 37 | devtools::install_github("opensdp/OpenSDPsynthR") 38 | ``` 39 | 40 | Make some data 41 | -------------- 42 | 43 | Load the package 44 | 45 | ``` r 46 | library(OpenSDPsynthR) 47 | #> Loading required package: dplyr 48 | #> 49 | #> Attaching package: 'dplyr' 50 | #> The following objects are masked from 'package:stats': 51 | #> 52 | #> filter, lag 53 | #> The following objects are masked from 'package:base': 54 | #> 55 | #> intersect, setdiff, setequal, union 56 | #> Loading required package: lme4 57 | #> Loading required package: Matrix 58 | ``` 59 | 60 | The main function of the package is `simpop` which generates a list of data elements corresponding to simulated educational careers, K-20, for a user specified number of students. In R, a list is a data structure that can contain multiple data elements of different structures. This can be used to emulate the multiple tables of a Student Information System (SIS). 61 | 62 | ``` r 63 | out <- simpop(nstu = 500, seed = 213, control = sim_control(nschls = 3)) 64 | #> Preparing student identities for 500 students... 65 | #> Creating annual enrollment for 500 students... 66 | #> Assigning 500 students to initial FRPL, IEP, and ELL status 67 | #> Assigning initial grade levels... 68 | #> Organizing status variables for you... 69 | #> Assigning 500 students longitudinal status trajectories... 70 | #> Sorting your records 71 | #> Cleaning up... 72 | #> Creating 3 schools for you... 73 | #> Assigning 6946 student-school enrollment spells... 74 | #> Simulating assessment table... be patient... 75 | #> Simulating high school outcomes... be patient... 76 | #> Simulating annual high school outcomes... be patient... 77 | #> Simulating postsecondary outcomes... be patient... 78 | #> Success! Returning you student and student-year data in a list. 79 | ``` 80 | 81 | Currently ten tables are produced: 82 | 83 | ``` r 84 | names(out) 85 | #> [1] "demog_master" "stu_year" "schools" "stu_assess" 86 | #> [5] "hs_outcomes" "hs_annual" "nsc" "ps_enroll" 87 | #> [9] "assessments" "proficiency" 88 | ``` 89 | 90 | Data elements produced include: 91 | 92 | - **Student demographics:** age, race, and sex 93 | - **Student participation:** grade advancement, ELL status, IEP status, FRPL status, gifted and talented status, attendance 94 | - **Student enrollment status:** exit type, enrollment type, transfer, graduation, dropout, etc. 95 | - **School attributes:** name, school category, school size, Title I and Title III status, etc. 96 | - **Student assessment:** math assessment, reading assessment, grade level assessed 97 | - **High school outcomes:** graduation, cumulative GPA, graduation type, cohort, class rank, postsecondary enrollment 98 | - **High school progression:** annual class rank, cumulative credits earned, credits earned, credits by English Language Arts and by Mathematics, credits attempted, ontrack status 99 | - **Postsecondary enrollment:** year of enrollment, transfer indicator, name and ID of postsecondary institution, type of institution 100 | - **Postsecondary institution:** name, city, state, online only, average net price, Pell grant rate, retention four year full time, share of part time enrollment, enrollment by race, SAT and ACT score distribution for admitted students 101 | 102 | There are two tables of metadata about the assessment data above to be used in cases where multiple types of student assessment are analyzed together. 103 | 104 | - **Assessment information:** grade, subject, ID, type, and name of assessment 105 | - **Proficiency information:** mean score, error of score, number of students tested 106 | 107 | ``` r 108 | head(out$demog_master %>% arrange(sid) %>% select(1:4)) 109 | #> sid Sex Birthdate Race 110 | #> 1 001 Male 2000-01-30 White 111 | #> 2 002 Male 1998-01-25 White 112 | #> 3 003 Female 2000-10-22 Black or African American 113 | #> 4 004 Female 2003-10-05 White 114 | #> 5 005 Female 2001-05-27 Hispanic or Latino Ethnicity 115 | #> 6 006 Female 1998-12-16 White 116 | head(out$stu_year, 10) 117 | #> Source: local data frame [10 x 17] 118 | #> Groups: sid [10] 119 | #> 120 | #> sid year age grade frpl ell iep gifted grade_advance 121 | #> 122 | #> 1 002 2002 5 KG 0 0 0 0 123 | #> 2 009 2002 6 1 1 0 0 1 124 | #> 3 014 2002 5 KG 1 0 0 0 125 | #> 4 017 2002 4 PK 0 0 0 0 126 | #> 5 023 2002 5 KG 0 0 0 0 127 | #> 6 024 2002 4 PK 0 0 0 0 128 | #> 7 028 2002 5 KG 0 0 0 1 129 | #> 8 030 2002 5 KG 0 0 0 0 130 | #> 9 031 2002 6 1 1 0 0 0 131 | #> 10 034 2002 5 KG 1 0 0 0 132 | #> # ... with 8 more variables: cohort_year , cohort_grad_year , 133 | #> # exit_type , enrollment_status , ndays_possible , 134 | #> # ndays_attend , att_rate , schid 135 | ``` 136 | 137 | Cleaners 138 | -------- 139 | 140 | You can reformat the synthetic data for use in specific types of projects. Currently two functions exist to format the simulated data into an analysis file matching the SDP College-going data specification and a CEDS-like data specification. More of these functions are planned in the future. 141 | 142 | ``` r 143 | cgdata <- sdp_cleaner(out) 144 | ceds <- ceds_cleaner(out) 145 | ``` 146 | 147 | Control Parameters 148 | ------------------ 149 | 150 | By default, you only need to specify the number of students to simulate to the `simpop` command. The package has default simulation parameters that will result in creating a small school district with two schools. 151 | 152 | ``` r 153 | names(sim_control()) 154 | #> [1] "nschls" "best_schl" 155 | #> [3] "race_groups" "race_prob" 156 | #> [5] "minyear" "maxyear" 157 | #> [7] "gifted_list" "iep_list" 158 | #> [9] "ses_list" "ell_list" 159 | #> [11] "ps_transfer_list" "grade_levels" 160 | #> [13] "n_cohorts" "school_means" 161 | #> [15] "school_cov_mat" "school_names" 162 | #> [17] "postsec_names" "gpa_sim_parameters" 163 | #> [19] "grad_sim_parameters" "ps_sim_parameters" 164 | #> [21] "assess_sim_par" "assessment_adjustment" 165 | #> [23] "grad_adjustment" "ps_adjustment" 166 | #> [25] "gpa_adjustment" "assess_grades" 167 | #> [27] "n_postsec" "postsec_method" 168 | ``` 169 | 170 | These parameters can have complex structures to allow for conditional and random generation of data. Parameters fall into four categories: 171 | 172 | - **vectors:** a single list of parameters like school names, category names, or school IDs 173 | - **conditional probability list:** an R list that contains a variable to group by, a function to generate data with, and a list of parameters for that function for each group in the grouping variable 174 | - **outcome simulation parameters:** an R list of arguments to pass to the `simglm` function 175 | - **outcome adjustments:** an R list of lists, with functions that modify a variable in an existing data set 176 | 177 | For more details, see the simulation control vignette. 178 | 179 | ``` r 180 | vignette("Controlling the Data Simulation", package = "OpenSDPsynthR") 181 | ``` 182 | 183 | Package Dependencies 184 | -------------------- 185 | 186 | - `dplyr` 187 | - `lubridate` 188 | - [wakefield](https://www.github.com/trinker/wakefield) 189 | - [simglm](https://www.github.com/lebebr01/simglm) 190 | 191 | OpenSDP 192 | ------- 193 | 194 | `OpenSDPsynthR` is part of the OpenSDP project. 195 | 196 | [OpenSDP](https://opensdp.github.io) is an online, public repository of analytic code, tools, and training intended to foster collaboration among education analysts and researchers in order to accelerate the improvement of our school systems. The community is hosted by the [Strategic Data Project](https://sdp.cepr.harvard.edu), an initiative of the [Center for Education Policy Research at Harvard University](https://cepr.harvard.edu). We welcome contributions and feedback. 197 | 198 | These materials were originally authored by the Strategic Data Project. 199 | -------------------------------------------------------------------------------- /vignettes/sim_diagnostics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Structuring the OpenSDPsynthR Package" 3 | author: "Jared E. Knowles" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Simulation Diagnostics} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ## Introduction 13 | 14 | 15 | ### Diagnostics 16 | 17 | How do we know it worked? We can look at the patterns of ELL enrollment that 18 | are observed and see what patterns are the most common. To do this, let's compute 19 | the frequency of transition states observed per student. 20 | 21 | ```{r genData} 22 | library(OpenSDPsynthR) 23 | simouts <- simpop(nstu = 1000, control = sim_control(nschls = 8L)) 24 | stu_year <- simouts$stu_year 25 | 26 | ``` 27 | 28 | 29 | 30 | ```{r ellDiagnostic} 31 | library(ggplot2) 32 | library(tidyr) 33 | plotdf <- stu_year %>% arrange(sid, year) %>% group_by(sid) %>% 34 | do(tidy_sequence(.$ell, states = c(1, 0))) 35 | 36 | plotdf$total <- rowSums(plotdf[, -1]) 37 | plotdf <- plotdf %>% gather(-sid, key = "Transition", value = "Count") 38 | 39 | # plotdf %>% group_by(Transition) %>% filter(Transition != "total") %>% 40 | # summarize(sum(Count)) 41 | 42 | plotdf <- plotdf %>% filter(Transition != "total") %>% 43 | group_by(sid) %>% 44 | mutate(total = sum(Count)) %>% 45 | mutate(per = Count / total) %>% filter(Transition != "total") %>% 46 | separate(Transition, into = c("From", "To"), sep = "-") 47 | 48 | ggplot(plotdf, aes(Count)) + geom_histogram() + 49 | scale_x_continuous(breaks = c(0:25)) + 50 | facet_grid(From~To, labeller = label_both, switch = "y") + 51 | theme_bw() + 52 | labs(title = "Frequency of Transition States by Student - ELL", 53 | y = "Count", x = "Times per Student State Observed") 54 | 55 | ``` 56 | 57 | Looking at this chart we can see that most students went from the No state to 58 | a No state -- as would be expected when there are few ELLs. 59 | 60 | Through this process we've gained students in the ELL status who were not 61 | initially ELL. Depending on our application this may not be desirable and we 62 | may want to modify the transition matrix to avoid this. Otherwise, later, 63 | this becomes an exercise in data cleaning. 64 | 65 | Two other visual diagnostics are below. 66 | 67 | ```{r visualdiagnostics} 68 | # Other plots 69 | 70 | # ggplot(plotdf, aes(per)) + geom_density() + 71 | # facet_grid(From ~ To, labeller = label_both, switch = "y") + 72 | # theme_bw() + labs(title = "By Student Densities of Transitions") 73 | 74 | # Heatmap 75 | plotdf %>% group_by(From, To) %>% 76 | summarise(Count = sum(Count)) %>% 77 | ungroup %>% 78 | mutate(total = sum(Count)) %>% 79 | mutate(per = Count/total) %>% 80 | ggplot(aes(x = From, y = To, fill = per)) + 81 | geom_tile(color= I("black")) + 82 | geom_text(aes(label = round(per, digits = 2))) + 83 | theme_minimal() + 84 | coord_cartesian() + labs(title = "Heatmap of ELL Transition States") 85 | 86 | ``` 87 | 88 | 89 | We can also do a comparative diagnostic. Given the relatively short length of 90 | our sequence per student, it will be hard to estimate fit from a short sequence. 91 | 92 | 93 | ```{r testbyStudent} 94 | # series <- stu_year$ell[stu_year$ID == "1705"] 95 | # series <- stu_year$ell[stu_year$ID == "0001"] 96 | 97 | test_fit <- function(series, expected){ 98 | if(dim(table(series)) == 1){ 99 | return(TRUE) 100 | } else { 101 | out <- fit_series(series, return = "fit", confidencelevel = 0.99, 102 | possibleStates = rownames(expected)) 103 | low <- out$lowerEndpointMatrix < expected 104 | hi <- out$upperEndpointMatrix > expected 105 | return(all(low, hi)) 106 | } 107 | } 108 | 109 | defaultFit <- sim_control()$ell_list$ALL$pars$tm 110 | 111 | test_res <- stu_year %>% group_by(sid) %>% 112 | summarize(fit_ok = test_fit(ell, expected = defaultFit)) 113 | 114 | table(test_res$fit_ok) 115 | ``` 116 | 117 | 118 | Let's look at co-occurrence of status over time. 119 | 120 | ```{r CrossTabs} 121 | # Look at by year patterns of relationships by student year 122 | table(FRL = stu_year$frpl, GIFTED = stu_year$gifted) 123 | table(FRL = stu_year$frpl, IEP = stu_year$iep) 124 | table(GIFTED = stu_year$gifted, IEP = stu_year$iep) 125 | 126 | ``` 127 | 128 | Let's check polychoric correlations: 129 | 130 | ```{r} 131 | gamma_GK(stu_year$gifted, stu_year$iep) 132 | gamma_GK(stu_year$frpl, stu_year$iep) 133 | gamma_GK(stu_year$frpl, stu_year$ell) 134 | ``` 135 | 136 | 137 | Finally, let's see who winds up "ever" in each category 138 | 139 | ```{r collapseEver} 140 | 141 | test_df <- stu_year %>% group_by(sid) %>% 142 | summarize(iep_ever = if_else(any(iep == 1), "Yes", "No"), 143 | ell_ever = if_else(any(ell == 1), "Yes", "No"), 144 | frpl_ever = if_else(any(frpl == 1), "Yes", "No"), 145 | gifted_ever = if_else(any(gifted == 1), "Yes", "No")) 146 | 147 | table(IEP_EVER = test_df$iep_ever) 148 | table(ELL_EVER = test_df$ell_ever) 149 | table(FRPL_EVER = test_df$frpl_ever) 150 | table(GIFTED_EVER = test_df$gifted_ever) 151 | 152 | ``` 153 | 154 | 155 | ## Assigning Schools and Outcomes 156 | 157 | Students move through grades, schools, and outcomes. 158 | 159 | ```{r, include=FALSE, eval=FALSE} 160 | # Outcome assignment, outcomes are assigned in order 161 | ## sat_act 162 | ## ps_enroll 163 | 164 | # TODO: Consider including diploma attainment... 165 | 166 | out <- simpop(nstu = 1250, seed = 1241, sim_control(nschls = 9)) 167 | final_data <- sdp_cleaner(out) 168 | 169 | ggplot(out$assessment, aes(x = age, y = math_ss, group = sid)) + 170 | geom_line(alpha = I(0.2)) + 171 | # geom_smooth(method = 'lm', se=FALSE, color = I("black"), alpha = I(0.2)) + 172 | facet_wrap(~schid) 173 | 174 | 175 | score_table <- assess %>% group_by(year, age) %>% 176 | summarize(read_mean = mean(rdg_ss), 177 | read_sd = sd(rdg_ss), 178 | math_mean = mean(math_ss), 179 | math_sd = sd(math_ss)) 180 | 181 | assess <- left_join(assess, score_table) 182 | assess$math_std <- (assess$math_ss - assess$math_mean) / assess$math_sd 183 | assess$read_std <- (assess$rdg_ss - assess$read_mean) / assess$read_sd 184 | cor(assess$math_std, assess$read_std, use = "pairwise") 185 | 186 | ggplot(assess, aes(x = age, y = math_std, group = sid)) + 187 | facet_wrap(~schid) + geom_line(alpha = I(0.2)) + 188 | geom_smooth(aes(group=1), se = FALSE) 189 | 190 | 191 | 192 | ggplot(assess, aes(x = math_ss, y = rdg_ss)) + geom_point(alpha = I(0.2)) 193 | ggplot(assess, aes(x = math_std, y = read_std)) + geom_point(alpha = I(0.2)) 194 | 195 | idx <- sample(unique(assess$sid), 12) 196 | ggplot(assess[assess$sid %in% idx, ], aes(x = age, y = math_std)) + 197 | facet_wrap(~sid) + geom_line() + geom_smooth(method = 'lm', se=FALSE) + 198 | geom_hline(yintercept = 0, color = I("red")) + geom_point() 199 | 200 | ggplot(assess[assess$sid %in% idx, ], aes(x = age, y = math_ss)) + 201 | facet_wrap(~sid) + geom_line() + geom_smooth(se=FALSE) 202 | 203 | 204 | 205 | 206 | g12_cohort <- out$stu_year[out$stu_year$grade == "12", ] 207 | g12_cohort <- na.omit(g12_cohort) 208 | g12_cohort <- left_join(g12_cohort, out$demog_master[, 1:4], by = "sid") 209 | g12_cohort$male <- ifelse(g12_cohort$Sex == "Male", 1, 0) 210 | hs_outcomes <- OpenSDPsynthR:::assign_hs_outcomes(g12_cohort, 211 | control = sim_control()) 212 | 213 | zzz <- out$hs_outcomes 214 | 215 | dddff <- do.call(gen_outcome_model, ps_sim_parameters) 216 | 217 | 218 | 219 | 220 | df <- sim_glm(fixed = fixed, random = random, 221 | fixed_param = fixed_param, random_param = random_param, 222 | random3 = NULL, 223 | random_param3 = NULL, 224 | cov_param = cov_param, 225 | fact_vars = fact_vars, k = NULL, 226 | n = ngrps, p = NULL, 227 | cor_vars = cor_vars, data_str = "cross", unbal = TRUE, 228 | unbalCont = unbalanceRange) 229 | mod <- glmer(update(fixed, "sim_data ~ . - math_ss + (1|clustID)"), 230 | data = df, family = "binomial") 231 | 232 | # out <- simpop(nstu = 400, seed = 32231, 233 | # control = sim_control(nschls = 3L)) 234 | 235 | # Student Student Year Outcome 236 | # sid assessment hs_diploma 237 | # race_ethnicity school_id cum_gpa_final 238 | # sex on_track sat_act 239 | # frpl_ever frpl ps_enroll 240 | # ell_ever ell dropout 241 | # gifted_ever gifted transfer 242 | # iep_ever iep disappear 243 | # grade_level still_enroll 244 | random <- ~ 1 245 | random_param <- list(random_var = random_var, rand_gen = "rnorm") 246 | 247 | library(simglm) 248 | 249 | assess_sim_par <- list( 250 | fixed = ~ 1 + time + gifted + iep + frpl + ell + male, 251 | random = ~ 1 + time, 252 | random3 = ~ 1 + time, 253 | cor_vars = c(-0.276, -0.309, -0.046, -0.033, 254 | -0.03, -0.029, -0.003, 0.06, 0.007, 0.001), 255 | fixed_param = c(0.0024, 0.75, 0.10, -0.161388, -0.075, -0.056, 0.007), 256 | fact_vars = NULL, 257 | # intercept + any slopes in length 258 | random_param = list(random_var = c(0.2, 0.1), cor_vars = c(0.4), rand_gen = 'rnorm'), 259 | random_param3 = list(random_var = c(0.3, 0.025), rand_gen = 'rnorm'), # intercept + any slopes in length 260 | cov_param = list( 261 | dist_fun = c("rbinom", "rbinom", "rbinom", "rbinom", "rbinom"), 262 | var_type = rep("lvl1", 5), 263 | opts = list( 264 | list(size = 1, prob = 0.1), 265 | list(size = 1, prob = 0.2), 266 | list(size = 1, prob = 0.45), 267 | list(size = 1, prob = 0.1), 268 | list(size = 1, prob = 0.52) 269 | ) 270 | ), 271 | unbalCont = c(2, 16), 272 | unbalCont3 = c(100, 800), 273 | unbal = TRUE, 274 | # Total number of level 2 groups = k * n 275 | k = 15, # level 3 groups 276 | n = 200, # obs per group level 2 group 277 | p = 400, # obs per group? 278 | error_var = 1, 279 | with_err_gen = 'rnorm', 280 | lvl1_err_params = list(mean = 0, sd = 1), 281 | data_str = "long" 282 | ) 283 | 284 | assess_table <- do.call(sim_reg, assess_sim_par, quote = TRUE) 285 | # needs to be the length of all correlations between predictors 286 | temp_three <- sim_reg(fixed = fixed, random = random, random3 = random3, 287 | fixed_param = fixed_param, random_param = random_param, 288 | random_param3 = random_param3, cov_param = cov_param, 289 | fact_vars = fact_vars, k = k,n = n, p = p, 290 | lvl1_err_params = lvl1_err_params, 291 | error_var= error_var, with_err_gen = with_err_gen, 292 | cor_vars = cor_vars, data_str = "long", unbal = TRUE, 293 | unbalCont = unbalCont, unbalCont3 = unbalCont3) 294 | 295 | 296 | library(ggplot2) 297 | ggplot(temp_three, aes(x = time, y = sim_data, group = clustID)) + 298 | geom_line(alpha = I(0.2)) + facet_wrap(~clust3ID) 299 | 300 | names(temp_three)[1:6] <- c("intercept", "age", "gifted", "iep", "frpl", 301 | "ell") 302 | names(temp_three)[14] <- "math_ss" 303 | names(temp_three)[15:17] <- c("time", "sid", "schid") 304 | 305 | ggplot(temp_three, aes(x = age, y = math_ss, group = sid)) + 306 | geom_line(alpha = I(0.2)) + facet_wrap(~schid) 307 | 308 | #witihnID = time, nested w/in level 2 309 | 310 | library(lme4) 311 | proof <- lmer(math_ss ~ 1 + age + gifted + 312 | iep + frpl + ell + 313 | (1 + age | sid) + 314 | (1 | schid), data = temp_three) 315 | 316 | ``` 317 | 318 | 319 | -------------------------------------------------------------------------------- /data-raw/build_data.R: -------------------------------------------------------------------------------- 1 | # Build data 2 | library(purrr) 3 | library(tidyr) 4 | 5 | ## Build example ELL baseline data 6 | ell <- read.csv("data-raw/ellDist.csv") 7 | ell <- na.omit(ell) 8 | ell <- ell %>% gather(-age, key = "race", value = "prob") 9 | 10 | ## Build example SDP/CEDS crosswalk 11 | xwalk <- read.csv("data-raw/CEDS_SDP_map.csv", stringsAsFactors = FALSE) 12 | xwalk$schema <- NA 13 | for(i in 1:nrow(xwalk)){ 14 | xwalk$schema[i] <- I(list(OpenSDPsynthR:::get_code_values(xwalk$CEDS_Option_set[i]))) 15 | } 16 | 17 | # SES data 18 | ses <- data.frame(race = c("black", "asian", "hispanic", "amerind", "white", 19 | "other", "multiracial", "hawaiian_pi"), 20 | prob = c(0.65, 0.375, 0.6, 0.4, 0.4, 0.4, 0.4, 0.4)) 21 | ses$race <- as.character(ses$race) 22 | 23 | # Program Baseline 24 | prog_baseline <- read.csv("data-raw/program_baseline.csv") 25 | prog_baseline[, 1] <- as.character(prog_baseline[, 1]) 26 | prog_baseline[, 2] <- as.character(prog_baseline[, 2]) 27 | prog_baseline[, 3] <- as.character(prog_baseline[, 3]) 28 | names(prog_baseline) <- tolower(names(prog_baseline)) 29 | prog_baseline$frpl[prog_baseline$frpl == "2"] <- "1" 30 | prog_baseline <- prog_baseline %>% group_by(ell, iep, frpl) %>% 31 | summarize(count = sum(count), prob = sum(prob)) %>% as.data.frame() 32 | # Map CEDS names and option names 33 | # prog_baseline[, 1:3] <- recode_options(prog_baseline[, 1:3], from = "SDP") 34 | # names(prog_baseline)[1:3] <- map_CEDS(names(prog_baseline)[1:3]) 35 | 36 | ## Pull in age_grade baseline 37 | age_grade <- read.csv("data-raw/age_grade_baseline.csv") 38 | names(age_grade) <- c("age", paste0("g", -1:12), "total") 39 | age_grade[, 2:15] <- round(age_grade[, 2:15] / rowSums(age_grade[, 2:15]), 5) 40 | age_grade$total <- NULL 41 | 42 | sch_names <- c("Jackson", "Wallaby", "Kendrick", "Willow Creek", "Cypress", 43 | "Dover Hills", "Milton South", "Spring Port", "Dogwood", "Topshire", 44 | "Hillside", "Donohue", "Upper Falls", "George", "Avondale", "East Valley", 45 | "Township", "Clark", "Pollock", "Duvall", "Santiago", "Hickory", 46 | "Robin", "Warren", "Alvin", "Allen", "Haskin", "Hawthorne", "Woodcliff", 47 | "Wright", "Buchanan", "Oriole", "Diamond Lake", "Walnut", "Central", 48 | "Arbor", "Gail Hill", "Southbridge", "Silver Oak", "Marsden", 49 | "Crossroads", "Meridian", "Linden", "Success", "Jefferson", "Columbia", 50 | "Falcon", "Apollo", "Hampshire", "Madison", "Briarfield", "Jubilee", 51 | "Magnolia", "Bauer", "Stone Street", "Angelea", "Riva Ridge", 52 | "Perez", "Lincoln", "Goldeneye", "Hazel", "Vidalia", "Lewis", 53 | "Freedman", "Venice", "Wintergreen", "Holden", "Beebe", "Red Hills", 54 | "Floral", "Pressley", "Winchester", "Oak Tree", "Laurel", "Kingfisher", 55 | "Verndale", "Kennedy", "Flagstaff", "Laughlin", "East Harley", 56 | "Rolling Knoll", "Elizabeth", "Meadow", "Wakeland", "Rockford", 57 | "Hummingbird", "Castle Rock", "Sugarplum", "London Lane", "Yawkey", 58 | "Sparrow", "Watercress", "Lowell", "Mountain", "Bilford", "Maple", 59 | "Ritchie", "Dalton", "Homer", "Lower Falls", "Midlands", "Pike", 60 | "Tucker", "Whitebridge", "Pinkney", "Athena", "Acorn", "Wilson", 61 | "Common Way", "Eastbridge", "Blue Hills", "Onassis", "Capital City", 62 | "Wildwood", "Noble", "John Brown", "Ferdinand", "Weeks", "Majestic", 63 | "Starlight", "Hemingway", "Spellman", "Olivero", "Cornerstone", 64 | "Van Dusen", "Emblem", "Peyton", "Pleasant Lake", "West Harley", 65 | "Einstein", "Frederick", "Middlewood", "Burton", "Graham", "Lone Pine", 66 | "Tyler", "Yellow Creek", "Sargent", "Cathedral", "Moseley", "Justice", 67 | "Avocet", "Anderson", "James", "Barclay", "Marie Curie", "Hunt Circle", 68 | "Eastbourne", "Sunset", "Seaside", "Paisley", "Jupiter", "Prentice", 69 | "Lockland", "Canyon", "Mayer", "Westerley", "Sweetville", "Lindgren", 70 | "Carmen", "Hawking", "Lindsey", "Hunnewell", "Eagle", "Holly", 71 | "Reigh Count", "Valley Way", "Navigation", "Olsen", "Daisy Hill", 72 | "Hope", "Eliot", "Coleman", "Milton North", "Sage", "Lang", "Sierra Lane", 73 | "Commander", "Rosa Parks", "Cicero", "Chickamee", "Labyrinth", 74 | "Calvin Lee", "Nightland", "Forbes", "Sterling", "Triumph", "Wellington", 75 | "MacDonald", "Englander", "Tupelo", "Beckham", "Franklin", "Astro", 76 | "Aristides", "Morris", "Shortleaf", "Cardinal", "Little Valley", 77 | "Applewood", "Roosevelt", "Marvel", "Juniper Hill", "Esperanza", 78 | "Touchstone", "Blakeville", "Shafer", "Sebastian", "Polaris", 79 | "Ponderosa", "Mount Lyon", "Lafayette", "New Beacon", "Chesterfield", 80 | "Harmony", "Rosebud", "Tremont", "Fortuna", "North Star", "Everett", 81 | "Basswood", "Islander", "Martin", "Peabody", "Northbridge", "Hanover", 82 | "Rainbow", "Bicknell", "Courtdale", "Miller", "Mulberry", "Hadley", 83 | "Greenwich", "Ridley", "Birch", "Albert", "Culver", "Chelsea", 84 | "Unity", "Brookfield", "Irving", "Acacia", "Alliance", "Douglass", 85 | "Brennan", "Clivedale", "Shaheen", "Tesla", "Young Oak", "Duncan", 86 | "Selwyn", "Kumar", "Woodland", "Cedar", "Goldfinch", "Sullivan", 87 | "Buffalo Way", "Danehill", "Simpson", "Nyquist", "Chan", "Alewife", 88 | "Galaxy", "Highland", "Marlowe", "Tower", "Alonzo", "Prairie", 89 | "Eureka", "King", "Humphrey", "Knightwood", "Gallant", "Boundbrook", 90 | "Cherry Hill", "Kirkland", "Nobscot", "Suarez", "Floyd", "Harrison", 91 | "Greenfield", "Crane", "Rhodes", "Sea Glass", "Westminster", 92 | "Horan", "Copper Cove", "Fairbanks", "Marshfield", "Ashmont", 93 | "Heron", "Henderson", "Ridge Park", "Kent", "Friendship", "Pegasus", 94 | "Garden", "Adler", "Shuster", "Dickens", "Liberty", "Edison", 95 | "Blanchard", "Camino", "Baker", "Cilian", "Speedwell", "Bootes", 96 | "Darwin", "Quincy", "Cold Springs", "Chestnut", "Secretariat", 97 | "Gateway", "Dudley", "Horace Mann", "Glenbrook", "Pompano", "Park", 98 | "Maverick", "Hyde", "Leeds", "Percival", "Davis", "Shirley", 99 | "Redwood", "Murphy", "Poplar", "Fillmore", "Thompson", "Pharaoh", 100 | "Bennett", "Kinney", "Gilbert", "Marshall", "Monarch", "Doyle", 101 | "Walker", "Clarion", "Spinelli", "Inspiration", "Neptune", "Gibson", 102 | "Independence", "Josephine", "Finnley", "Summit", "Bloomfield", 103 | "Zola", "Ventura", "Lakeshore", "Phoenix", "Honeypot", "Chateau", 104 | "Carter ", "Velocidad", "Utopia", "Owen", "Turcotte", "Sorrel Ridge", 105 | "Fawzi", "Woodpecker", "Chisholm", "Hoffman", "Lever", "Ace Hall", 106 | "Davidson", "Weston", "Forest Heights", "Finch", "Monroe", "Caldwell", 107 | "Hamilton", "Vision", "Elk Grove", "Eleanor", "Quicksilver", 108 | "Longleaf", "Chiswick", "Buckeye", "Winthrop", "Austen", "Reyes", 109 | "Juneberry", "Waldron", "Adams", "Westbridge", "Fraser", "Truman", 110 | "Old Steeple", "Turnstone", "Memorial", "Sycamore", "Sequoia", 111 | "Sylvan", "Sandy Beach", "Hathaway", "Mandalay", "Deer Corner", 112 | "Clearview", "Corcoran", "Cabot", "Lookout Point") 113 | 114 | # ps_list <- ps_list %>% select(ope8_id, city, state, name, 115 | # degrees_awarded_predominant, 116 | # size, online_only, avg_net_price_pub, pell_grant_rate, 117 | # retention_four_year_full_time, operating, 118 | # part_time_share, act_25th_pctl_cumulative, 119 | # act_75th_pctl_cumulative, sat_average_all, 120 | # sat_25th_pctl_math, sat_75th_pctl_math, 121 | # sat_25th_pctl_reading, sat_75th_pctl_reading, 122 | # sat_25th_pctl_writing, sat_75th_pctl_writing, 123 | # race_ethn_white, race_ethn_black, race_ethn_hispanic, 124 | # race_ethn_asian, race_ethn_two_or_more) 125 | # ps_list <- zap_formats(ps_list) 126 | # ps_list <- as_factor(ps_list) 127 | # ps_list <- ps_list %>% filter(!degrees_awarded_predominant %in% c("Not Classified", 128 | # "Entirely graduate-degree granting")) 129 | # ps_list <- ps_list %>% filter(size > 0) %>% filter(size < 100000) 130 | # ps_list <- ps_list %>% filter(operating == 1) %>% select(-operating) 131 | 132 | college_scorecard <- read.csv("data-raw/college_scorecard_2013.csv", 133 | stringsAsFactors = FALSE) 134 | 135 | 136 | ps_names <- c("COMMUNITY COLLEGE 400", "DEF COMMUNITY COLLEGE", "D COMMUNITY COLLEGE", 137 | "UVW COMMUNITY COLLEGE", "COMMUNITY COLLEGE C", "A COMMUNITY COLLEGE", 138 | "COMMUNITY COLLEGE 4", "E COMMUNITY COLLEGE", "COMMUNITY COLLEGE 500", 139 | "COMMUNITY COLLEGE B", "HKL COMMUNITY COLLEGE", "XYZ COMMUNITY COLLEGE", 140 | "COMMUNITY COLLEGE A", "COMMUNITY COLLEGE 1", "B COMMUNITY COLLEGE", 141 | "MNO COMMUNITY COLLEGE", "C COMMUNITY COLLEGE", "COMMUNITY COLLEGE 6", 142 | "PRIVATE TECHNICAL INSTITUTE", "COMMUNITY COLLEGE 100", "COMMUNITY COLLEGE 7", 143 | "COMMUNITY COLLEGE 2", "COMMUNITY COLLEGE 200", "GHI COMMUNITY COLLEGE", 144 | "COMMUNITY COLLEGE 3", "PQR COMMUNITY COLLEGE", "COMMUNITY COLLEGE 5", 145 | "ABC COMMUNITY COLLEGE", "COMMUNITY COLLEGE 300", "STU COMMUNITY COLLEGE", 146 | "UNIVERSITY XYZ", "COLLEGE OF XYZ", "UNIVERSITY OF C", "ABC STATE UNIVERSITY", 147 | "STATE UNIVERSITY - NORTH CAMPUS", "PRIVATE COLLEGE A", "UNIVERSITY A", 148 | "UNIVERSITY - MAIN CAMPUS", "PUBLIC UNIVERSITY 1", "STATE UNIVERSITY - EAST CAMPUS", 149 | "PRIVATE COLLEGE B", "UNIVERSITY OF B", "UNIVERSITY - CAMPUS 2", 150 | "STATE UNIVERSITY - WEST CAMPUS", "AB COLLEGE", "UNIVERSITY OF GH", 151 | "COLLEGE OF ABC", "COLLEGE OF DEFGH", "PUBLIC UNIVERSITY 2", 152 | "STATE UNIVERSITY - SOUTH CAMPUS", "COMMUNITY COLLEGE 400", 153 | "DEF COMMUNITY COLLEGE", "COLLEGE OF XYZ", "D COMMUNITY COLLEGE", 154 | "UVW COMMUNITY COLLEGE", "UNIVERSITY XYZ", "A COMMUNITY COLLEGE", 155 | "COMMUNITY COLLEGE 4", "E COMMUNITY COLLEGE", "COMMUNITY COLLEGE 500", 156 | "ABC STATE UNIVERSITY", "COMMUNITY COLLEGE B", "STATE UNIVERSITY - NORTH CAMPUS", 157 | "HKL COMMUNITY COLLEGE", "PRIVATE COLLEGE A", "UNIVERSITY A", 158 | "UNIVERSITY - MAIN CAMPUS", "PUBLIC UNIVERSITY 1", "STATE UNIVERSITY - EAST CAMPUS", 159 | "PRIVATE COLLEGE B", "UNIVERSITY OF B", "UNIVERSITY - CAMPUS 2", 160 | "COMMUNITY COLLEGE 1", "STATE UNIVERSITY - WEST CAMPUS", "XYZ COMMUNITY COLLEGE", 161 | "AB COLLEGE", "B COMMUNITY COLLEGE", "MNO COMMUNITY COLLEGE", 162 | "UNIVERSITY OF GH", "C COMMUNITY COLLEGE", "COMMUNITY COLLEGE 6", 163 | "COMMUNITY COLLEGE A", "COLLEGE OF ABC", "COLLEGE OF DEFGH", 164 | "PUBLIC UNIVERSITY 2", "PRIVATE TECHNICAL INSTITUTE", "COMMUNITY COLLEGE 7", 165 | "UNIVERSITY OF C", "COMMUNITY COLLEGE 2", "COMMUNITY COLLEGE 200", 166 | "GHI COMMUNITY COLLEGE", "PQR COMMUNITY COLLEGE", "COMMUNITY COLLEGE C", 167 | "COMMUNITY COLLEGE 5", "COMMUNITY COLLEGE 100", "ABC COMMUNITY COLLEGE", 168 | "COMMUNITY COLLEGE 3", "STATE UNIVERSITY - SOUTH CAMPUS", "COMMUNITY COLLEGE 300", 169 | "STU COMMUNITY COLLEGE") 170 | 171 | # saveRDS(xwalk, "data/sdp_ceds_map.rds") 172 | devtools::use_data(ell, xwalk, ses, prog_baseline, age_grade, sch_names, 173 | college_scorecard, 174 | ps_names, 175 | internal = TRUE, overwrite = TRUE) 176 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # Expand DF into grid, from SO 2 | # 3 | #' Expand dataframe into a complete grid 4 | #' 5 | #' @param ... dataframe 6 | #' @details From SO {\url{http://stackoverflow.com/questions/11693599/alternative-to-expand-grid-for-data-frames}} 7 | #' @return an expanded data frame 8 | #' @export 9 | expand_grid_df <- function(...){ 10 | Reduce(function(...) merge(..., by=NULL), list(...)) 11 | } 12 | 13 | ##' Function to calculate age from date of birth. 14 | ##' @description his function calculates age in days, months, or years from a 15 | ##' date of birth to another arbitrary date. This returns a numeric vector in 16 | ##' the specified units. 17 | ##' @param dob a vector of class \code{Date} representing the date of birth/start date 18 | ##' @param enddate a vector of class Date representing the when the observation's 19 | ##' age is of interest, defaults to current date. 20 | ##' @param units character, which units of age should be calculated? allowed values are 21 | ##' days, months, and years 22 | ##' @param precise logical indicating whether or not to calculate with leap year 23 | ##' and leap second precision 24 | ##' @return A numeric vector of ages the same length as the dob vector 25 | ##' @source This function was developed in part from this response on the R-Help mailing list. 26 | ##' @seealso See also \code{\link{difftime}} which this function uses and mimics 27 | ##' some functionality but at higher unit levels. 28 | ##' @author Jason P. Becker 29 | ##' @export 30 | ##' @examples 31 | ##' a <- as.Date(seq(as.POSIXct('1987-05-29 018:07:00'), len=26, by="21 day")) 32 | ##' b <- as.Date(seq(as.POSIXct('2002-05-29 018:07:00'), len=26, by="21 day")) 33 | ##' 34 | ##' age <- age_calc(a, units='years') 35 | ##' age 36 | ##' age <- age_calc(a, units='months') 37 | ##' age 38 | ##' age <- age_calc(a, as.Date('2005-09-01')) 39 | ##' age 40 | age_calc <- function (dob, enddate = Sys.Date(), 41 | units = "months", precise = TRUE){ 42 | if (!inherits(dob, "Date") | 43 | !inherits(enddate, "Date")) { 44 | stop("Both dob and enddate must be Date class objects") 45 | } 46 | if (any(enddate < dob)) { 47 | stop("End date must be a date after date of birth") 48 | } 49 | start <- as.POSIXlt(dob) 50 | end <- as.POSIXlt(enddate) 51 | if (precise) { 52 | start_is_leap <- 53 | ifelse(start$year %% 400 == 0, 54 | TRUE, 55 | ifelse( 56 | start$year %% 100 == 57 | 0, 58 | FALSE, 59 | ifelse(start$year %% 4 == 0, TRUE, FALSE) 60 | )) 61 | end_is_leap <- 62 | ifelse(end$year %% 400 == 0, 63 | TRUE, 64 | ifelse(end$year %% 100 == 65 | 0, FALSE, ifelse(end$year %% 66 | 4 == 0, TRUE, FALSE))) 67 | } 68 | if (units == "days") { 69 | result <- difftime(end, start, units = "days") 70 | } 71 | else if (units == "months") { 72 | months <- sapply(mapply( 73 | seq, 74 | as.POSIXct(start), 75 | as.POSIXct(end), 76 | by = "months", 77 | SIMPLIFY = FALSE 78 | ), 79 | length) - 1 80 | if (precise) { 81 | month_length_end <- ifelse(end$mon == 1 & end_is_leap, 82 | 29, 83 | ifelse(end$mon == 1, 28, ifelse(end$mon %in% 84 | c(3, 5, 8, 10), 30, 31))) 85 | month_length_prior <- ifelse((end$mon - 1) == 1 & 86 | start_is_leap, 87 | 29, 88 | ifelse((end$mon - 1) == 1, 89 | 28, ifelse((end$mon - 1) %in% c(3, 5, 8, 10), 90 | 30, 31))) 91 | month_frac <- ifelse( 92 | end$mday > start$mday, 93 | (end$mday - 94 | start$mday) / month_length_end, 95 | ifelse( 96 | end$mday < 97 | start$mday, 98 | (month_length_prior - start$mday) / month_length_prior + 99 | end$mday / 100 | month_length_end, 101 | 0 102 | ) 103 | ) 104 | result <- months + month_frac 105 | } 106 | else { 107 | result <- months 108 | } 109 | } 110 | else if (units == "years") { 111 | years <- sapply(mapply( 112 | seq, 113 | as.POSIXct(start), 114 | as.POSIXct(end), 115 | by = "years", 116 | SIMPLIFY = FALSE 117 | ), 118 | length) - 1 119 | if (precise) { 120 | start_length <- ifelse(start_is_leap, 366, 365) 121 | end_length <- ifelse(end_is_leap, 366, 365) 122 | start_day <- ifelse(start_is_leap & start$yday >= 123 | 60, start$yday - 1, start$yday) 124 | end_day <- ifelse(end_is_leap & end$yday >= 60, end$yday - 125 | 1, end$yday) 126 | year_frac <- ifelse( 127 | start_day < end_day, 128 | (end_day - 129 | start_day) / end_length, 130 | ifelse( 131 | start_day > end_day, 132 | (start_length - start_day) / 133 | start_length + end_day / end_length, 134 | 0 135 | ) 136 | ) 137 | result <- years + year_frac 138 | } 139 | else { 140 | result <- years 141 | } 142 | } 143 | else { 144 | stop("Unrecognized units. Please choose years, months, or days.") 145 | } 146 | return(result) 147 | } 148 | 149 | 150 | #' Generate positive random numbers that sum to a value 151 | #' 152 | #' @param N number of numbers to generate 153 | #' @param M constraint on the sum 154 | #' @param sd standard deviation of values 155 | #' 156 | #' @return a vector of numerics, length N 157 | #' @importFrom stats rlnorm 158 | #' @export 159 | #' @examples 160 | #' out <- rand_vect_cont(N = 10, M = 2, sd = 1) 161 | #' sum(out) 162 | #' length(out) 163 | rand_vect_cont <- function(N, M, sd = 1) { 164 | vec <- rlnorm(N, M/N, sd) 165 | vec / sum(vec) * M 166 | } 167 | 168 | #' Convert auto-generated grades to better grades 169 | #' 170 | #' @param x a vector of grades formatted with g 171 | #' 172 | #' @return a character vector of grades 173 | convert_grade <- function(x){ 174 | x <- as.character(x) 175 | x[x == "g-1"] <- "PK" 176 | x[x == "g0"] <- "KG" 177 | x[x %in% paste0("g", 1:12)] <- sapply(x[x %in% paste0("g", 1:12)], 178 | function(x) gsub("g", "", x)) 179 | x <- as.character(unlist(x)) 180 | return(x) 181 | } 182 | 183 | #' Generate simulated predictions from a linear model that account for 184 | #' model and parameter error 185 | #' 186 | #' @param object an object of class lm or glm 187 | #' @param nsim number of simulations per observation to generate 188 | #' @param newdata dataframe containing the observations to generate predictions 189 | #' for 190 | #' @param resid_error should the model residual error be added to predictions, 191 | #' default is FALSE 192 | #' @importFrom mvtnorm rmvnorm 193 | #' 194 | #' @return a matrix of predictions nrow(newdata) x nsim columns 195 | #' @export 196 | better_sim.lm <- function(object, nsim, newdata, resid_error = FALSE){ 197 | newdata <- as.matrix(newdata) 198 | se <- vcov(object) 199 | eff <- coef(object) 200 | coefs <- mvtnorm::rmvnorm(nsim, mean = eff, sigma = se) 201 | if(ncol(newdata) == ncol(coefs)-1){ 202 | warning("One too few variables in newdata, appending intercept to front") 203 | newdata <- cbind("Intercept" = 1, newdata) 204 | } else if(ncol(newdata) != ncol(coefs)){ 205 | stop("Wrong dimensions in newdata") 206 | } 207 | preds <- as.matrix(newdata) %*% t(coefs) 208 | error <- rnorm(length(preds), mean = 0, sd = sigma(object)) 209 | if(resid_error){ 210 | preds <- preds + error 211 | } 212 | return(preds) 213 | } 214 | 215 | 216 | #' Covnert a matrix to a row-wise transition matrix 217 | #' 218 | #' @param matrix a matrix tof counts 219 | #' 220 | #' @return matrix M divided by the sum of its rows 221 | #' @export 222 | #' 223 | #' @examples 224 | #' base_mat <- structure(c(44985, 740, 781, 7640), .Dim = c(2L, 2L), 225 | #' .Dimnames = list(c("0", "1"), c("0", "1"))) 226 | #' tm_convert(base_mat) 227 | tm_convert <- function(matrix){ 228 | stopifnot("matrix" %in% class(matrix)) 229 | out <- matrix / rowSums(matrix) 230 | return(out) 231 | } 232 | 233 | #' Unscale a scaled variable 234 | #' 235 | #' @param x numeric vector that has been scaled 236 | #' @param mean a numeric, the mean to add to x 237 | #' @param sd a numeric, the standardized factor to divide x by 238 | #' 239 | #' @return x rescaled with mean and sd specified by the user 240 | #' @export 241 | unscale <- function(x, mean, sd) { 242 | y <- (x / sd) + mean 243 | return(y) 244 | } 245 | 246 | #' Reliabily rescale numerics with missingness 247 | #' 248 | #' @param x a numeric vector 249 | #' 250 | #' @return x mean centered and divided by it's standard deviation 251 | #' @details If \code{sd(x)} is undefined, this returns a zero 252 | #' @export 253 | rescale <- function(x){ 254 | y <- (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE) 255 | y[is.na(y)] <- 0 256 | return(y) 257 | } 258 | 259 | #' Recode numeric data into plausible credit data 260 | #' 261 | #' @param x numeric, vector of numerics to be truncated 262 | #' @param top integer, maximum allowable value 263 | #' @details enforces no negative numbers and truncates data at a user specified 264 | #' maximum 265 | #' @return a vector, length of x, with truncated values 266 | #' @export 267 | recode_credits <- function(x, top = 6){ 268 | y <- num_clip(x, min = 0, max = top) 269 | return(y) 270 | } 271 | 272 | 273 | #' Rescaling variables in groups 274 | #' 275 | #' @param data a dataframe containing variables you wish to rescale 276 | #' @param var name of the variable to be rescaled 277 | #' @param group_var character vector of the grouping terms 278 | #' @param newvar optional character vector for the name of the new rescaled variable 279 | #' @return data with the newvar appended 280 | #' @export 281 | #' @importFrom lazyeval interp 282 | group_rescale <- function(data, var, group_var, newvar=NULL){ 283 | if(is.null(newvar)){ 284 | newvar <- var 285 | } 286 | varval <- lazyeval::interp(~ OpenSDPsynthR::rescale(z), z = as.name(var)) 287 | data <- data %>% group_by_(group_var) %>% 288 | mutate_(.dots = setNames(list(varval), newvar)) 289 | return(data) 290 | } 291 | 292 | #' Convert a character representation of grades to numeric 293 | #' 294 | #' @param grade a character vector of grades with character labels 295 | #' 296 | #' @return a numeric vector, length of grade, representing grade levels as numbers 297 | #' @export 298 | num_grade <- function(grade){ 299 | grade[grade == "g13"] <- "13" 300 | grade[grade == "KG"] <- "0" 301 | grade[grade == "PK"] <- "-1" 302 | grade <- as.numeric(grade) 303 | return(grade) 304 | } 305 | 306 | #' Convert NA values to 0 in a vector 307 | #' 308 | #' @param x a numeric vector containing NAs 309 | #' 310 | #' @return a numeric vector where all NA values are 0 311 | #' @export 312 | zeroNA <- function(x){ 313 | x[is.na(x)] <- 0 314 | return(x) 315 | } 316 | 317 | #' Clip a vector to be between a minimum and a maximum 318 | #' 319 | #' @param x a numeric vector 320 | #' @param min numeric, a floor 321 | #' @param max numeric, a ceiling 322 | #' 323 | #' @return x, truncated to be between min and max 324 | #' @export 325 | num_clip <- function(x, min, max){ 326 | x <- ifelse(x > max, max, x) 327 | x <- ifelse(x < min, min, x) 328 | return(x) 329 | } 330 | 331 | #' Validate probability list formatting and structure 332 | #' 333 | #' @param list list that is to be passed as a probability list to simulation functions 334 | #' 335 | #' @return Logical TRUE if list is valid, error if list is not valid. 336 | #' @export 337 | validate_probability_list <- function(list){ 338 | stopifnot(class(list) == "list") 339 | if(!"GROUPVARS" %in% names(list)){ 340 | stop("List must contain a named element GROUPVARS that defines the variable 341 | in the data to assign different probability functions, e.g. Sex or Race") 342 | } 343 | if(!class(list$GROUPVARS) %in% c("character", "factor")){ 344 | stop("List element GROUPVARS must be a character or factor which can be 345 | coerced to a character which defines the levels of a grouping term.") 346 | } 347 | if(!all(unlist(lapply(gifted_list[-which("GROUPVARS" == names(gifted_list))], class)) == "list")){ 348 | stop("List elements not GROUPVARS must each be a list.") 349 | } 350 | if(!all(unlist( 351 | lapply( 352 | lapply( 353 | gifted_list[-which("GROUPVARS" == names(gifted_list))], names), 354 | function(x) x == c("f", "pars"))))){ 355 | stop("All list elements except GROUPVARS must contain exactly two elements, 356 | 'f' and 'pars'.") 357 | } 358 | message("List validated.") 359 | return(TRUE) 360 | } 361 | 362 | 363 | 364 | #' Validate outcome simulation list formatting and structure 365 | #' 366 | #' @param list list that is to be passed as a parameter list to outcome simulation functions 367 | #' 368 | #' @return Logical TRUE if list is valid, error if list is not valid. 369 | #' @export 370 | validate_sim_parameter <- function(list){ 371 | stopifnot(class(list) == "list") 372 | if(length(names(list)) > 9){ 373 | if(any(!c("fixed", "random", "random3", "cor_vars", "fixed_param", "fact_vars", 374 | "random_param", "random_param3", "cov_param", "unbalCont", "unbalCont3", 375 | "unbal", "k", "n", "p", "error_var", "with_err_gen", "lvl1_err_params", 376 | "data_str") %in% names(list))){ 377 | stop("A three level sim list must have named elements: fixed, random, 378 | random3, cor_vars, fixed_param, fact_vars, random_param, random_param3, 379 | cov_param, unbalCont, unbalCont3, unbal, k, n, p, error_var, 380 | with_err_gen, lvl1_err_params, and data_str") 381 | } 382 | } else{ 383 | if(any(!c("fixed", "random_var", "cov_param", "cor_vars", 384 | "fixed_param", "ngrps", "unbalanceRange") %in% names(list))){ 385 | stop("List must have named elements 'fixed', 'random_var', 'cov_param', 386 | 'cor_vars', 'fixed_param', 'ngrps', and 'unbalanceRange'") 387 | } 388 | } 389 | if(class(list$fixed) != "formula"){ 390 | stop("Fixed must be a formula object defining the fixed component of the 391 | outcome formula.") 392 | } 393 | if(attr(terms(list$fixed), "response") > 0){ 394 | stop("Formula must not have a response or LHS variable defined.") 395 | } 396 | K <- length(attr(terms(list$fixed), "term.labels")) 397 | if(length(list$random_var > 0)){ 398 | if(class(list$random_var) != "numeric"){ 399 | stop("Random variance must be expressed as a numeric value. Consider a value 400 | in the range of 0.01 to 0.2") 401 | } 402 | } 403 | if(length(list$random_var) > 1){ 404 | stop("Random variance element of the list can only be length 1.") 405 | } 406 | if(class(list$cov_param) != "list"){ 407 | stop("cov_param element must be a list that defines the parameters of the 408 | covariate simulation.") 409 | } 410 | if(!all(c("dist_fun", "var_type", "opts") %in% names(list$cov_param))){ 411 | stop("cov_param list must have three elements, dist_fun, var_type, and opts") 412 | } 413 | if("time" %in% attr(terms(list$fixed), "term.labels")){ 414 | if(length(list$cov_param$dist_fun) != K-1){ 415 | stop("cov_param$dist_fun must have one fewer element than terms in list$fixed") 416 | } 417 | K_star <- K-1 # adjust for time variable being dropped out in simglm 418 | if(length(list$cor_vars) != (K_star^2 - K_star)/2){ 419 | cor_elements <- (K_star^2 - K_star) /2 420 | stop("Correlation vector cor_vars must specify ", cor_elements, " correlation 421 | matrix elements.") 422 | } 423 | } else{ 424 | if(length(list$cov_param$dist_fun) != K){ 425 | stop("cov_param$dist_fun must have the same length as the terms in list$fixed") 426 | } 427 | if(length(list$cor_vars) != (K^2 - K)/2){ 428 | cor_elements <- (K^2 - K) /2 429 | stop("Correlation vector cor_vars must specify ", cor_elements, " correlation 430 | matrix elements.") 431 | } 432 | } 433 | if(length(list$fixed_param) != K+1){ 434 | stop("List element fixed_param must define beta coefficient values for ", K+1, 435 | " parameters specified in the fixed formula and one intercept.") 436 | } 437 | if(class(list$fixed_param) != "numeric"){ 438 | stop("Please define fixed parameters as numeric.") 439 | } 440 | # For two level simulation this is the specification 441 | if("ngrps" %in% names(list)){ 442 | if(length(list$ngrps) != 1){ 443 | stop("Element ngrps must be length 1 numeric or integer") 444 | } 445 | if(!class(list$ngrps) %in% c("numeric", "integer")){ 446 | stop("Element ngrps must be an integer or numeric") 447 | } 448 | } 449 | if("unbalanceRange" %in% names(list)){ 450 | if(length(list$unbalanceRange) != 2){ 451 | stop("Element unbalanceRange must be length 2, numeric or integer") 452 | } 453 | if(!class(list$unbalanceRange) %in% c("numeric", "integer")){ 454 | stop("unbalanceRange must be numeric or integer") 455 | } 456 | } 457 | message("Simulation parameters validated.") 458 | return(TRUE) 459 | } 460 | 461 | 462 | #' Simple error assertions 463 | #' 464 | #' @param expr logical expression 465 | #' @param error character, error message 466 | #' 467 | #' @return An error message, or nothing 468 | assert <- function (expr, error) { 469 | if (! expr) stop(error, call. = FALSE) 470 | } 471 | 472 | 473 | #' Sum matrix elements in a list 474 | #' 475 | #' @param l a list 476 | #' 477 | #' @return a list of summed matrix elements 478 | #' @importFrom magrittr %>% 479 | m_sum <- function(l) { 480 | Reduce(l, `+`) %>% list() 481 | } 482 | 483 | #' Tidy a two-state markov sequence for output 484 | #' 485 | #' @param seq a vector of sequence elements, with only two states 486 | #' @param states a vector of length two naming both possible states 487 | #' 488 | #' @return a data.frame 489 | #' @importFrom markovchain createSequenceMatrix 490 | #' @export 491 | #' @examples 492 | #' tidy_sequence(seq = c("Yes", "No", "No", "No", "Yes", "Yes"), 493 | #' states = c("Yes", "No")) 494 | tidy_sequence <- function(seq, states){ 495 | stopifnot(length(states) == 2) 496 | tmp <- markovchain::createSequenceMatrix(seq, possibleStates = states) 497 | newNames <- outer(dimnames(tmp)[[1]], dimnames(tmp)[[2]], paste, sep = "-") 498 | dim(newNames) <- NULL 499 | dim(tmp) <- c(1, 4) 500 | tmp <- as.data.frame(tmp) 501 | colnames(tmp) <- newNames 502 | return(tmp) 503 | } 504 | 505 | #' Compute conditional Markov sequences in a pipeline 506 | #' 507 | #' @param groupname variable that identifies the list element to use 508 | #' @param n number of elements to generate, usually defined by \link[dplyr]{n} in dplyr 509 | #' @param lst probability list that contains function and function parameters 510 | #' @param ... additional arguments passed into the function 511 | #' @return A sequence generated by functions in the probability list 512 | #' @export 513 | markov_cond_list <- function(groupname, n, lst, ...){ 514 | # TODO: add checks to lst 515 | do.call(lst[[groupname]]$f, 516 | c(list(n = n), lst[[groupname]]$pars, ...), 517 | quote = FALSE) 518 | } 519 | 520 | -------------------------------------------------------------------------------- /inst/CG_Analyze_Example.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "SDP College Going Analyze Example" 3 | author: "Jared E. Knowles" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{CG Analyze} 8 | %\VignetteEncoding{UTF-8} 9 | %\VignetteEngine{knitr::rmarkdown} 10 | editor_options: 11 | chunk_output_type: console 12 | --- 13 | 14 | ```{r, echo = FALSE} 15 | knitr::opts_chunk$set( 16 | collapse = TRUE, 17 | comment = "#>", 18 | fig.path = "tools/figs/README-", 19 | message = FALSE, 20 | warning = FALSE, 21 | fig.width =8, fig.height = 6.25 22 | ) 23 | ``` 24 | 25 | ## Open SDP Data 26 | 27 | First, we need to generate data suitable to conducting a college-going analysis. 28 | 29 | ```{r echo=FALSE, warning=FALSE, message=FALSE} 30 | library(OpenSDPsynthR) 31 | library(ggplot2) 32 | library(tidyr) 33 | library(scales) 34 | library(magrittr) 35 | 36 | 37 | simouts <- simpop(nstu = 40000, seed = 8763434, 38 | control = sim_control(nschls = 12L, minyear = 1996, 39 | n_postsec = 50L, 40 | n_cohorts = 3, 41 | maxyear = 2017)) 42 | # Check hs_outcomes and g12_cohort 43 | 44 | # 45 | # // Ninth grade cohorts you can observe persisting to the second year of college 46 | # global chrt_ninth_begin_persist_yr2 = 2008 47 | # global chrt_ninth_end_persist_yr2 = 2010 48 | # 49 | # // Ninth grade cohorts you can observe graduating high school on time 50 | # global chrt_ninth_begin_grad = 2013 51 | # global chrt_ninth_end_grad = 2015 52 | # 53 | # // Ninth grade cohorts you can observe graduating high school one year late 54 | # global chrt_ninth_begin_grad_late = 2012 55 | # global chrt_ninth_end_grad_late = 2014 56 | # 57 | # // High school graduation cohorts you can observe enrolling in college the fall after graduation 58 | # global chrt_grad_begin = 2009 59 | # global chrt_grad_end = 2011 60 | # 61 | # // High school graduation cohorts you can observe enrolling in college two years after hs graduation 62 | # global chrt_grad_begin_delayed = 2008 63 | # global chrt_grad_end_delayed = 2010 64 | 65 | ``` 66 | 67 | 68 | ## Attainment 69 | 70 | ### Overall Progression 71 | 72 | ```{r} 73 | plotdf <- filter(cg_data, chrt_ninth >= 2004 & 74 | chrt_ninth <= 2008) %>% 75 | filter(!is.na(ontime)) 76 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0) 77 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0) 78 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 & 79 | plotdf$enroll_any_yr2 == 1) 80 | 81 | agencyData <- plotdf %>% 82 | summarize(grad = mean(grad), 83 | seamless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE), 84 | second_year_persisters = mean(second_year_persisters, na.rm=TRUE), 85 | N = n()) 86 | agencyData$school_name <- "AGENCY AVERAGE" 87 | # // 2. Calculate the mean of each outcome variable by first high school attended 88 | schoolData <- plotdf %>% group_by(first_hs_code) %>% 89 | summarize(grad = mean(grad), 90 | seamless_transitioners_any = mean(seamless_transitioners_any, 91 | na.rm=TRUE), 92 | second_year_persisters = mean(second_year_persisters, na.rm=TRUE), 93 | N = n()) 94 | ## high school attended 95 | names(schoolData)[1] <- "school_name" 96 | # // 3. Identify the agency maximum values for each of the three outcome variables 97 | maxSchool <- schoolData %>% summarize_all(.funs = funs("max")) 98 | maxSchool$school_name <- "AGENCY MAX HS" 99 | # // 4. Identify the agency minimum values for each of the three outcome variables 100 | minSchool <- schoolData %>% summarize_all(.funs = funs("min")) 101 | minSchool$school_name <- "AGENCY MIN HS" 102 | # // 5. Append the three tempfiles to the school-level file loaded into R 103 | schoolData <- bind_rows(schoolData, agencyData, 104 | minSchool, maxSchool) 105 | rm(agencyData, minSchool, maxSchool) 106 | # // Step 6: Prepare to graph the results 107 | library(tidyr) 108 | schoolData$cohort <- 1 109 | schoolData <- schoolData %>% gather(key = outcome, 110 | value = measure, -N, -school_name) 111 | schoolData$subset <- grepl("AGENCY", schoolData$school_name) 112 | library(ggplot2) 113 | library(scales) 114 | schoolData$outcome[schoolData$outcome == "cohort"] <- "Ninth Graders" 115 | schoolData$outcome[schoolData$outcome == "grad"] <- "On-time Graduates" 116 | schoolData$outcome[schoolData$outcome == "seamless_transitioners_any"] <- 117 | "Seamless College Transitioner" 118 | schoolData$outcome[schoolData$outcome == "second_year_persisters"] <- 119 | "Second Year Persisters" 120 | 121 | ``` 122 | 123 | ```{r} 124 | ggplot(schoolData[schoolData$subset,], 125 | aes(x = outcome, y = measure, group = school_name, 126 | color = school_name, linetype = school_name)) + 127 | geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) + 128 | geom_text(aes(label = round(measure * 100, 1)), vjust = -0.8, hjust = -0.25, 129 | color = I("black")) + 130 | scale_y_continuous(limits = c(0, 1), label = percent) + 131 | theme_bw() + theme(legend.position = c(0.825, 0.825)) + 132 | guides(color = guide_legend("", keywidth = 6, 133 | label.theme = element_text(face = "bold", 134 | size = 8, 135 | angle = 0)), 136 | linetype = "none") + 137 | labs(y = "Percent of Ninth Graders", 138 | title = "Student Progression from 9th Grade Through College", 139 | subtitle = "Agency Average", x = "", 140 | caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n", 141 | "Postsecondary enrollment outcomes from NSC matched records. \n", 142 | "All other data from Agency administrative records.")) 143 | ``` 144 | 145 | ### Progression by Race and Ethnicity 146 | 147 | ```{r} 148 | plotdf <- filter(cg_data, chrt_ninth >= 2004 & 149 | chrt_ninth <= 2008)%>% 150 | filter(!is.na(ontime)) 151 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0) 152 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0) 153 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 & 154 | plotdf$enroll_any_yr2 == 1) 155 | 156 | # // Step 3: Create agency-level average outcomes 157 | progressRace <- plotdf %>% group_by(race_ethnicity) %>% 158 | summarize(grad = mean(grad), 159 | seamless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE), 160 | second_year_persisters = mean(second_year_persisters, na.rm=TRUE), N = n()) 161 | # // Step 4: Reformat the data for plotting 162 | progressRace$cohort <- 1 163 | progressRace <- progressRace %>% gather(key = outcome, 164 | value = measure, -N, -race_ethnicity) 165 | # // Step 5: Recode variables for plot-friendly labels 166 | progressRace$outcome[progressRace$outcome == "cohort"] <- "Ninth Graders" 167 | progressRace$outcome[progressRace$outcome == "grad"] <- "On-time Graduates" 168 | progressRace$outcome[progressRace$outcome == "seameless_transitioners_any"] <- 169 | "Seamless College Transitioner" 170 | progressRace$outcome[progressRace$outcome == "second_year_persisters"] <- 171 | "Second Year Persisters" 172 | progressRace$subset <- ifelse(progressRace$race_ethnicity %in% 173 | c("Black or African American", "White", "Asian", "Hispanic or Latino Ethnicity"), 174 | TRUE, FALSE) 175 | 176 | ``` 177 | 178 | ```{r} 179 | ggplot(progressRace[progressRace$subset,], 180 | aes(x = outcome, y = measure, group = race_ethnicity, 181 | color = race_ethnicity, linetype = race_ethnicity)) + 182 | geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) + 183 | geom_text(aes(label = round(measure * 100, 1)), vjust = -0.8, 184 | hjust = -0.25, color = I("black")) + 185 | scale_y_continuous(limits = c(0, 1), label = percent) + 186 | theme_bw() + theme(legend.position = c(0.825, 0.825)) + 187 | guides(color = guide_legend("", keywidth = 6, 188 | label.theme =element_text(face = "bold", size = 8, 189 | angle = 0)), linetype = "none") + 190 | labs(y = "Percent of Ninth Graders", 191 | title = "Student Progression from 9th Grade Through College", 192 | subtitle = "By Student Race/Ethnicity", x = "", 193 | caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n", 194 | "Postsecondary enrollment outcomes from NSC matched records. \n", 195 | "All other data from Agency administrative records.")) 196 | ``` 197 | 198 | ### Progression by Race/Ethnicity by FRL 199 | 200 | 201 | ```{r} 202 | plotdf <- filter(cg_data, chrt_ninth >= 2004 & 203 | chrt_ninth <= 2008)%>% 204 | filter(!is.na(ontime)) 205 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0) 206 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0) 207 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 & 208 | plotdf$enroll_any_yr2 == 1) 209 | plotdf <- plotdf %>% filter(frpl_ever_hs == 0) 210 | 211 | progressRaceFRL <- plotdf %>% group_by(race_ethnicity) %>% 212 | summarize(grad = mean(grad), 213 | seameless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE), 214 | second_year_persisters = mean(second_year_persisters, na.rm=TRUE), 215 | N = n()) 216 | # // Step 5: Reformat the data file so that one variable contains all the 217 | # outcomes of interest 218 | progressRaceFRL %<>% filter(N >= 20) 219 | # // Step 6: Prepare to graph the results 220 | ## Reshape the data 221 | progressRaceFRL$cohort <- 1 222 | progressRaceFRL <- progressRaceFRL %>% 223 | gather(key = outcome,value = measure, -N, -race_ethnicity) 224 | 225 | ## Recode the variables for plot friendly labels 226 | # // Step 5: Recode variables for plot-friendly labels 227 | progressRaceFRL$outcome[progressRaceFRL$outcome == "cohort"] <- "Ninth Graders" 228 | progressRaceFRL$outcome[progressRaceFRL$outcome == "grad"] <- "On-time Graduates" 229 | progressRaceFRL$outcome[progressRaceFRL$outcome == "seameless_transitioners_any"] <- 230 | "Seamless College Transitioner" 231 | progressRaceFRL$outcome[progressRaceFRL$outcome == "second_year_persisters"] <- 232 | "Second Year Persisters" 233 | progressRaceFRL$subset <- ifelse(progressRaceFRL$race_ethnicity %in% 234 | c("Black or African American", "White", "Asian", "Hispanic or Latino Ethnicity"), 235 | TRUE, FALSE) 236 | ``` 237 | 238 | 239 | 240 | ```{r} 241 | ggplot( 242 | progressRaceFRL[progressRaceFRL$subset, ], 243 | aes( 244 | x = outcome, 245 | y = measure, 246 | group = race_ethnicity, 247 | color = race_ethnicity, 248 | linetype = race_ethnicity 249 | ) 250 | ) + 251 | geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) + 252 | geom_text( 253 | aes(label = round(measure * 100, 1)), 254 | vjust = -0.8, 255 | hjust = -0.25, 256 | color = I("black") 257 | ) + 258 | scale_y_continuous(limits = c(0, 1), label = percent) + 259 | theme_bw() + theme(legend.position = c(0.825, 0.825)) + 260 | guides(color = guide_legend( 261 | "", 262 | keywidth = 6, 263 | label.theme = element_text(face = "bold", size = 8, angle = 0) 264 | ), 265 | linetype = "none") + 266 | labs( 267 | y = "Percent of Ninth Graders", 268 | title = "Student Progression from 9th Grade Through College", 269 | subtitle = paste0( 270 | c( 271 | "Among Students Qualifying for Free or Reduced Price Lunch \n", 272 | "By Student Race/Ethnicity" 273 | ) 274 | ), 275 | x = "", 276 | caption = paste0( 277 | "Sample: 2004-2005 Agency first-time ninth graders. \n", 278 | "Postsecondary enrollment outcomes from NSC matched records.\n", 279 | "All other data from Agency administrative records." 280 | ) 281 | ) 282 | ``` 283 | 284 | ### Progression by On-Track Status 285 | 286 | ```{r} 287 | plotdf <- filter(cg_data, chrt_ninth >= 2004 & 288 | chrt_ninth <= 2008) 289 | 290 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0) 291 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0) 292 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 & 293 | plotdf$enroll_any_yr2 == 1) 294 | 295 | # // Step 3: Generate on track indicators that take into account students’ GPAs 296 | # upon completion of their first year in high school 297 | plotdf$ot <- NA 298 | plotdf$ot[plotdf$ontrack_yr1 == 0] <- "Off-Track to Graduate" 299 | # Check for correctness 300 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 < 3 & 301 | !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA < 3.0" 302 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 >= 3 & 303 | !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA >= 3.0" 304 | # // Step 4: Calculate aggregates for the Agency by on track status 305 | progressTrack <- plotdf %>% group_by(ot) %>% 306 | summarize(grad = mean(grad), 307 | seamless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE), 308 | second_year_persisters = mean(second_year_persisters, na.rm=TRUE), 309 | N = n()) 310 | 311 | # of interest 312 | progressTrack$cohort <- 1 313 | progressTrack <- progressTrack %>% gather(key = outcome, 314 | value = measure, -N, -ot) 315 | progressTrack$outcome[progressTrack$outcome == "cohort"] <- "Ninth Graders" 316 | progressTrack$outcome[progressTrack$outcome == "grad"] <- "On-time Graduates" 317 | progressTrack$outcome[progressTrack$outcome == "seamless_transitioners_any"] <- 318 | "Seamless College Transitioner" 319 | progressTrack$outcome[progressTrack$outcome == "second_year_persisters"] <- 320 | "Second Year Persisters" 321 | 322 | ``` 323 | 324 | ```{r} 325 | ann_txt <- data.frame(outcome = rep("Second Year Persisters", 3), 326 | measure = c(0.22, 0.55, 0.85), 327 | textlabel = c("Off-Track \nto Graduate", 328 | "On-Track to Graduate,\n GPA < 3.0", 329 | "On-Track to Graduate,\n GPA >= 3.0")) 330 | ann_txt$ot <- ann_txt$textlabel 331 | ggplot(progressTrack, 332 | aes(x = outcome, y = measure, group = ot, 333 | color = ot, linetype = ot)) + 334 | geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) + 335 | geom_text(aes(label = round(measure * 100, 1)), vjust = -0.8, hjust = -0.25, 336 | color = I("black")) + 337 | geom_text(data = ann_txt, aes(label = textlabel)) + 338 | scale_y_continuous(limits = c(0, 1), label = percent) + 339 | theme_bw() + theme(legend.position = c(0.825, 0.825)) + 340 | scale_color_brewer(type = "qual", palette = 2) + 341 | guides(color = "none", 342 | linetype = "none") + 343 | labs(y = "Percent of Ninth Graders", 344 | title = "Student Progression from 9th Grade Through College", 345 | subtitle = "By Course Credits and GPA after First High School Year", x = "", 346 | caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n", 347 | "Postsecondary enrollment outcomes from NSC matched records. \n", 348 | "All other data from Agency administrative records.")) 349 | ``` 350 | 351 | ## Ninth to Tenth Grade Transition by On-Track Status 352 | 353 | 354 | ### Proportion of Students On-Track by High School 355 | 356 | ```{r} 357 | plotdf <- filter(cg_data, chrt_ninth >= 2005 & 358 | chrt_ninth <= 2008) %>% 359 | filter(!is.na(ontime)) 360 | 361 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0) 362 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0) 363 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 & 364 | plotdf$enroll_any_yr2 == 1) 365 | 366 | plotdf$ot <- NA 367 | plotdf$ot[plotdf$ontrack_yr1 == 0] <- "Off-Track to Graduate" 368 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 < 3 & 369 | !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA < 3.0" 370 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 >= 3 & 371 | !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA >= 3.0" 372 | 373 | progressBars <- bind_rows( 374 | plotdf %>% group_by(ot) %>% tally() %>% ungroup %>% 375 | mutate(count = sum(n), first_hs_code = "Agency Average"), 376 | plotdf %>% group_by(first_hs_code, ot) %>% tally() %>% ungroup %>% 377 | group_by(first_hs_code) %>% 378 | mutate(count = sum(n)) 379 | ) 380 | 381 | # replace first_hs_name = subinstr(first_hs_name, " High School", "", .) 382 | # progressBars$first_hs_name <- gsub(" High School", "", progressBars$first_hs_name) 383 | # // Step 5: For students who are off-track upon completion of their first year 384 | # of high school, convert the values to be negative for ease of 385 | # visualization in the graph 386 | progressBars$n[progressBars$ot == "Off-Track to Graduate"] <- 387 | -progressBars$n[progressBars$ot == "Off-Track to Graduate"] 388 | 389 | ``` 390 | 391 | ```{r} 392 | # // Step 6: Plot 393 | ggplot(progressBars, aes(x = reorder(first_hs_code, n/count), 394 | y = n/count, group = ot)) + 395 | geom_bar(aes(fill = ot), stat = 'identity', color = I("black")) + 396 | geom_text(aes(label = round(100* n/count, 0)), 397 | position = position_stack(vjust=0.3)) + 398 | theme_bw() + 399 | scale_y_continuous(limits = c(-0.8,1), label = percent, 400 | name = "Percent of Ninth Graders", 401 | breaks = seq(-0.8, 1, 0.2)) + 402 | scale_fill_brewer(name = "", type = "qual", palette = 6) + 403 | theme(axis.text.x = element_text(angle = 30, color = "black", vjust = 0.5), 404 | legend.position = c(0.15, 0.875)) + 405 | labs(title = "Proportion of Students On-Track to Graduate by School", 406 | subtitle = "End of Ninth Grade On-Track Status \n By High School", x = "", 407 | caption = paste0("Sample: 2004-2005 and 2005-20065 Agency first-time ninth 408 | graders. \n", 409 | "Postsecondary enrollment outcomes from NSC matched records. \n", 410 | "All other data from Agency administrative records.")) 411 | ``` 412 | 413 | 414 | ### Ninth to Tenth Grade Transition by On-Track Status 415 | 416 | 417 | ## High School Graduation 418 | 419 | ### High School Completion Rates by School 420 | 421 | ```{r} 422 | plotdf <- filter(cg_data, chrt_ninth >= 2005 & 423 | chrt_ninth <= 2008) %>% 424 | filter(!is.na(ontime)) 425 | 426 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0) 427 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0) 428 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 & 429 | plotdf$enroll_any_yr2 == 1) 430 | 431 | schoolLevel <- bind_rows( 432 | plotdf %>% group_by(first_hs_code) %>% 433 | summarize(ontime_grad = mean(ontime, na.rm=TRUE), 434 | late_grad = mean(late, na.rm=TRUE), 435 | count = n()), 436 | plotdf %>% ungroup %>% 437 | summarize(first_hs_code = "Agency AVERAGE", 438 | ontime_grad = mean(ontime, na.rm=TRUE), 439 | late_grad = mean(late, na.rm=TRUE), 440 | count = n()) 441 | ) 442 | # // Step 3: Reshape the data wide 443 | schoolLevel <- schoolLevel %>% gather(key = outcome, 444 | value = measure, -count, -first_hs_code) 445 | schoolLevel$outcome[schoolLevel$outcome == "ontime_grad"] <- "On-Time HS Graduate" 446 | schoolLevel$outcome[schoolLevel$outcome == "late_grad"] <- "Graduate in 4+ Years" 447 | ``` 448 | 449 | ```{r} 450 | ggplot(schoolLevel, aes(x = reorder(first_hs_code, measure), y = measure, 451 | group = first_hs_code, fill = outcome)) + 452 | geom_bar(aes(fill = outcome), stat = 'identity', color = I("black")) + 453 | geom_text(aes(label = round(100 * measure, 0)), 454 | position = position_stack(vjust = 0.8)) + 455 | theme_bw() + theme(panel.grid = element_blank(), axis.ticks.x = element_blank()) + 456 | scale_y_continuous(limits = c(0, 1), label = percent, 457 | name = "Percent of Ninth Graders") + 458 | scale_fill_brewer(name = "", type = "qual", palette = 7) + 459 | theme(axis.text.x = element_text(color = "black", angle = 30, vjust = 0.5), 460 | legend.position = c(0.15, 0.825)) + 461 | labs(title = "High School Graduation Rates by High School", 462 | x = "", 463 | caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n", 464 | "Data from Agency administrative records.")) 465 | 466 | ``` 467 | 468 | 469 | ### High School Completion Rates by Average 8th Grade Achievement 470 | 471 | 472 | ```{r} 473 | plotdf <- filter(cg_data, chrt_ninth >= 2005 & 474 | chrt_ninth <= 2008) %>% 475 | filter(!is.na(ontime)) %>% filter(!is.na(test_math_8_std)) 476 | 477 | 478 | schoolLevel <- bind_rows( 479 | plotdf %>% group_by(first_hs_code) %>% 480 | summarize(ontime_grad = mean(ontime, na.rm=TRUE), 481 | std_score = mean(test_math_8_std, na.rm=TRUE), 482 | count = n()), 483 | plotdf %>% ungroup %>% 484 | summarize(first_hs_code = "Agency AVERAGE", 485 | ontime_grad = mean(ontime, na.rm=TRUE), 486 | std_score = mean(test_math_8_std, na.rm=TRUE), 487 | count = n()) 488 | ) 489 | 490 | 491 | 492 | ``` 493 | 494 | ```{r} 495 | ggplot(schoolLevel[schoolLevel$first_hs_code != "Agency AVERAGE", ], 496 | aes(x = std_score, y = ontime_grad)) + 497 | geom_vline(xintercept = as.numeric(schoolLevel[schoolLevel$first_hs_code == 498 | "Agency AVERAGE", "std_score"]), 499 | linetype = 4, color = I("goldenrod"), size = 1.1) + 500 | geom_hline(yintercept = as.numeric(schoolLevel[schoolLevel$first_hs_code == 501 | "Agency AVERAGE", "ontime_grad"]), 502 | linetype = 4, color = I("purple"), size = 1.1) + 503 | geom_point(size = I(2)) + 504 | theme_bw() + theme(panel.grid = element_blank()) +coord_cartesian() + 505 | annotate(geom = "text", x = -.85, y = 0.025, 506 | label = "Below average math scores & \n below average graduation rates", 507 | size = I(2.5)) + 508 | annotate(geom = "text", x = .85, y = 0.025, 509 | label = "Above average math scores & \n below average graduation rates", 510 | size = I(2.5)) + 511 | annotate(geom = "text", x = .85, y = 0.975, 512 | label = "Above average math scores & \n above average graduation rates", 513 | size = I(2.5)) + 514 | annotate(geom = "text", x = -.85, y = 0.975, 515 | label = "Below average math scores & \n above average graduation rates", 516 | size = I(2.5)) + 517 | annotate(geom = "text", x = .205, y = 0.025, 518 | label = "Agency Average \n Test Score", 519 | size = I(2.5), color = I("goldenrod")) + 520 | annotate(geom = "text", x = .85, y = 0.61, 521 | label = "Agency Average Graduation Rate", 522 | size = I(2.5)) + 523 | scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.2)) + 524 | scale_y_continuous(limits = c(0, 1), label = percent, 525 | name = "Percent of Ninth Graders", breaks = seq(0, 1, 0.1)) + 526 | geom_text(aes(label = first_hs_code), nudge_y = 0.065, vjust = "top", size = I(4), 527 | nudge_x = 0.01) + 528 | labs(title = "High School Graduation Rates by High School", 529 | x = "Average 8th Grade Math Standardized Score", 530 | subtitle = "By Student Achievement Profile Upon High School Entry", 531 | caption = paste0("Sample: 2004-2005 through 2005-2006 Agency first-time ", 532 | "ninth graders with eighth grade math test scores. \n", 533 | "Data from Agency administrative records.")) 534 | ``` 535 | 536 | --------------------------------------------------------------------------------