├── NEWS.md
├── R
    ├── sysdata.rda
    ├── OpenSDPsynthR-package.r
    ├── zzz.R
    ├── gamma_gk.R
    ├── diagnostics.R
    ├── markov_transitions.R
    ├── CEDS.R
    ├── baselines.R
    └── utils.R
├── .gitignore
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-all.R
    │   ├── test-validators.R
    │   ├── test-age_calc.R
    │   ├── test-gen.R
    │   ├── test-markov.R
    │   ├── test-mapCEDS.R
    │   └── test-baseline.R
├── tools
    ├── figs
    │   ├── open_sdp_logo_red.png
    │   ├── README-ellDiagnostic-1.png
    │   ├── README-visualdiagnostics-1.png
    │   └── README-visualdiagnostics-2.png
    └── example_collab.Rmd
├── vignettes
    ├── tools
    │   └── figs
    │   │   ├── README-ellDiagnostic-1.png
    │   │   ├── README-unnamed-chunk-10-1.png
    │   │   ├── README-unnamed-chunk-12-1.png
    │   │   ├── README-unnamed-chunk-14-1.png
    │   │   ├── README-unnamed-chunk-16-1.png
    │   │   ├── README-unnamed-chunk-4-1.png
    │   │   ├── README-unnamed-chunk-6-1.png
    │   │   ├── README-unnamed-chunk-8-1.png
    │   │   ├── README-visualdiagnostics-1.png
    │   │   └── README-visualdiagnostics-2.png
    ├── sim_control.Rmd
    └── sim_diagnostics.Rmd
├── .Rbuildignore
├── data-raw
    ├── program_baseline.csv
    ├── grade_matrix.csv
    ├── age_grade_baseline.csv
    ├── ellDist.csv
    ├── CEDS_SDP_map.csv
    └── build_data.R
├── man
    ├── OpenSDPsynthR.Rd
    ├── m_sum.Rd
    ├── namedList.Rd
    ├── zeroNA.Rd
    ├── assert.Rd
    ├── rescale_gpa.Rd
    ├── convert_grade.Rd
    ├── get_baseline.Rd
    ├── gen_ontrack.Rd
    ├── diag_offset.Rd
    ├── get_sim_groupvars.Rd
    ├── gen_gpa.Rd
    ├── sdp_cleaner.Rd
    ├── gen_assess.Rd
    ├── get_code_values.Rd
    ├── rescale.Rd
    ├── gen_ps.Rd
    ├── num_grade.Rd
    ├── num_clip.Rd
    ├── gen_annual_gpa.Rd
    ├── gen_credits.Rd
    ├── expand_grid_df.Rd
    ├── gen_student_years.Rd
    ├── map_CEDS.Rd
    ├── unscale.Rd
    ├── gen_hs_annual.Rd
    ├── assign_grade.Rd
    ├── assign_hs_outcomes.Rd
    ├── gen_annual_status.Rd
    ├── gen_grad.Rd
    ├── validate_probability_list.Rd
    ├── recode_options.Rd
    ├── validate_sim_parameter.Rd
    ├── assign_baseline.Rd
    ├── gen_initial_status.Rd
    ├── gen_nsc.Rd
    ├── tm_convert.Rd
    ├── grade_transitions.Rd
    ├── recode_credits.Rd
    ├── gen_students.Rd
    ├── tidy_sequence.Rd
    ├── assign_schools.Rd
    ├── gen_ps_enrollment.Rd
    ├── rand_vect_cont.Rd
    ├── simpop.Rd
    ├── ceds_cleaner.Rd
    ├── group_rescale.Rd
    ├── make_inds.Rd
    ├── cond_prob.Rd
    ├── school_transitions.Rd
    ├── markov_cond_list.Rd
    ├── make_binary_series.Rd
    ├── gen_schools.Rd
    ├── better_sim.lm.Rd
    ├── gamma_GK.Rd
    ├── fit_series.Rd
    ├── gen_outcome_model.Rd
    ├── make_markov_series.Rd
    ├── age_calc.Rd
    └── sim_control.Rd
├── todo.md
├── OpenSDP.data.Rproj
├── DESCRIPTION
├── CONTRIBUTING.md
├── LICENSE
├── NAMESPACE
├── CONDUCT.md
├── README.Rmd
├── README.md
└── inst
    └── CG_Analyze_Example.Rmd


/NEWS.md:
--------------------------------------------------------------------------------
1 | ## Release - beta
2 | 
3 | * rollout
4 | 


--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/R/sysdata.rda


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | inst/doc
6 | *.dll
7 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(OpenSDPsynthR)
3 | 
4 | test_check("OpenSDPsynthR")
5 | 


--------------------------------------------------------------------------------
/tools/figs/open_sdp_logo_red.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/open_sdp_logo_red.png


--------------------------------------------------------------------------------
/tools/figs/README-ellDiagnostic-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/README-ellDiagnostic-1.png


--------------------------------------------------------------------------------
/tools/figs/README-visualdiagnostics-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/README-visualdiagnostics-1.png


--------------------------------------------------------------------------------
/tools/figs/README-visualdiagnostics-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/tools/figs/README-visualdiagnostics-2.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-ellDiagnostic-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-ellDiagnostic-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-12-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-14-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-unnamed-chunk-8-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-visualdiagnostics-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-visualdiagnostics-1.png


--------------------------------------------------------------------------------
/vignettes/tools/figs/README-visualdiagnostics-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSDP/OpenSDPsynthR/HEAD/vignettes/tools/figs/README-visualdiagnostics-2.png


--------------------------------------------------------------------------------
/R/OpenSDPsynthR-package.r:
--------------------------------------------------------------------------------
1 | #' OpenSDPsynthR
2 | #'
3 | #' @name OpenSDPsynthR
4 | #' @description A package to generate OpenSDP data
5 | #' @docType package
6 | NULL
7 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^\.travis\.yml$
 4 | ^CONDUCT\.md$
 5 | ^README\.Rmd$
 6 | ^README-.*\.png$
 7 | ^data-raw$
 8 | ^todo.md$
 9 | ^demo.R$
10 | ^erratta.R$
11 | 


--------------------------------------------------------------------------------
/data-raw/program_baseline.csv:
--------------------------------------------------------------------------------
 1 | ELL,IEP,FRPL,count,prob
 2 | 0,0,0,56838,55.00
 3 | 0,0,1,6078,3.7
 4 | 0,0,2,40026,25.93
 5 | 0,1,0,8178,4.44
 6 | 0,1,1,1200,0.73
 7 | 0,1,2,10223,5.4
 8 | 1,0,0,1089,0.64
 9 | 1,0,1,360,0.18
10 | 1,0,2,4411,3.41
11 | 1,1,0,121,0.09
12 | 1,1,1,62,0.05
13 | 1,1,2,809,0.43
14 | 


--------------------------------------------------------------------------------
/man/OpenSDPsynthR.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/OpenSDPsynthR-package.r
 3 | \docType{package}
 4 | \name{OpenSDPsynthR}
 5 | \alias{OpenSDPsynthR}
 6 | \alias{OpenSDPsynthR-package}
 7 | \title{OpenSDPsynthR}
 8 | \description{
 9 | A package to generate OpenSDP data
10 | }
11 | 


--------------------------------------------------------------------------------
/man/m_sum.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{m_sum}
 4 | \alias{m_sum}
 5 | \title{Sum matrix elements in a list}
 6 | \usage{
 7 | m_sum(l)
 8 | }
 9 | \arguments{
10 | \item{l}{a list}
11 | }
12 | \value{
13 | a list of summed matrix elements
14 | }
15 | \description{
16 | Sum matrix elements in a list
17 | }
18 | 


--------------------------------------------------------------------------------
/man/namedList.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_control.R
 3 | \name{namedList}
 4 | \alias{namedList}
 5 | \title{Create a named list}
 6 | \usage{
 7 | namedList(...)
 8 | }
 9 | \arguments{
10 | \item{...}{arguments to pass to the list}
11 | }
12 | \value{
13 | a named list
14 | }
15 | \description{
16 | Create a named list
17 | }
18 | 


--------------------------------------------------------------------------------
/todo.md:
--------------------------------------------------------------------------------
 1 | ## TO-DO
 2 | 
 3 | - unit tests for map_CEDS like functions + error handling
 4 | - unit tests for all user-level generator functions
 5 | - unit tests for numeric accuracy
 6 | X - Create baselines for `frpl`, `iep`, `gifted`, `ell`, and `grade`
 7 | X - Define input requirements for sequence generation
 8 | X - Define process for generating sequences with and without initial states
 9 | X - Define diagnostics
10 | 


--------------------------------------------------------------------------------
/man/zeroNA.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{zeroNA}
 4 | \alias{zeroNA}
 5 | \title{Convert NA values to 0 in a vector}
 6 | \usage{
 7 | zeroNA(x)
 8 | }
 9 | \arguments{
10 | \item{x}{a numeric vector containing NAs}
11 | }
12 | \value{
13 | a numeric vector where all NA values are 0
14 | }
15 | \description{
16 | Convert NA values to 0 in a vector
17 | }
18 | 


--------------------------------------------------------------------------------
/man/assert.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{assert}
 4 | \alias{assert}
 5 | \title{Simple error assertions}
 6 | \usage{
 7 | assert(expr, error)
 8 | }
 9 | \arguments{
10 | \item{expr}{logical expression}
11 | 
12 | \item{error}{character, error message}
13 | }
14 | \value{
15 | An error message, or nothing
16 | }
17 | \description{
18 | Simple error assertions
19 | }
20 | 


--------------------------------------------------------------------------------
/man/rescale_gpa.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{rescale_gpa}
 4 | \alias{rescale_gpa}
 5 | \title{Rescale scaled GPA to be on 0-4 scale}
 6 | \usage{
 7 | rescale_gpa(x)
 8 | }
 9 | \arguments{
10 | \item{x}{a scaled normal variable}
11 | }
12 | \value{
13 | a GPA rounded to tenths, from 0 to 1
14 | }
15 | \description{
16 | Rescale scaled GPA to be on 0-4 scale
17 | }
18 | 


--------------------------------------------------------------------------------
/man/convert_grade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{convert_grade}
 4 | \alias{convert_grade}
 5 | \title{Convert auto-generated grades to better grades}
 6 | \usage{
 7 | convert_grade(x)
 8 | }
 9 | \arguments{
10 | \item{x}{a vector of grades formatted with g}
11 | }
12 | \value{
13 | a character vector of grades
14 | }
15 | \description{
16 | Convert auto-generated grades to better grades
17 | }
18 | 


--------------------------------------------------------------------------------
/man/get_baseline.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/baselines.R
 3 | \name{get_baseline}
 4 | \alias{get_baseline}
 5 | \title{Function to load in SDP default baseline data}
 6 | \usage{
 7 | get_baseline(bl)
 8 | }
 9 | \arguments{
10 | \item{bl}{a character naming the type of baseline available}
11 | }
12 | \value{
13 | the restored baseline object
14 | }
15 | \description{
16 | Function to load in SDP default baseline data
17 | }
18 | 


--------------------------------------------------------------------------------
/man/gen_ontrack.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_ontrack}
 4 | \alias{gen_ontrack}
 5 | \title{Calculate on_track based on credits}
 6 | \usage{
 7 | gen_ontrack(gpa_ontrack)
 8 | }
 9 | \arguments{
10 | \item{gpa_ontrack}{a data.frame with cumulative credits}
11 | }
12 | \value{
13 | \code{gpa_ontrack} with ontrack indicators appended
14 | }
15 | \description{
16 | Calculate on_track based on credits
17 | }
18 | 


--------------------------------------------------------------------------------
/OpenSDP.data.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: XeLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/man/diag_offset.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/baselines.R
 3 | \name{diag_offset}
 4 | \alias{diag_offset}
 5 | \title{Offset the diagonal values of a matrix}
 6 | \usage{
 7 | diag_offset(matrix, offset = 1L)
 8 | }
 9 | \arguments{
10 | \item{matrix}{a square matrix}
11 | 
12 | \item{offset}{an integer value to offset the diagonal by}
13 | }
14 | \value{
15 | a square matrix
16 | }
17 | \description{
18 | Offset the diagonal values of a matrix
19 | }
20 | 


--------------------------------------------------------------------------------
/man/get_sim_groupvars.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{get_sim_groupvars}
 4 | \alias{get_sim_groupvars}
 5 | \title{Get grouping variables}
 6 | \usage{
 7 | get_sim_groupvars(control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{control}{a control list produced by \code{\link{sim_control}}}
11 | }
12 | \value{
13 | a character vector of grouping terms in control lists
14 | }
15 | \description{
16 | Get grouping variables
17 | }
18 | 


--------------------------------------------------------------------------------
/man/gen_gpa.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_gpa}
 4 | \alias{gen_gpa}
 5 | \title{Generate a final GPA for students}
 6 | \usage{
 7 | gen_gpa(data, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{data}{a dataframe with variables}
11 | 
12 | \item{control}{a sim_control parmeter, default is \code{sim_control}}
13 | }
14 | \value{
15 | a numeric vector
16 | }
17 | \description{
18 | Generate a final GPA for students
19 | }
20 | 


--------------------------------------------------------------------------------
/man/sdp_cleaner.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cleaners.R
 3 | \name{sdp_cleaner}
 4 | \alias{sdp_cleaner}
 5 | \title{Convert a population simulation into an analysis file}
 6 | \usage{
 7 | sdp_cleaner(simouts)
 8 | }
 9 | \arguments{
10 | \item{simouts}{result of the \code{simpop} function}
11 | }
12 | \value{
13 | an analysis file as a single dataframe for HS outcomes
14 | }
15 | \description{
16 | Convert a population simulation into an analysis file
17 | }
18 | 


--------------------------------------------------------------------------------
/man/gen_assess.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_assess}
 4 | \alias{gen_assess}
 5 | \title{Generate an assessment table}
 6 | \usage{
 7 | gen_assess(data, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{data}{student-year data}
11 | 
12 | \item{control}{output from \code{sim_control}}
13 | }
14 | \value{
15 | a two-column dataframe with math and reading scores
16 | }
17 | \description{
18 | Generate an assessment table
19 | }
20 | 


--------------------------------------------------------------------------------
/man/get_code_values.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CEDS.R
 3 | \name{get_code_values}
 4 | \alias{get_code_values}
 5 | \title{Get values of codes from the CEDS Crosswalk List}
 6 | \usage{
 7 | get_code_values(x)
 8 | }
 9 | \arguments{
10 | \item{x}{the data.frame character element that contains the codes}
11 | }
12 | \value{
13 | a list with the labels and levels properly formatted
14 | }
15 | \description{
16 | Get values of codes from the CEDS Crosswalk List
17 | }
18 | 


--------------------------------------------------------------------------------
/man/rescale.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{rescale}
 4 | \alias{rescale}
 5 | \title{Reliabily rescale numerics with missingness}
 6 | \usage{
 7 | rescale(x)
 8 | }
 9 | \arguments{
10 | \item{x}{a numeric vector}
11 | }
12 | \value{
13 | x mean centered and divided by it's standard deviation
14 | }
15 | \description{
16 | Reliabily rescale numerics with missingness
17 | }
18 | \details{
19 | If \code{sd(x)} is undefined, this returns a zero
20 | }
21 | 


--------------------------------------------------------------------------------
/man/gen_ps.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_ps}
 4 | \alias{gen_ps}
 5 | \title{Generate postsecondary enrollment outcome}
 6 | \usage{
 7 | gen_ps(data, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{data}{cohort data}
11 | 
12 | \item{control}{output from \code{sim_control}}
13 | }
14 | \value{
15 | a two-column dataframe with probabilities and binary outcome
16 | }
17 | \description{
18 | Generate postsecondary enrollment outcome
19 | }
20 | 


--------------------------------------------------------------------------------
/man/num_grade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{num_grade}
 4 | \alias{num_grade}
 5 | \title{Convert a character representation of grades to numeric}
 6 | \usage{
 7 | num_grade(grade)
 8 | }
 9 | \arguments{
10 | \item{grade}{a character vector of grades with character labels}
11 | }
12 | \value{
13 | a numeric vector, length of grade, representing grade levels as numbers
14 | }
15 | \description{
16 | Convert a character representation of grades to numeric
17 | }
18 | 


--------------------------------------------------------------------------------
/man/num_clip.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{num_clip}
 4 | \alias{num_clip}
 5 | \title{Clip a vector to be between a minimum and a maximum}
 6 | \usage{
 7 | num_clip(x, min, max)
 8 | }
 9 | \arguments{
10 | \item{x}{a numeric vector}
11 | 
12 | \item{min}{numeric, a floor}
13 | 
14 | \item{max}{numeric, a ceiling}
15 | }
16 | \value{
17 | x, truncated to be between min and max
18 | }
19 | \description{
20 | Clip a vector to be between a minimum and a maximum
21 | }
22 | 


--------------------------------------------------------------------------------
/man/gen_annual_gpa.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_annual_gpa}
 4 | \alias{gen_annual_gpa}
 5 | \title{Generate annual GPA sequence for students based on final GPA}
 6 | \usage{
 7 | gen_annual_gpa(gpa_ontrack)
 8 | }
 9 | \arguments{
10 | \item{gpa_ontrack}{a data.frame containing final GPA for a student}
11 | }
12 | \value{
13 | \code{gpa_ontrack} appended with gpa by year
14 | }
15 | \description{
16 | Generate annual GPA sequence for students based on final GPA
17 | }
18 | 


--------------------------------------------------------------------------------
/man/gen_credits.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_credits}
 4 | \alias{gen_credits}
 5 | \title{Generate credit sequence for students based on their final GPA}
 6 | \usage{
 7 | gen_credits(gpa_ontrack)
 8 | }
 9 | \arguments{
10 | \item{gpa_ontrack}{a data.frame containing final GPA for a student}
11 | }
12 | \value{
13 | \code{gpa_ontrack} but appended with credits by year
14 | }
15 | \description{
16 | Generate credit sequence for students based on their final GPA
17 | }
18 | 


--------------------------------------------------------------------------------
/man/expand_grid_df.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{expand_grid_df}
 4 | \alias{expand_grid_df}
 5 | \title{Expand dataframe into a complete grid}
 6 | \usage{
 7 | expand_grid_df(...)
 8 | }
 9 | \arguments{
10 | \item{...}{dataframe}
11 | }
12 | \value{
13 | an expanded data frame
14 | }
15 | \description{
16 | Expand dataframe into a complete grid
17 | }
18 | \details{
19 | From SO {\url{http://stackoverflow.com/questions/11693599/alternative-to-expand-grid-for-data-frames}}
20 | }
21 | 


--------------------------------------------------------------------------------
/man/gen_student_years.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{gen_student_years}
 4 | \alias{gen_student_years}
 5 | \title{Generate annual student observations}
 6 | \usage{
 7 | gen_student_years(data, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{data}{students to generate annual data for}
11 | 
12 | \item{control}{a list, defined by \code{\link{sim_control}}}
13 | }
14 | \value{
15 | a data.frame
16 | }
17 | \description{
18 | Generate annual student observations
19 | }
20 | 


--------------------------------------------------------------------------------
/man/map_CEDS.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CEDS.R
 3 | \name{map_CEDS}
 4 | \alias{map_CEDS}
 5 | \title{Map to and from CEDS column names}
 6 | \usage{
 7 | map_CEDS(user, category = NULL, CEDS = NULL)
 8 | }
 9 | \arguments{
10 | \item{user}{character vector of variable names to match}
11 | 
12 | \item{category}{character, a category of CEDS data to match}
13 | 
14 | \item{CEDS}{optional}
15 | }
16 | \value{
17 | mapped CEDS names
18 | }
19 | \description{
20 | Map to and from CEDS column names
21 | }
22 | 


--------------------------------------------------------------------------------
/man/unscale.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{unscale}
 4 | \alias{unscale}
 5 | \title{Unscale a scaled variable}
 6 | \usage{
 7 | unscale(x, mean, sd)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric vector that has been scaled}
11 | 
12 | \item{mean}{a numeric, the mean to add to x}
13 | 
14 | \item{sd}{a numeric, the standardized factor to divide x by}
15 | }
16 | \value{
17 | x rescaled with mean and sd specified by the user
18 | }
19 | \description{
20 | Unscale a scaled variable
21 | }
22 | 


--------------------------------------------------------------------------------
/man/gen_hs_annual.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_hs_annual}
 4 | \alias{gen_hs_annual}
 5 | \title{Generate annual HS outcomes}
 6 | \usage{
 7 | gen_hs_annual(hs_outcomes, stu_year)
 8 | }
 9 | \arguments{
10 | \item{hs_outcomes}{a data frame with final gpa, hs_status, and sid}
11 | 
12 | \item{stu_year}{a data frame with student enrollment}
13 | }
14 | \value{
15 | an expanded table with credits and gpa information
16 | }
17 | \description{
18 | Generate annual HS outcomes
19 | }
20 | 


--------------------------------------------------------------------------------
/man/assign_grade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/baselines.R
 3 | \name{assign_grade}
 4 | \alias{assign_grade}
 5 | \title{Assign student a grade}
 6 | \usage{
 7 | assign_grade(age, ability)
 8 | }
 9 | \arguments{
10 | \item{age}{age of the student in years}
11 | 
12 | \item{ability}{a modifier that signifies student ability?}
13 | }
14 | \value{
15 | a vector of grade levels
16 | }
17 | \description{
18 | Assign student a grade
19 | }
20 | \examples{
21 | age <- c(9, 10, 11, 12)
22 | assign_grade(age = age)
23 | }
24 | 


--------------------------------------------------------------------------------
/man/assign_hs_outcomes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{assign_hs_outcomes}
 4 | \alias{assign_hs_outcomes}
 5 | \title{Assign high school outcomes}
 6 | \usage{
 7 | assign_hs_outcomes(data, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{data}{a dataframe with certain high school attributes}
11 | 
12 | \item{control}{control parameters from the \code{sim_control()} function}
13 | }
14 | \value{
15 | an outcome dataframe
16 | }
17 | \description{
18 | Assign high school outcomes
19 | }
20 | 


--------------------------------------------------------------------------------
/man/gen_annual_status.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{gen_annual_status}
 4 | \alias{gen_annual_status}
 5 | \title{Generate annual status trajectories per student}
 6 | \usage{
 7 | gen_annual_status(data, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{data}{student-year data}
11 | 
12 | \item{control}{control list}
13 | }
14 | \value{
15 | the \code{data} object, with additional variables appended
16 | }
17 | \description{
18 | Generate annual status trajectories per student
19 | }
20 | 


--------------------------------------------------------------------------------
/man/gen_grad.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_grad}
 4 | \alias{gen_grad}
 5 | \title{Generate a high school graduation for students}
 6 | \usage{
 7 | gen_grad(data, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{data}{a dataframe with variables}
11 | 
12 | \item{control}{a sim_control parmeter, default is \code{sim_control}}
13 | }
14 | \value{
15 | a data.frame with two values, a probability and a binary outcome
16 | }
17 | \description{
18 | Generate a high school graduation for students
19 | }
20 | 


--------------------------------------------------------------------------------
/man/validate_probability_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{validate_probability_list}
 4 | \alias{validate_probability_list}
 5 | \title{Validate probability list formatting and structure}
 6 | \usage{
 7 | validate_probability_list(list)
 8 | }
 9 | \arguments{
10 | \item{list}{list that is to be passed as a probability list to simulation functions}
11 | }
12 | \value{
13 | Logical TRUE if list is valid, error if list is not valid.
14 | }
15 | \description{
16 | Validate probability list formatting and structure
17 | }
18 | 


--------------------------------------------------------------------------------
/man/recode_options.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CEDS.R
 3 | \name{recode_options}
 4 | \alias{recode_options}
 5 | \title{Recode options}
 6 | \usage{
 7 | recode_options(data, from = c("SDP", "CEDS"))
 8 | }
 9 | \arguments{
10 | \item{data}{a data.frame to recode the variables to CEDS from}
11 | 
12 | \item{from}{the data definitions you are recoding from}
13 | }
14 | \value{
15 | the data object, but with all values matching CEDS specification
16 | recoded to meet the CEDS specification
17 | }
18 | \description{
19 | Recode options
20 | }
21 | 


--------------------------------------------------------------------------------
/man/validate_sim_parameter.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{validate_sim_parameter}
 4 | \alias{validate_sim_parameter}
 5 | \title{Validate outcome simulation list formatting and structure}
 6 | \usage{
 7 | validate_sim_parameter(list)
 8 | }
 9 | \arguments{
10 | \item{list}{list that is to be passed as a parameter list to outcome simulation functions}
11 | }
12 | \value{
13 | Logical TRUE if list is valid, error if list is not valid.
14 | }
15 | \description{
16 | Validate outcome simulation list formatting and structure
17 | }
18 | 


--------------------------------------------------------------------------------
/man/assign_baseline.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/baselines.R
 3 | \name{assign_baseline}
 4 | \alias{assign_baseline}
 5 | \title{Append baseline data to initial data}
 6 | \usage{
 7 | assign_baseline(baseline = NULL, data)
 8 | }
 9 | \arguments{
10 | \item{baseline}{character value of the default baseline to assign}
11 | 
12 | \item{data}{a data.frame to append the baseline to}
13 | }
14 | \value{
15 | the data.frame passed by the user with an additional variable appended
16 | }
17 | \description{
18 | Append baseline data to initial data
19 | }
20 | 


--------------------------------------------------------------------------------
/man/gen_initial_status.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{gen_initial_status}
 4 | \alias{gen_initial_status}
 5 | \title{Generate initial student status indicators}
 6 | \usage{
 7 | gen_initial_status(data, baseline)
 8 | }
 9 | \arguments{
10 | \item{data}{that includes the pre-requsites for generating each status}
11 | 
12 | \item{baseline}{character, name of a baseline status to calculate}
13 | }
14 | \value{
15 | the data with status variables appended
16 | }
17 | \description{
18 | Generate initial student status indicators
19 | }
20 | 


--------------------------------------------------------------------------------
/man/gen_nsc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{gen_nsc}
 4 | \alias{gen_nsc}
 5 | \title{Generate postsecondary institutions}
 6 | \usage{
 7 | gen_nsc(n, names = NULL, method = NULL)
 8 | }
 9 | \arguments{
10 | \item{n}{number of institutions to generate}
11 | 
12 | \item{names}{names to use for schools}
13 | 
14 | \item{method}{default NULL, can be set to "scorecard" to use college scorecard data}
15 | }
16 | \value{
17 | a data.frame of names, IDs, and enrollment weights
18 | }
19 | \description{
20 | Generate postsecondary institutions
21 | }
22 | 


--------------------------------------------------------------------------------
/man/tm_convert.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{tm_convert}
 4 | \alias{tm_convert}
 5 | \title{Covnert a matrix to a row-wise transition matrix}
 6 | \usage{
 7 | tm_convert(matrix)
 8 | }
 9 | \arguments{
10 | \item{matrix}{a matrix tof counts}
11 | }
12 | \value{
13 | matrix M divided by the sum of its rows
14 | }
15 | \description{
16 | Covnert a matrix to a row-wise transition matrix
17 | }
18 | \examples{
19 | base_mat <- structure(c(44985, 740, 781, 7640), .Dim = c(2L, 2L),
20 | .Dimnames = list(c("0", "1"), c("0", "1")))
21 | tm_convert(base_mat)
22 | }
23 | 


--------------------------------------------------------------------------------
/man/grade_transitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/baselines.R
 3 | \name{grade_transitions}
 4 | \alias{grade_transitions}
 5 | \title{Generate a grade advancement transition matrix}
 6 | \usage{
 7 | grade_transitions(ngrades = 15L, diag_limit = 0.975)
 8 | }
 9 | \arguments{
10 | \item{ngrades}{integer, the number of grade levels to simulate}
11 | 
12 | \item{diag_limit}{the minimum probability of a student advancing to the next grade}
13 | }
14 | \value{
15 | a matrix, ngrades x ngrades of grade transition probabilities
16 | }
17 | \description{
18 | Generate a grade advancement transition matrix
19 | }
20 | 


--------------------------------------------------------------------------------
/man/recode_credits.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{recode_credits}
 4 | \alias{recode_credits}
 5 | \title{Recode numeric data into plausible credit data}
 6 | \usage{
 7 | recode_credits(x, top = 6)
 8 | }
 9 | \arguments{
10 | \item{x}{numeric, vector of numerics to be truncated}
11 | 
12 | \item{top}{integer, maximum allowable value}
13 | }
14 | \value{
15 | a vector, length of x, with truncated values
16 | }
17 | \description{
18 | Recode numeric data into plausible credit data
19 | }
20 | \details{
21 | enforces no negative numbers and truncates data at a user specified
22 | maximum
23 | }
24 | 


--------------------------------------------------------------------------------
/man/gen_students.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{gen_students}
 4 | \alias{gen_students}
 5 | \title{Generate student-level attributes}
 6 | \usage{
 7 | gen_students(nstu, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{nstu}{integer, number of students to simulate}
11 | 
12 | \item{control}{a list, defined by \code{\link{sim_control}}}
13 | }
14 | \value{
15 | a data.frame
16 | }
17 | \description{
18 | Generate student-level attributes
19 | }
20 | \details{
21 | The default is to generate students in racial groups and male/female
22 | in proportion to the U.S. population.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/tidy_sequence.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{tidy_sequence}
 4 | \alias{tidy_sequence}
 5 | \title{Tidy a two-state markov sequence for output}
 6 | \usage{
 7 | tidy_sequence(seq, states)
 8 | }
 9 | \arguments{
10 | \item{seq}{a vector of sequence elements, with only two states}
11 | 
12 | \item{states}{a vector of length two naming both possible states}
13 | }
14 | \value{
15 | a data.frame
16 | }
17 | \description{
18 | Tidy a two-state markov sequence for output
19 | }
20 | \examples{
21 | tidy_sequence(seq = c("Yes", "No", "No", "No", "Yes", "Yes"),
22 |              states = c("Yes", "No"))
23 | }
24 | 


--------------------------------------------------------------------------------
/man/assign_schools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{assign_schools}
 4 | \alias{assign_schools}
 5 | \title{Assign student enrollment spells to a school ID}
 6 | \usage{
 7 | assign_schools(student, schools, method = NULL)
 8 | }
 9 | \arguments{
10 | \item{student}{data frame of student-year observations}
11 | 
12 | \item{schools}{data frame of schools to assign from}
13 | 
14 | \item{method}{currently unused, will allow for different assignment methods}
15 | }
16 | \value{
17 | student, with additional column schid appended
18 | }
19 | \description{
20 | Assign student enrollment spells to a school ID
21 | }
22 | 


--------------------------------------------------------------------------------
/man/gen_ps_enrollment.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_ps_enrollment}
 4 | \alias{gen_ps_enrollment}
 5 | \title{Generate a student-year long table of postsecondary enrollments}
 6 | \usage{
 7 | gen_ps_enrollment(hs_outcomes, nsc, control)
 8 | }
 9 | \arguments{
10 | \item{hs_outcomes}{a dataframe of high school outcomes}
11 | 
12 | \item{nsc}{a dataframe of postsecondary institutions}
13 | 
14 | \item{control}{a control object from \code{sim_control()}}
15 | }
16 | \value{
17 | a table of enrollments
18 | }
19 | \description{
20 | Generate a student-year long table of postsecondary enrollments
21 | }
22 | 


--------------------------------------------------------------------------------
/man/rand_vect_cont.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{rand_vect_cont}
 4 | \alias{rand_vect_cont}
 5 | \title{Generate positive random numbers that sum to a value}
 6 | \usage{
 7 | rand_vect_cont(N, M, sd = 1)
 8 | }
 9 | \arguments{
10 | \item{N}{number of numbers to generate}
11 | 
12 | \item{M}{constraint on the sum}
13 | 
14 | \item{sd}{standard deviation of values}
15 | }
16 | \value{
17 | a vector of numerics, length N
18 | }
19 | \description{
20 | Generate positive random numbers that sum to a value
21 | }
22 | \examples{
23 | out <- rand_vect_cont(N = 10, M = 2, sd = 1)
24 | sum(out)
25 | length(out)
26 | }
27 | 


--------------------------------------------------------------------------------
/man/simpop.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{simpop}
 4 | \alias{simpop}
 5 | \title{Grand simulation}
 6 | \usage{
 7 | simpop(nstu, seed = NULL, control = sim_control())
 8 | }
 9 | \arguments{
10 | \item{nstu}{integer, number of students to simulate}
11 | 
12 | \item{seed}{integer, random seed to make simulation reproducible across
13 | sessions, optional}
14 | 
15 | \item{control}{a list, defined by \code{\link{sim_control}}}
16 | }
17 | \value{
18 | a list with simulated data
19 | }
20 | \description{
21 | Grand simulation
22 | }
23 | \examples{
24 | \dontrun{
25 | out <- simpop(nstu = 20, seed = 213)
26 | }
27 | }
28 | 


--------------------------------------------------------------------------------
/man/ceds_cleaner.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cleaners.R
 3 | \name{ceds_cleaner}
 4 | \alias{ceds_cleaner}
 5 | \title{CEDS oputput function}
 6 | \usage{
 7 | ceds_cleaner(simouts, output = "directory", directory = NULL)
 8 | }
 9 | \arguments{
10 | \item{simouts}{a simulation list resulting from a call to the \code{\link{simpop}}
11 | function}
12 | 
13 | \item{output}{a character, default "directory", specifying where the output should good}
14 | 
15 | \item{directory}{a path to a directory to store the output files}
16 | }
17 | \value{
18 | Nothing. Output is saved out to a file on disk.
19 | }
20 | \description{
21 | CEDS oputput function
22 | }
23 | 


--------------------------------------------------------------------------------
/man/group_rescale.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{group_rescale}
 4 | \alias{group_rescale}
 5 | \title{Rescaling variables in groups}
 6 | \usage{
 7 | group_rescale(data, var, group_var, newvar = NULL)
 8 | }
 9 | \arguments{
10 | \item{data}{a dataframe containing variables you wish to rescale}
11 | 
12 | \item{var}{name of the variable to be rescaled}
13 | 
14 | \item{group_var}{character vector of the grouping terms}
15 | 
16 | \item{newvar}{optional character vector for the name of the new rescaled variable}
17 | }
18 | \value{
19 | data with the newvar appended
20 | }
21 | \description{
22 | Rescaling variables in groups
23 | }
24 | 


--------------------------------------------------------------------------------
/man/make_inds.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/CEDS.R
 3 | \name{make_inds}
 4 | \alias{make_inds}
 5 | \title{Append indicator variables to a data frame based on a single factor variable}
 6 | \source{
 7 | \url{http://stackoverflow.com/questions/35943455/creating-indicator-variable-columns-in-dplyr-chain}
 8 | }
 9 | \usage{
10 | make_inds(data, col)
11 | }
12 | \arguments{
13 | \item{data}{a data frame}
14 | 
15 | \item{col}{character, name of the factor column to generate indicators for}
16 | }
17 | \value{
18 | a data frame with the factor levels appended as columns
19 | }
20 | \description{
21 | Append indicator variables to a data frame based on a single factor variable
22 | }
23 | 


--------------------------------------------------------------------------------
/man/cond_prob.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/baselines.R
 3 | \name{cond_prob}
 4 | \alias{cond_prob}
 5 | \title{Generate conditional probabilities by group}
 6 | \usage{
 7 | cond_prob(data, factor, newvar, prob_list)
 8 | }
 9 | \arguments{
10 | \item{data}{dataframe to add variable to}
11 | 
12 | \item{factor}{grouping variable that probability of \code{newvar} is conditional on}
13 | 
14 | \item{newvar}{name, character, of new variable defined by \code{prob_list}}
15 | 
16 | \item{prob_list}{a list, defining the way \code{newvar} should be generated}
17 | }
18 | \value{
19 | data.frame with \code{newvar} appended to dataframe
20 | }
21 | \description{
22 | Generate conditional probabilities by group
23 | }
24 | 


--------------------------------------------------------------------------------
/man/school_transitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/baselines.R
 3 | \name{school_transitions}
 4 | \alias{school_transitions}
 5 | \title{Generate a random transition matrix for school enrollments}
 6 | \usage{
 7 | school_transitions(nschls = 15L, diag_limit = 0.9)
 8 | }
 9 | \arguments{
10 | \item{nschls}{integer, the number of schools available, default = 15}
11 | 
12 | \item{diag_limit}{a numeric between 0 and 1 that sets the minimum probability that a student
13 | will stay schools}
14 | }
15 | \value{
16 | a transition matrix nschls X nschls of transition probabilities
17 | }
18 | \description{
19 | Generate a random transition matrix for school enrollments
20 | }
21 | \examples{
22 | out <- school_transitions(12)
23 | out
24 | }
25 | 


--------------------------------------------------------------------------------
/man/markov_cond_list.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{markov_cond_list}
 4 | \alias{markov_cond_list}
 5 | \title{Compute conditional Markov sequences in a pipeline}
 6 | \usage{
 7 | markov_cond_list(groupname, n, lst, ...)
 8 | }
 9 | \arguments{
10 | \item{groupname}{variable that identifies the list element to use}
11 | 
12 | \item{n}{number of elements to generate, usually defined by \link[dplyr]{n} in dplyr}
13 | 
14 | \item{lst}{probability list that contains function and function parameters}
15 | 
16 | \item{...}{additional arguments passed into the function}
17 | }
18 | \value{
19 | A sequence generated by functions in the probability list
20 | }
21 | \description{
22 | Compute conditional Markov sequences in a pipeline
23 | }
24 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: OpenSDPsynthR
 2 | Title: Package to generate synthetic data for the OpenSDP Project
 3 | Version: 0.1.1.9000
 4 | Authors@R: person("Jared", "Knowles", email = "jknowles@gmail.com", role = c("aut", "cre"))
 5 | Description: Generate realistic synthetic unit-level education system data suitable for analysis.
 6 | Depends: R (>= 3.0.2),
 7 |   dplyr,
 8 |   methods,
 9 |   lme4 (>= 1.1-11)
10 | Imports: lubridate,
11 |   markovchain,
12 |   magrittr,
13 |   wakefield,
14 |   purrr,
15 |   mvtnorm,
16 |   simglm,
17 |   tidyr,
18 |   stringr,
19 |   lazyeval
20 | Suggests: testthat,
21 |     knitr,
22 |     rmarkdown,
23 |     ggplot2
24 | Remotes: jknowles/simglm
25 | License: MIT + file LICENSE
26 | Encoding: UTF-8
27 | LazyData: false
28 | RoxygenNote: 6.1.0
29 | VignetteBuilder: knitr
30 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | OpenSDP invites contributions which will help education analysts use data to
 2 | address challenges facing our schools. As our community grows, we will be
 3 | formalizing our contribution processes. In the near term, we invite you to use
 4 | our resources freely. If you see a way to make them better or have suggestions
 5 | for new content, please submit feedback either through our Slack team
 6 | [(opensdp.slack.com)](https://opensdp.slack.com) or by submitting an issue to the
 7 | relevant repository. We welcome pull requests with updates and corrections to
 8 | our existing analytic packages. If you are interested in sharing a repository
 9 | of your own work with the education community through OpenSDP, please reach out
10 | to us at opensdp@gse.harvard.edu.
11 | 
12 | Visit the OpenSDP website at [opensdp.github.io](https://opensdp.github.io).
13 | 


--------------------------------------------------------------------------------
/man/make_binary_series.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/markov_transitions.R
 3 | \name{make_binary_series}
 4 | \alias{make_binary_series}
 5 | \title{Create an autocorrelated binary series}
 6 | \usage{
 7 | make_binary_series(n = 100, mean = 0.5, corr = 0)
 8 | }
 9 | \arguments{
10 | \item{n}{integer, length of series to generate}
11 | 
12 | \item{mean}{numeric between 0 and 1, proportion of cases with value 1}
13 | 
14 | \item{corr}{numeric between -1 and 1, how correlated should series be?}
15 | }
16 | \value{
17 | A binary series
18 | }
19 | \description{
20 | Generate a binary series that is autocorrelated using the
21 | Markov method from \code{\link{make_markov_series}}
22 | }
23 | \examples{
24 | make_binary_series(n=12,mean=0.5,corr=0.9)
25 | make_binary_series(n=100,mean=0.5,corr=0.1)
26 | }
27 | 


--------------------------------------------------------------------------------
/man/gen_schools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sim_defaults.R
 3 | \name{gen_schools}
 4 | \alias{gen_schools}
 5 | \title{Generate a roster of schools to assign students to}
 6 | \usage{
 7 | gen_schools(control)
 8 | }
 9 | \arguments{
10 | \item{control}{simulation control parameters from \code{\link{sim_control}}}
11 | }
12 | \value{
13 | a data.frame with schools and their attributes
14 | }
15 | \description{
16 | Generate a roster of schools to assign students to
17 | }
18 | \details{
19 | Controls include:
20 | 
21 | \describe{
22 | \item{n}{number of schools}
23 | \item{mean}{a vector of means for the school attributes}
24 | \item{sigma}{a covariance matrix for the school attributes}
25 | \item{names}{vector to draw names from}
26 | \item{best_schl}{a character value specifiying the ID of the best school}
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/data-raw/grade_matrix.csv:
--------------------------------------------------------------------------------
 1 | tofrom, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
 2 | 0,    0.1, 0.88, 0.02, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 3 | 1, 0.01, 0.06, 0.92, 0.01, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 4 | 2, 0, 0.01, 0.02, 0.96, 0.01, 0, 0, 0, 0, 0, 0, 0, 0, 0
 5 | 3, 0, 0, 0.02, 0.07, 0.89, 0.02, 0, 0, 0, 0, 0, 0, 0, 0
 6 | 4, 0, 0, 0, 0.01, 0.02, 0.95, 0.02, 0, 0, 0, 0, 0, 0, 0
 7 | 5, 0, 0, 0, 0, 0.01, 0.02, 0.95, 0.02, 0, 0, 0, 0, 0, 0
 8 | 6, 0, 0, 0, 0, 0, 0.01, 0.02, 0.95, 0.02, 0, 0, 0, 0, 0
 9 | 7, 0, 0, 0, 0, 0, 0, 0.01, 0.02, 0.93, 0.04, 0, 0, 0, 0
10 | 8, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.02, 0.93, 0.04, 0, 0, 0
11 | 9, 0, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.08, 0.87, 0.04, 0, 0
12 | 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.08, 0.87, 0.04, 0
13 | 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.01, 0.08, 0.87, 0.04
14 | 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.15, 0.85
15 | 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
16 | 


--------------------------------------------------------------------------------
/man/better_sim.lm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{better_sim.lm}
 4 | \alias{better_sim.lm}
 5 | \title{Generate simulated predictions from a linear model that account for
 6 | model and parameter error}
 7 | \usage{
 8 | better_sim.lm(object, nsim, newdata, resid_error = FALSE)
 9 | }
10 | \arguments{
11 | \item{object}{an object of class lm or glm}
12 | 
13 | \item{nsim}{number of simulations per observation to generate}
14 | 
15 | \item{newdata}{dataframe containing the observations to generate predictions
16 | for}
17 | 
18 | \item{resid_error}{should the model residual error be added to predictions,
19 | default is FALSE}
20 | }
21 | \value{
22 | a matrix of predictions nrow(newdata) x nsim columns
23 | }
24 | \description{
25 | Generate simulated predictions from a linear model that account for
26 | model and parameter error
27 | }
28 | 


--------------------------------------------------------------------------------
/man/gamma_GK.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gamma_gk.R
 3 | \name{gamma_GK}
 4 | \alias{gamma_GK}
 5 | \title{Estimate the Goodman and Kruskal gamma statistic}
 6 | \usage{
 7 | gamma_GK(x, y = NULL, print = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{an unordered factor variable}
11 | 
12 | \item{y}{an unordered factor variable}
13 | 
14 | \item{print}{a logical vector indicating whether results should be printed to the console}
15 | }
16 | \value{
17 | A named list with gamma, standard error of gamma, p-value of gamma, and statistical significance
18 | }
19 | \description{
20 | Estimate the correlation between two unordered factor variables using the Goodman and Kruskal gamma statistic
21 | }
22 | \note{
23 | Yadda yadda yadda
24 | }
25 | \references{
26 | Adapted from Simon Jackman from: \url{http://jackman.stanford.edu/classes/151B/06/class0517.r}
27 | }
28 | \author{
29 | Jared E. Knowles
30 | }
31 | 


--------------------------------------------------------------------------------
/man/fit_series.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/markov_transitions.R
 3 | \name{fit_series}
 4 | \alias{fit_series}
 5 | \title{Identify the parameters that define a series of binary outcomes}
 6 | \usage{
 7 | fit_series(series, return = c("matrix", "fit"), ...)
 8 | }
 9 | \arguments{
10 | \item{series}{a vector of 0 and 1 values}
11 | 
12 | \item{return}{a character with two options, matrix returns a transition
13 | matrix, "fit" returns a \code{\link{markovchain}} object}
14 | 
15 | \item{...}{additional arguments to pass to \code{\link{markovchainFit}}}
16 | }
17 | \value{
18 | Either a transition matrix or a list with parameters mean and cor
19 | defining the transitions in the vector
20 | }
21 | \description{
22 | Identify the parameters that define a series of binary outcomes
23 | }
24 | \examples{
25 | series <- make_markov_series(10, matrix(c(0.3, 0.7, 0.25, 0.75),
26 |                        nrow = 2, byrow =TRUE))
27 | fit_series(series)
28 | }
29 | 


--------------------------------------------------------------------------------
/data-raw/age_grade_baseline.csv:
--------------------------------------------------------------------------------
 1 | age,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,Total
 2 | 0,1,3,0,0,0,0,1,0,0,0,0,0,0,0,5
 3 | 1,8,1,1,0,0,0,0,0,0,0,0,0,0,0,10
 4 | 2,88,0,0,0,0,0,0,0,0,0,0,0,0,0,88
 5 | 3,565,2,0,0,0,0,0,0,0,0,0,0,0,0,567
 6 | 4,839,43,0,0,0,0,0,0,0,0,0,0,0,0,882
 7 | 5,117,8627,75,1,0,0,0,0,0,0,0,0,0,0,8820
 8 | 6,0,1120,8449,89,1,0,0,0,0,0,0,0,0,0,9659
 9 | 7,0,12,1595,8098,97,0,0,0,0,0,0,0,0,0,9802
10 | 8,0,1,35,1872,7503,111,1,0,0,0,0,0,0,0,9523
11 | 9,0,1,2,65,2090,7338,139,0,0,0,0,0,0,0,9635
12 | 10,0,0,0,0,104,2193,7351,153,2,0,0,0,0,0,9803
13 | 11,0,0,0,0,0,153,2244,7092,166,1,0,0,0,1,9657
14 | 12,0,0,0,0,1,2,159,2384,6862,222,1,0,0,1,9632
15 | 13,0,0,0,0,0,0,1,211,2373,6583,267,0,0,2,9437
16 | 14,0,0,0,0,0,0,0,6,310,2388,6691,228,0,0,9623
17 | 15,0,0,0,0,0,0,0,0,5,299,3087,6127,233,7,9758
18 | 16,0,0,0,0,0,0,0,0,0,11,994,2743,5914,252,9914
19 | 17,0,0,0,0,0,0,0,0,0,0,283,791,2349,5777,9200
20 | 18,0,0,0,0,0,0,0,0,0,0,71,212,480,1942,2705
21 | 19,0,0,0,1,0,0,0,0,0,0,12,33,93,343,482
22 | 20,0,0,0,0,0,0,0,1,0,0,7,12,19,126,165
23 | 21,0,0,0,0,0,0,0,0,0,0,3,1,8,13,25
24 | 22,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 President and Fellows of Harvard College
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/data-raw/ellDist.csv:
--------------------------------------------------------------------------------
 1 | age,black,asian,hispanic,amerind,white,other,multiracial,hawaiian_pi,total
 2 | 1,0.000,,0.000,,0.000,,,,
 3 | 2,0.333,0.500,0.000,,0.158,,,,
 4 | 3,0.000,0.000,0.000,,0.000,,,,
 5 | 4,0.000,0.450,0.610,0.000,0.000,0.000,0.000,0.000,0.000
 6 | 5,0.000,0.450,0.620,0.000,0.000,0.000,0.000,0.00,0.000
 7 | 6,0.024,0.431,0.614,0.081,0.011,0.500,0.019,0.00,0.132
 8 | 7,0.032,0.408,0.636,0.091,0.009,0.375,0.009,0.00,0.131
 9 | 8,0.019,0.305,0.559,0.083,0.007,0.500,0.014,0,0.105
10 | 9,0.017,0.182,0.432,0.116,0.005,0.000,0.000,0,0.079
11 | 10,0.016,0.138,0.348,0.037,0.005,0.000,0.012,0,0.060
12 | 11,0.009,0.113,0.238,0.000,0.002,0.500,0.005,0,0.039
13 | 12,0.009,0.108,0.195,0.027,0.002,0.200,0.000,0,0.031
14 | 13,0.010,0.088,0.184,0.038,0.003,0.000,0.000,0,0.029
15 | 14,0.013,0.084,0.158,0.000,0.001,0.000,0.000,0,0.025
16 | 15,0.013,0.114,0.188,0.000,0.002,0.000,0.000,0,0.029
17 | 16,0.007,0.065,0.167,0.000,0.002,0.000,0.000,0,0.024
18 | 17,0.011,0.065,0.175,0.000,0.002,0.000,0.000,0,0.023
19 | 18,0.007,0.084,0.155,0.027,0.002,0.000,0.000,0,0.020
20 | 19,0.018,0.159,0.192,0.200,0.003,0.000,0.050,0,0.033
21 | 20,0.039,0.385,0.291,0.167,0.000,0.000,0.058,0,0,
22 | 21,0.071,0.625,0.222,0.500,0.000,1.000,0.000,0,0.088
23 | 22,0.000,0.000,0.143,0.000,0.000,,0.018,0,0,0,
24 | 


--------------------------------------------------------------------------------
/tests/testthat/test-all.R:
--------------------------------------------------------------------------------
 1 | # All tests
 2 | 
 3 | # expand.grid.df
 4 | 
 5 | context("Test expand_grid_df")
 6 | ex_data <- data.frame(id = 100:116, age = c(3:19),
 7 |                       race = c(rep(c("amerind", "asian", "black", "hispanic",
 8 |                                      "multiracial", "white", "hawaiian_pi"), 2),
 9 |                                "other", "bad", "error"))
10 | 
11 | test_that("Expand grid returns expected objects", {
12 |   out <- expand_grid_df(ex_data, data.frame(year = 2006:2012))
13 |   expect_is(out, "data.frame")
14 |   expect_equal(ncol(out), 4)
15 |   expect_identical(names(out)[4], "year")
16 |   expect_equal(nrow(out), nrow(ex_data)*7)
17 |   expect_identical(out[1:5, 1:3], ex_data[1:5, 1:3])
18 |   out <- expand_grid_df(ex_data, c("year" = 2006:2012))
19 |   expect_is(out, "data.frame")
20 |   expect_equal(ncol(out), 4)
21 |   expect_identical(names(out)[4], "y")
22 |   expect_equal(nrow(out), nrow(ex_data) * 7)
23 |   expect_identical(out[1:5, 1:3], ex_data[1:5, 1:3])
24 |   out <- expand_grid_df(ex_data, 2006:2012)
25 |   expect_is(out, "data.frame")
26 |   expect_equal(ncol(out), 4)
27 |   expect_identical(names(out)[4], "y")
28 |   expect_equal(nrow(out), nrow(ex_data) * 7)
29 |   expect_identical(out[1:5, 1:3], ex_data[1:5, 1:3])
30 | })
31 | 
32 | 
33 | # assign_grade
34 | 
35 | # cond_prob
36 | 
37 | # createSeries
38 | 
39 | 
40 | # findTransitions
41 | 
42 | # get_baseline
43 | 
44 | # get_code_values
45 | 
46 | # ell
47 | 
48 | # xwalk
49 | 


--------------------------------------------------------------------------------
/man/gen_outcome_model.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_outcomes.R
 3 | \name{gen_outcome_model}
 4 | \alias{gen_outcome_model}
 5 | \title{Generate data for a multilevel outcome}
 6 | \usage{
 7 | gen_outcome_model(fixed, fixed_param, random_var, fact_vars,
 8 |   cov_param = NULL, cor_vars = NULL, ngrps, unbalanceRange,
 9 |   type = "binary", with_err_gen = NULL, error_var = NULL)
10 | }
11 | \arguments{
12 | \item{fixed}{a formula with RHS only that specifies the variables to use}
13 | 
14 | \item{fixed_param}{a vector of numerics for the coefficients of variables in fixed}
15 | 
16 | \item{random_var}{a numeric, length 1, variance of random (school level) component}
17 | 
18 | \item{fact_vars}{for each variable in fixed that is a factor, a definition...}
19 | 
20 | \item{cov_param}{a list, defining any continuous variables}
21 | 
22 | \item{cor_vars}{correlations between fixed variables}
23 | 
24 | \item{ngrps}{number of schools}
25 | 
26 | \item{unbalanceRange}{range of enrollments in each school}
27 | 
28 | \item{type}{character, either "binary" or "linear" to choose outcome variable
29 | type to generate}
30 | 
31 | \item{with_err_gen}{name of a distribution function to generate the errors, optional}
32 | 
33 | \item{error_var}{integer values to pass to err_gen, optional}
34 | }
35 | \value{
36 | a list with two elements
37 | }
38 | \description{
39 | Generate data for a multilevel outcome
40 | }
41 | \examples{
42 | zed2 <- do.call(gen_outcome_model, sim_control()$gpa_sim_parameters)
43 | }
44 | 


--------------------------------------------------------------------------------
/man/make_markov_series.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/markov_transitions.R
 3 | \name{make_markov_series}
 4 | \alias{make_markov_series}
 5 | \title{Create a series of binary variables based on a transition matrix}
 6 | \source{
 7 | \itemize{
 8 | \item \url{http://stats.stackexchange.com/questions/14175/how-to-generate-random-auto-correlated-binary-time-series-data}
 9 | \item \url{https://en.wikipedia.org/wiki/Examples_of_Markov_chains}
10 | }
11 | }
12 | \usage{
13 | make_markov_series(n, tm, burnin = NULL, ...)
14 | }
15 | \arguments{
16 | \item{n}{an integer representing the length of the series}
17 | 
18 | \item{tm}{a Transition Matrix that describes the transition matrix}
19 | 
20 | \item{burnin}{integer, number of values to simulate before drawing from the series}
21 | 
22 | \item{...}{additional arguments to apss to \code{\link{markovchainSequence}}}
23 | }
24 | \value{
25 | a vector of length \code{n} with a random series of 0 and 1 generated
26 | from the \code{tm}
27 | }
28 | \description{
29 | Generate a
30 | }
31 | \details{
32 | A Transition Matrix is a 2x2 matrix where:
33 | \itemize{
34 |  \item Element 1, 1 is the probability of moving to state 0 conditional on being in state 0
35 |  \item Element 2, 1 is the probability of moving to state 0 conditional on being in state 1
36 |  \item Element 1, 2 is the probability of moving to state 1 conditional on being in state 0
37 |  \item Element 2, 2 is the probability of moving to state 1 conditional on being in state 1
38 | }
39 | }
40 | \examples{
41 | make_markov_series(10, matrix(c(0.6,0.4,0.9,0.1), nrow=2, byrow=TRUE))
42 | }
43 | 


--------------------------------------------------------------------------------
/man/age_calc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{age_calc}
 4 | \alias{age_calc}
 5 | \title{Function to calculate age from date of birth.}
 6 | \source{
 7 | This function was developed in part from this response on the R-Help mailing list.
 8 | }
 9 | \usage{
10 | age_calc(dob, enddate = Sys.Date(), units = "months", precise = TRUE)
11 | }
12 | \arguments{
13 | \item{dob}{a vector of class \code{Date} representing the date of birth/start date}
14 | 
15 | \item{enddate}{a vector of class Date representing the when the observation's
16 | age is of interest, defaults to current date.}
17 | 
18 | \item{units}{character, which units of age should be calculated? allowed values are
19 | days, months, and years}
20 | 
21 | \item{precise}{logical indicating whether or not to calculate with leap year
22 | and leap second precision}
23 | }
24 | \value{
25 | A numeric vector of ages the same length as the dob vector
26 | }
27 | \description{
28 | his function calculates age in days, months, or years from a
29 | date of birth to another arbitrary date. This returns a numeric vector in
30 | the specified units.
31 | }
32 | \examples{
33 | a <- as.Date(seq(as.POSIXct('1987-05-29 018:07:00'), len=26, by="21 day"))
34 | b <- as.Date(seq(as.POSIXct('2002-05-29 018:07:00'), len=26, by="21 day"))
35 | 
36 | age <- age_calc(a, units='years')
37 | age
38 | age <- age_calc(a, units='months')
39 | age
40 | age <- age_calc(a, as.Date('2005-09-01'))
41 | age
42 | }
43 | \seealso{
44 | See also \code{\link{difftime}} which this function uses and mimics
45 | some functionality but at higher unit levels.
46 | }
47 | \author{
48 | Jason P. Becker
49 | }
50 | 


--------------------------------------------------------------------------------
/tests/testthat/test-validators.R:
--------------------------------------------------------------------------------
 1 | # Test validation tools
 2 | 
 3 | 
 4 | tm_gifted_f <- matrix(
 5 |   c(500, 1, 2, 500),
 6 |   nrow = 2,
 7 |   byrow = TRUE,
 8 |   dimnames = list(c("1", "0"), c("1", "0"))
 9 | )
10 | tm_gifted_m <- tm_gifted_f
11 | tm_gifted_m[1, 1] <- tm_gifted_m[1, 1] + 25
12 | tm_gifted_m <- tm_convert(tm_gifted_m)
13 | tm_gifted_f <- tm_convert(tm_gifted_f)
14 | 
15 | gifted_list <- list(
16 |   "Male" = list(f = make_markov_series,
17 |                 pars = list(tm = tm_gifted_m,
18 |                             # Use quote so for each call in the loop sample is redrawn
19 |                             t0 = quote(
20 |                               sample(c("1", "0"), 1, prob = c(10, 90))
21 |                             ))),
22 |   "Female" = list(f = make_markov_series,
23 |                   pars = list(tm = tm_gifted_f,
24 |                               t0 = quote(
25 |                                 sample(c("1", "0"), 1, prob = c(8, 92))
26 |                               ))),
27 |   "GROUPVARS" = c("Sex")
28 | )
29 | 
30 | 
31 | #
32 | 
33 | grad_sim_parameters <- list(
34 |   fixed = ~ 1 + math_ss + scale_gpa + gifted + iep + frpl + ell + male,
35 |   random_var = 0.09948,
36 |   cov_param = list(
37 |     dist_fun = c("rnorm", "rnorm", rep("rbinom", 5)),
38 |     var_type = rep("lvl1", 7),
39 |     opts = list(
40 |       list(mean = 0, sd = 1),
41 |       list(mean = 0, sd = 1),
42 |       list(size = 1, prob = 0.1),
43 |       list(size = 1, prob = 0.2),
44 |       list(size = 1, prob = 0.45),
45 |       list(size = 1, prob = 0.1),
46 |       list(size = 1, prob = 0.47)
47 |     )
48 |   ),
49 |   cor_vars = c(
50 |     0.5136, 0.453, -0.276, -0.309, -0.046, -0.033,
51 |     0.2890, -0.1404, -0.2674, -0.0352,-0.1992,
52 |     -0.1354, -0.2096, -0.0305, -0.0290,
53 |     0.1433, -0.0031, 0.1269,
54 |     0.0601, 0.0066,
55 |     0.0009
56 |   ),
57 |   fixed_param = c(
58 |     1.7816, 0.10764, 1.05872, -0.07352, -0.07959,
59 |     -0.331647,-0.22318254, 0.0590
60 |   ),
61 |   ngrps = 30,
62 |   unbalanceRange = c(100, 1500)
63 | )
64 | 
65 | assess_sim_par <- OpenSDPsynthR::sim_control()$assess_sim_par
66 | gpa_sim_par <- OpenSDPsynthR::sim_control()$gpa_sim_par
67 | 
68 | test_that("Validators work for two level and three level sim", {
69 |   expect_true(validate_sim_parameter(assess_sim_par))
70 |   expect_true(validate_sim_parameter(gpa_sim_par))
71 |   expect_true(validate_sim_parameter(grad_sim_parameters))
72 | })
73 | 
74 | 


--------------------------------------------------------------------------------
/tests/testthat/test-age_calc.R:
--------------------------------------------------------------------------------
 1 | # age, retention, and moves calculations
 2 | # From eeptools package
 3 | context("Test age calculator")
 4 | 
 5 | test_that("Leap year calculations work", {
 6 |   # from @larmarange
 7 |   expect_equal(age_calc(as.Date('2004-01-15'), as.Date('2004-02-16')), 1.034483,
 8 |                tol = .00001)
 9 |   expect_equal(age_calc(as.Date('2005-01-15'), as.Date('2005-02-16')), 1.035714,
10 |                tol = .00001)
11 |   expect_equal(age_calc(as.Date('1995-01-15'), as.Date('2003-02-16')),
12 |                age_calc(as.Date('1994-01-15'), as.Date('2002-02-16')))
13 |   expect_false(age_calc(as.Date('1996-01-15'), as.Date('2004-02-16')) ==
14 |                  age_calc(as.Date('1994-01-15'), as.Date('2002-02-16')))
15 | })
16 | 
17 | test_that("All function parameters result in a numeric calculations with sane inputs", {
18 |   tests <- expand.grid(precise = c(TRUE, FALSE),
19 |                        units = c("days", "months", "years"),
20 |                        dob = c("atomic", "vector"),
21 |                        enddate = c("atomic", "vector"))
22 | 
23 |   safe.ifelse <- function(cond, yes, no) structure(ifelse(cond, yes, no), class = class(yes))
24 | 
25 |   for(i in 1:nrow(tests)){
26 |     atomDOB <- as.Date(as.POSIXct('1987-05-29 018:07:00'))
27 |     vecDOB <- as.Date(seq(as.POSIXct('1987-05-29 018:07:00'), len=26, by="21 day"))
28 |     vecED <- as.Date(seq(as.POSIXct('2017-05-29 018:07:00'), len=26, by="21 day"))
29 |     atomED <- as.Date(as.POSIXct('2017-05-29 018:07:00'))
30 | 
31 |     dob <- safe.ifelse(tests[i, "dob"] == "atomic", atomDOB, vecDOB)
32 |     enddate <- safe.ifelse(tests[i, "enddate"] == "atomic", atomED, vecED)
33 | 
34 |     out <- age_calc(dob = dob, enddate = enddate, units = tests[i, ]$units,
35 |                     precise = tests[i, ]$precise)
36 |     expect_true(class(out) %in% c("difftime", "numeric"))
37 |   }
38 | 
39 | })
40 | 
41 | test_that("Bad inputs yield correct errors", {
42 |   expect_error(age_calc('2004-01-15', '2004-02-16'),
43 |                "Both dob and enddate must be Date class objects")
44 |   expect_error(age_calc(as.Date('2004-01-15'), '2004-02-16'),
45 |                "Both dob and enddate must be Date class objects")
46 |   expect_error(age_calc('2004-01-15', as.Date('2004-02-16')),
47 |                "Both dob and enddate must be Date class objects")
48 |   expect_error(age_calc(as.Date('2004-02-16'), as.Date('2004-01-15')),
49 |                "End date must be a date after date of birth")
50 | 
51 | })
52 | 


--------------------------------------------------------------------------------
/tests/testthat/test-gen.R:
--------------------------------------------------------------------------------
 1 | # Test gen
 2 | 
 3 | context("Test high level data generation functions")
 4 | 
 5 | #TODO make gen_students produce consistent number of columns
 6 | test_that("gen_students produces students", {
 7 |   set.seed(21421)
 8 |   out <- gen_students(n = 10)
 9 |   expect_equal(nrow(out), 10)
10 |   expect_is(out, "data.frame")
11 |   expect_identical(names(out),
12 |                    c("sid", "Sex", "Birthdate", "Race", "White",
13 |                      "Black.or.African.American",
14 |                      "Hispanic.or.Latino.Ethnicity",
15 |                      "Demographic.Race.Two.or.More.Races", "id_type"))
16 |   expect_is(out[, "sid"], "factor")
17 |   expect_is(out[, "Sex"], "factor")
18 |   expect_is(out[, "Birthdate"], "Date")
19 |   expect_is(out[, "Race"], "factor")
20 |   expect_is(out[, "White"], "character")
21 |   expect_is(out[, "Black.or.African.American"], "character")
22 |   out <- gen_students(n = 100)
23 |   expect_equal(nrow(out), 100)
24 |   expect_equal(ncol(out), 11)
25 | })
26 | 
27 | test_that("gen_student_years longitudinal data", {
28 |   set.seed(2352)
29 |   out <- gen_students(n = 100)
30 |   out_l <- gen_student_years(data = out)
31 |   expect_message(gen_student_years(data = out), regexp = NA)
32 |   expect_equal(ncol(out_l), 8)
33 |   # Figure out how to test number of rows produced
34 |   expect_equal(min(out_l$year), sim_control()$minyear)
35 |   expect_equal(max(out_l$year), sim_control()$maxyear)
36 |   expect_identical(names(out_l)[1:4], c("sid", "year", "Birthdate", "age"))
37 |   expect_identical(names(out_l)[5:8], c("enrollment_status", "cohort_grad_year",
38 |                                         "cohort_year", "exit_type"))
39 | })
40 | 
41 | 
42 | # context("Test annual status variables")
43 | # ex_data <- data.frame(id = 100:116,
44 | #                       Sex = sample(c("Male", "Female"), 17, replace = TRUE),
45 | #                       Race = sample(c("White", "Asian"), 17, replace = TRUE))
46 | #
47 | # gen_annual_status(ex_data)
48 | #
49 | # test_that("Does it work?", {
50 | #
51 | # })
52 | #
53 | # cond_vars <- get_sim_groupvars(control)
54 | # stu_year <- left_join(stu_year, demog_master[, c(idvar, cond_vars)],
55 | #                       by = idvar)
56 | # stu_year <- gen_annual_status(stu_year, control = control)
57 | 
58 | 
59 | # gen_annual_status
60 | # gen_initial_status
61 | # simpop
62 | # gen_student_years
63 | # make_inds
64 | # sim_control
65 | 
66 | test_that("Simulation control works", {
67 |   expect_is(sim_control(), "list")
68 | 
69 | })
70 | 


--------------------------------------------------------------------------------
/data-raw/CEDS_SDP_map.csv:
--------------------------------------------------------------------------------
 1 | entity,category,CEDS_name,sdp_name,CEDS_Option_set,SDP_option_match
 2 | K12Student,Demographic,Birthdate,dob,YYYY-MM-DD,YYYY-MM-DD
 3 | K12Student,Demographic,White,white,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected
 4 | K12Student,Demographic,Asian,asian,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected
 5 | K12Student,Demographic,American Indian or Alaska Native,amerind,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected
 6 | K12Student,Demographic,Black or African American,black,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected
 7 | K12Student,Demographic,Hispanic or Latino Ethnicity,hispanic,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected
 8 | K12Student,Demographic,Demographic Race Two or More Races,multiracial,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected
 9 | K12Student,Demographic,Native Hawaiian or Other Pacific Islander,hawaiian_pi,Yes - Yes;No - No;NotSelected - Not Selected,1 - Yes;0 - No;NA - Not Selected
10 | K12Student,Demographic,,other
11 | K12Student,Limited English Proficiency,Limited English Proficiency Status,ell,Yes - Yes;No - No,1 - Yes;0 - No;NA - Not Selected
12 | K12Student,Disability,Primary Disability Type,disab_type,AUT - Autism; DB - Deaf-blindness; DD - Developmental Delay; EMN - Emotional disturbance; HI - Hearing Impariment; ID - Intellectual Disability; MD - Multiple disabilities; OI - Orthopedic impairment; OHI - Other health impairment; SLD - Specific learning disability; SLI - Speech or language impairment; TBI - Traumatic brain injury; VI - Visual impairment
13 | K12Student,Enrollment,Gifted and Talented Indicator,gifted,Yes - Yes; No - No; Unknown - Unknown,1 - Yes;0 - No;NA - Not Selected
14 | K12Student,Economically Disadvantaged,Economic Disadvantage Status,frpl,Yes - Yes;No - No,1 - Yes;0 - No;NA - Not Selected
15 | K12Student,Disability,IDEA Indicator,iep,Yes - Yes; No - No,1 - Yes;0 - No
16 | K12Student,Identity,Student Identifier,sid,xxxx
17 | K12Student,Demographic,Sex,sex,Male - Male;Female - Female;NotSelected - Not Selected
18 | K12Student,Demographic,,age,,
19 | K12Student,Enrollment,Entry Grade Level,grade_level,IT - Infant/toddler;PR - Preschool;PK - Prekindergarten;TK - Transitional Kindergarten;KG - Kindergarten;01 - First grade;02 - Second grade;03 - Third grade;04 - Fourth grade;05 - Fifth grade;06 - Sixth grade;07 - Seventh grade;08 - Eigth grade;09 - Ninth grade;10 - Tenth grade;11 - Eleventh grade;12 - Twelfth grade;13 - Grade 13;PS - Postsecondary;UG - Ungraded;Other - Other,-9 - Ungraded;-1 - Prekindergarten;-1 - Transitional Kindergarten;-1 - Preschool;0 - Kindergarten;1 - First grade;2 - Second grade;3 - Third grade;4 - Fourth grade;5 - Fifth grade;6 - Sixth grade;7 - Seventh grade;8 - Eigth grade;9 - Ninth grade;10 - Tenth grade;11 - Eleventh grade;12 - Twelfth grade;13 - Grade 13;13 - Postsecondary
20 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | #' @importFrom stats rbinom runif pnorm setNames coef rnbinom vcov simulate update
 2 | #' rnorm rpois sd sigma terms time reshape
 3 | #' @importFrom stats na.omit
 4 | #' @importFrom utils data head tail write.csv zip
 5 | #' @importFrom methods new
 6 | 
 7 | .onAttach = function(...) {
 8 |   if (!interactive()) return()
 9 |   msg = "Welcome to OpenSDP." # nocov
10 |   packageStartupMessage(paste(strwrap(msg), collapse = "\n")) # nocov
11 | }
12 | 
13 | 
14 | utils::globalVariables(c("Sex", "iep", "Race", "frpl", "age"))
15 | 
16 | 
17 | .onLoad <- function(libname = find.package("OpenSDPsynthR"), pkgname = "OpenSDPsynthR"){
18 |     # CRAN Note avoidance
19 |   if(getRversion() >= "2.15.1")
20 |     utils::globalVariables(
21 |       # from simpop and subset function calls
22 |       c("sid", "grad", "first_flag", "schools_race_prob", "White",
23 |         "initschid", "Lodgment_method", "PHI_Ind", "Sw_amt", "Alow_ben_amt",
24 |         "ontrack_yr1", "grade", "math_ss", "rdg_ss",
25 |         "ontrack_yr4", "cum_credits_yr1", "cum_credits_yr4", "cum_credits_yr1_ela",
26 |         "cum_credits_yr4_ela", "cum_credits_yr1_math", "cum_credits_yr4_math",
27 |         "cum_gpa_yr1", "cum_gpa_yr4", "yr_seq",
28 |         "scale_gpa", "grad_prob", "ps_prob", "ps",
29 |         "cum_credits", "credits_earned", "credits_attempted", "grad",
30 |         "hs_status", "status_after", "event",
31 |         "ps_transfer", "opeid", "ps_change_ind", "short_name",
32 |         "type",
33 |         # we use the magrittr pipe
34 |         ".",
35 |         # CLEANER
36 |         "grade", "sid", "schid", "gifted", "chrt_ninth", "name", "keep", "math_ss",
37 |         "rdg_ss", "test_math_8_std", "test_ela_8_std", "test_composite_8",
38 |         "yr_seq", "ontrack", "cum_credits", "cum_credits_ela", "cum_credits_math",
39 |         "cum_gpa", "status_after", "scale_gpa", "gpa", "grad_prob", "grad",
40 |         "hs_status", "ps_prob", "ps", "diploma_type", "class_rank", "opeid",
41 |         "first_ps", "last_ps", "chrt_grad", "variable", "value", "temp",
42 |         "1_enrl_1oct_grad", "2_enrl_1oct_grad", "3_enrl_1oct_grad", "4_enrl_1oct_grad",
43 |         "5_enrl_1oct_grad", "1_enrl_1oct_ninth", "2_enrl_1oct_ninth", "3_enrl_1oct_ninth",
44 |         "4_enrl_1oct_ninth", "5_enrl_1oct_ninth", "enrl_ever_w2_grad_any",
45 |         "enrl_ever_w2_ninth_any", "ps_type", "enrl_1oct_grad", "enrl_1oct_ninth",
46 |         "enrl_ever_w2_ninth", "enrl_ever_w2_grad", "enrl_ever_w2_grad_2yr",
47 |         "enrl_ever_w2_grad_4yr", "enrl_ever_w2_ninth_2yr", "enrl_ever_w2_ninth_4yr",
48 |         "term", "ps_short_name", "enroll_count", "all4", "persist", "chrt",
49 |         "observed",
50 |         # control parameters
51 |         "race_list", "frl_list", "school_list", "gifted_list",
52 |         # more simpop
53 |         "flag", "grade_diff", "cohort_year", "subject", "assess_id", "score",
54 |         "ntests", "schid", "cohort_grad_year"
55 |       )
56 |     )
57 |   invisible()
58 | }
59 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
  1 | # Generated by roxygen2: do not edit by hand
  2 | 
  3 | export(age_calc)
  4 | export(assign_baseline)
  5 | export(assign_grade)
  6 | export(assign_hs_outcomes)
  7 | export(assign_schools)
  8 | export(better_sim.lm)
  9 | export(ceds_cleaner)
 10 | export(cond_prob)
 11 | export(diag_offset)
 12 | export(expand_grid_df)
 13 | export(fit_series)
 14 | export(gamma_GK)
 15 | export(gen_annual_gpa)
 16 | export(gen_annual_status)
 17 | export(gen_assess)
 18 | export(gen_credits)
 19 | export(gen_gpa)
 20 | export(gen_grad)
 21 | export(gen_hs_annual)
 22 | export(gen_initial_status)
 23 | export(gen_nsc)
 24 | export(gen_ontrack)
 25 | export(gen_outcome_model)
 26 | export(gen_ps)
 27 | export(gen_ps_enrollment)
 28 | export(gen_schools)
 29 | export(gen_student_years)
 30 | export(gen_students)
 31 | export(get_baseline)
 32 | export(get_code_values)
 33 | export(get_sim_groupvars)
 34 | export(grade_transitions)
 35 | export(group_rescale)
 36 | export(make_binary_series)
 37 | export(make_inds)
 38 | export(make_markov_series)
 39 | export(map_CEDS)
 40 | export(markov_cond_list)
 41 | export(num_clip)
 42 | export(num_grade)
 43 | export(rand_vect_cont)
 44 | export(recode_credits)
 45 | export(recode_options)
 46 | export(rescale)
 47 | export(rescale_gpa)
 48 | export(school_transitions)
 49 | export(sdp_cleaner)
 50 | export(sim_control)
 51 | export(simpop)
 52 | export(tidy_sequence)
 53 | export(tm_convert)
 54 | export(unscale)
 55 | export(validate_probability_list)
 56 | export(validate_sim_parameter)
 57 | export(zeroNA)
 58 | import(dplyr)
 59 | import(lme4)
 60 | importFrom(dplyr,left_join)
 61 | importFrom(lazyeval,interp)
 62 | importFrom(lubridate,year)
 63 | importFrom(magrittr,"%<>%")
 64 | importFrom(magrittr,"%>%")
 65 | importFrom(markovchain,createSequenceMatrix)
 66 | importFrom(markovchain,markovchainFit)
 67 | importFrom(markovchain,markovchainSequence)
 68 | importFrom(methods,new)
 69 | importFrom(mvtnorm,rmvnorm)
 70 | importFrom(purrr,map)
 71 | importFrom(simglm,sim_glm)
 72 | importFrom(simglm,sim_reg)
 73 | importFrom(stats,coef)
 74 | importFrom(stats,na.omit)
 75 | importFrom(stats,pnorm)
 76 | importFrom(stats,rbinom)
 77 | importFrom(stats,reshape)
 78 | importFrom(stats,rlnorm)
 79 | importFrom(stats,rnbinom)
 80 | importFrom(stats,rnorm)
 81 | importFrom(stats,rpois)
 82 | importFrom(stats,runif)
 83 | importFrom(stats,sd)
 84 | importFrom(stats,setNames)
 85 | importFrom(stats,sigma)
 86 | importFrom(stats,simulate)
 87 | importFrom(stats,terms)
 88 | importFrom(stats,time)
 89 | importFrom(stats,update)
 90 | importFrom(stats,vcov)
 91 | importFrom(stringr,str_trunc)
 92 | importFrom(tidyr,crossing)
 93 | importFrom(tidyr,gather)
 94 | importFrom(utils,data)
 95 | importFrom(utils,head)
 96 | importFrom(utils,tail)
 97 | importFrom(utils,write.csv)
 98 | importFrom(utils,zip)
 99 | importFrom(wakefield,dob)
100 | importFrom(wakefield,level)
101 | importFrom(wakefield,race)
102 | importFrom(wakefield,sex)
103 | 


--------------------------------------------------------------------------------
/tools/example_collab.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Example Collaboration"
  3 | author: "Jared Knowles"
  4 | date: "April 14, 2017"
  5 | output: html_document
  6 | ---
  7 | 
  8 | ```{r setup, echo=TRUE, results='hide', message=FALSE, warning=FALSE}
  9 | knitr::opts_chunk$set(echo = TRUE)
 10 | library(OpenSDPsynthR)
 11 | simouts <- simpop(nstu = 1250L, seed = 488234, 
 12 |                   control = sim_control(nschls = 9L, 
 13 |                                         minyear=1997, maxyear=2013))
 14 | 
 15 | ```
 16 | 
 17 | ## Identify
 18 | 
 19 | ```{r identifyElements}
 20 | names(simouts$assessment)
 21 | names(simouts$demog_master)
 22 | names(simouts$stu_year)
 23 | ```
 24 | 
 25 | ## Clean
 26 | 
 27 | TBD
 28 | 
 29 | ## Connect
 30 | 
 31 | ```{r kwikconnect}
 32 | analyze_table <- left_join(simouts$demog_master, simouts$assessment, 
 33 |                            by = "sid")
 34 | analyze_table <- left_join(analyze_table, simouts$stu_year, 
 35 |                            by = c("sid", "year"))
 36 | 
 37 | analyze_table %>% group_by(sid) %>% 
 38 |   summarize(nobs = n()) %>% select(-sid) %>% summary
 39 | 
 40 | analyze_table <- analyze_table %>% filter(grade.x %in% 
 41 |                                             c("3", "4", "5", "6", "7"))
 42 | 
 43 | ```
 44 | 
 45 | 
 46 | ## Analyze
 47 | 
 48 | ```{r quickmodels}
 49 | 
 50 | # simple
 51 | 
 52 | simp_mod <- lm(math_ss ~ Race + grade.x + frpl + ell + iep + gifted, 
 53 |                data = analyze_table)
 54 | 
 55 | summary(simp_mod)
 56 | 
 57 | # school effect
 58 | 
 59 | sch_mod <- lm(math_ss ~ Race + grade.x + frpl + ell + iep + gifted + 
 60 |                 schid.x, 
 61 |                data = analyze_table)
 62 | 
 63 | summary(sch_mod)
 64 | 
 65 | ```
 66 | 
 67 | 
 68 | 
 69 | ## Report
 70 | 
 71 | ```{r}
 72 | library(broom)
 73 | library(ggplot2)
 74 | 
 75 | plot_mod <- tidy(sch_mod)
 76 | plot_mod$model <- "school"
 77 | tmp <- tidy(simp_mod)
 78 | tmp$model <- "simple"
 79 | plot_mod <- bind_rows(plot_mod, tmp); rm(tmp)
 80 | 
 81 | 
 82 | ggplot(plot_mod[plot_mod$term %in% c("frpl1", "ell1", "iep1"), ], 
 83 |        aes(x = term, y = estimate, ymin = estimate-std.error, 
 84 |            ymax = estimate+std.error, group = model, color = model)) + 
 85 |   geom_linerange(position = position_dodge(width = 1), size = 2) + 
 86 |   theme_bw() + geom_hline(yintercept = 0, color = I("red")) + 
 87 |   theme(legend.position = "bottom")
 88 | 
 89 | 
 90 | 
 91 | 
 92 | ```
 93 | 
 94 | 
 95 | ```{r schoolEffects}
 96 | ggplot(plot_mod[plot_mod$term %in% c("schid.x1", "schid.x2", 
 97 |                                      "schid.x3", "schid.x4", 
 98 |                                      "schid.x5", "schid.x6", 
 99 |                                      "schid.x7", "schid.x8", 
100 |                                      "schid.x9"), ], 
101 |        aes(x = term, y = estimate, ymin = estimate-std.error, 
102 |            ymax = estimate+std.error)) + 
103 |   geom_linerange(size = 2) + 
104 |   theme_bw() + geom_hline(yintercept = 0, color = I("red")) + 
105 |   theme(legend.position = "bottom")
106 | 
107 | ```
108 | 
109 | 


--------------------------------------------------------------------------------
/R/gamma_gk.R:
--------------------------------------------------------------------------------
 1 | ##' Estimate the Goodman and Kruskal gamma statistic
 2 | ##'
 3 | ##' Estimate the correlation between two unordered factor variables using the Goodman and Kruskal gamma statistic
 4 | ##'
 5 | ##' @param x an unordered factor variable
 6 | ##' @param y an unordered factor variable
 7 | ##' @param print a logical vector indicating whether results should be printed to the console
 8 | ##' @return A named list with gamma, standard error of gamma, p-value of gamma, and statistical significance
 9 | ##' @note Yadda yadda yadda
10 | ##' @export
11 | ##' @author Jared E. Knowles
12 | ##' @references Adapted from Simon Jackman from: \url{http://jackman.stanford.edu/classes/151B/06/class0517.r}
13 | gamma_GK <- function(x, y = NULL, print = FALSE){
14 |   concordant <- function(x){
15 |     ## get sum(matrix values > r AND > c)
16 |     ## for each matrix[r, c]
17 |     mat.lr <- function(r,c){
18 |       lr <- x[(r.x > r) & (c.x > c)]
19 |       sum(lr)
20 |     }
21 | 
22 |     ## get row and column index for each
23 |     ## matrix element
24 |     r.x <- row(x)
25 |     c.x <- col(x)
26 | 
27 |     ## return the sum of each matrix[r, c] * sums
28 |     ## using mapply to sequence thru each matrix[r, c]
29 |     sum(x * mapply(mat.lr, r = r.x, c = c.x))
30 |   }
31 | 
32 |   discordant <- function(x){
33 |     ## get sum(matrix values > r AND < c)
34 |     ## for each matrix[r, c]
35 |     mat.ll <- function(r,c){
36 |       ll <- x[(r.x > r) & (c.x < c)]
37 |       sum(ll)
38 |     }
39 | 
40 |     ## get row and column index for each
41 |     ## matrix element
42 |     r.x <- row(x)
43 |     c.x <- col(x)
44 | 
45 |     ## return the sum of each matrix[r, c] * sums
46 |     ## using mapply to sequence thru each matrix[r, c]
47 |     sum(x * mapply(mat.ll, r = r.x, c = c.x))
48 |   }
49 | 
50 |   if(is.table(x) | is.matrix(x)){
51 |     c <- concordant(x)
52 |     d <- discordant(x)
53 |     n <- sum(x)
54 |   }
55 |   else{
56 |     tab <- table(x,y)
57 |     c <- concordant(tab)
58 |     d <- discordant(tab)
59 |     n <- sum(tab)
60 |   }
61 |   gamma <- (c - d) / (c + d)
62 | 
63 |   arg <- (c+d)/(n*(1-(gamma^2)))
64 |   stdError <- 1/sqrt(arg)
65 |   z <- gamma/stdError
66 |   if(print==TRUE){
67 |     cat("Goodman-Kruskal gamma statistic:\n")
68 |     cat(paste("Concordant Pairs",c,"\n"))
69 |     cat(paste("Discordant Pairs",d,"\n\n"))
70 |     cat(paste("Estimate of gamma:",
71 |               signif(gamma,.Options$digits),
72 |               "Standard error:",
73 |               signif(stdError,.Options$digits),
74 |               "\n\n"))
75 | 
76 |     cat(paste("H0: gamma = 0 vs HA: two-sided\n"))
77 |     cat(paste("z:",
78 |               signif(z, .Options$digits),
79 |               "p-value:",
80 |               signif(2*(1-pnorm(abs(z))), .Options$digits),
81 |               "\n\n"))
82 |     if(c<51 | d<51){
83 |       cat("Warning: p-values are based on a normal approximation to the\n")
84 |       cat("sampling distribution of the z test statistic, which is commonly\n")
85 |       cat("considered to be good only if C and D are both > 50.\n")
86 |     }
87 |   }
88 |   return(list(gamma = signif(gamma,.Options$digits),
89 |               se = signif(stdError,.Options$digits),
90 |               z =signif(z, .Options$digits),
91 |               sig = signif(2*(1-pnorm(abs(z))), .Options$digits)))
92 |   invisible(NULL)
93 | }
94 | 


--------------------------------------------------------------------------------
/CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, gender identity and expression, level of experience,
 9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at opensdp@gse.harvard.edu. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 | 
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 | 


--------------------------------------------------------------------------------
/R/diagnostics.R:
--------------------------------------------------------------------------------
  1 | # # # # # # Simulation diagnostics
  2 | # # # # #
  3 | # # # # # # Ever FRL percentage
  4 | # # #
  5 | # # simouts$stu_year %>% group_by(sid) %>%
  6 | # #   summarize(everFRL = ifelse(any(frpl == "1"), 1, 0)) %>%
  7 | # #   select(everFRL) %>% unlist %>% table
  8 | # #
  9 | # # simouts$stu_year %>% group_by(sid) %>%
 10 | # #   summarize(everELL = ifelse(any(ell == "1"), 1, 0)) %>%
 11 | # #   select(everELL) %>% unlist %>% table
 12 | # #
 13 | # # simouts$stu_year %>% group_by(sid) %>%
 14 | # #   summarize(everIEP = ifelse(any(iep == "1"), 1, 0)) %>%
 15 | # #   select(everIEP) %>% unlist %>% table
 16 | # #
 17 | # #
 18 | # # simouts$stu_year %>% group_by(sid) %>%
 19 | # #   summarize(everGifted = ifelse(any(gifted == "1"), 1, 0)) %>%
 20 | # #   select(everGifted) %>% unlist %>% table
 21 | # #
 22 | # # table(simouts$stu_year$enrollment_status)
 23 | # # table(simouts$stu_year$grade_advance)
 24 | #
 25 | # # ggplot(assess, aes(x = math_ss, y = math_ssb, group = frpl, color = frpl)) +
 26 | # #   geom_point(alpha = I(0.3)) + geom_smooth(se = FALSE) +
 27 | # #   geom_abline(slope = 1, intercept= 0)
 28 | # # ggplot(assess, aes(x = math_ssb, group = frpl, color = frpl)) +
 29 | # #   geom_density(alpha = I(0.3)) + facet_wrap(~age)
 30 | # # assess$math_ssc <- mapply(perturb_race, assess$math_ssb, assess$Race, assess$math_sd)
 31 | #
 32 | # #
 33 | # simoutsA <- simpop(nstu = 1000L, seed = 488234,
 34 | #                    control = sim_control(nschls = 3L, minyear=1990,
 35 | #                                          maxyear=2010))
 36 | #
 37 | # # Document that these race codes need to be replace din perturb functions and the like
 38 | # simoutsB <- simpop(nstu = 1000L, seed = 488234,
 39 | #                    control = sim_control(nschls = 3L, race_groups = c("Black", "White", "Hispanic"),
 40 | #                                          race_prob = c(0.3, 0.6, 0.1)))
 41 | 
 42 | # mod_sim <- do.call(gen_outcome_model, sim_control()$ps_sim_parameters)
 43 | # mod_sim$sim_model@resp$family$linkinv(unlist(ranef(mod_sim$sim_model)$clustID))
 44 | 
 45 | #
 46 | # simoutsC <- simpop(nstu = 1000L, seed = 488234,
 47 | #                    control = sim_control(nschls = 3L, n_cohorts = 5L))
 48 | 
 49 | # tm <- matrix(
 50 | #   c(800, 20, 5, 800),
 51 | #   nrow = 2,
 52 | #   byrow = TRUE,
 53 | #   dimnames = list(c("1", "0"), c("1", "0"))
 54 | # )
 55 | # mytm <- tm_convert(tm)
 56 | # myGL <- list(GROUPVARS = "ALL",
 57 | #              "ALL" = list(f = make_markov_series,
 58 | #                           pars = list(tm = mytm))
 59 | #              )
 60 | #
 61 | #
 62 | # simoutsD <- simpop(nstu = 1000L, seed = 488234,
 63 | #                    control = sim_control(nschls = 3L,
 64 | #                                          gifted_list = myGL))
 65 | #
 66 | #
 67 | #
 68 | #
 69 | # myGradSim <- list(
 70 | #     fixed = ~ 1 + math_ss + scale_gpa + gifted + iep + frpl + ell + male,
 71 | #     random_var = 0.8948,
 72 | #     cov_param = list(
 73 | #       dist_fun = c("rnorm", "rnorm", rep("rbinom", 5)),
 74 | #       var_type = rep("lvl1", 7),
 75 | #       opts = list(
 76 | #         list(mean = 0, sd = 1),
 77 | #         list(mean = 0, sd = 1),
 78 | #         list(size = 1, prob = 0.1),
 79 | #         list(size = 1, prob = 0.2),
 80 | #         list(size = 1, prob = 0.45),
 81 | #         list(size = 1, prob = 0.1),
 82 | #         list(size = 1, prob = 0.47)
 83 | #       )
 84 | #     ),
 85 | #     cor_vars = c(
 86 | #       0.5136, 0.453, -0.276, -0.309, -0.046, -0.033,
 87 | #       0.2890, -0.1404, -0.2674, -0.0352,-0.1992,
 88 | #       -0.1354, -0.2096, -0.0305, -0.0290,
 89 | #       0.1433, -0.0031, 0.1269,
 90 | #       0.0601, 0.0066,
 91 | #       0.0009
 92 | #     ),
 93 | #     fixed_param = c(
 94 | #       1.6816, 0.30764, 1.05872, -0.07352, -0.07959,
 95 | #       -0.331647,-0.22318254, 0.0590
 96 | #     ),
 97 | #     ngrps = nschls + 5,
 98 | #     unbalanceRange = c(100, 1500)
 99 | # )
100 | #
101 | # simouts <- simpop(nstu = 4000L, seed = 53232,
102 | #                   control = sim_control(nschls = 12L,
103 | #                                         grad_sim_parameters = ))
104 | 


--------------------------------------------------------------------------------
/R/markov_transitions.R:
--------------------------------------------------------------------------------
 1 | #' Create a series of binary variables based on a transition matrix
 2 | #'
 3 | #' @param n an integer representing the length of the series
 4 | #' @param tm a Transition Matrix that describes the transition matrix
 5 | #' @param burnin integer, number of values to simulate before drawing from the series
 6 | #' @param ... additional arguments to apss to \code{\link{markovchainSequence}}
 7 | #' @return a vector of length \code{n} with a random series of 0 and 1 generated
 8 | #' from the \code{tm}
 9 | #' @description Generate a
10 | #' @details A Transition Matrix is a 2x2 matrix where:
11 | #' \itemize{
12 | #'  \item Element 1, 1 is the probability of moving to state 0 conditional on being in state 0
13 | #'  \item Element 2, 1 is the probability of moving to state 0 conditional on being in state 1
14 | #'  \item Element 1, 2 is the probability of moving to state 1 conditional on being in state 0
15 | #'  \item Element 2, 2 is the probability of moving to state 1 conditional on being in state 1
16 | #' }
17 | #' @source \itemize{
18 | #' \item \url{http://stats.stackexchange.com/questions/14175/how-to-generate-random-auto-correlated-binary-time-series-data}
19 | #' \item \url{https://en.wikipedia.org/wiki/Examples_of_Markov_chains}
20 | #' }
21 | #' @importFrom markovchain markovchainSequence
22 | #' @export
23 | #' @examples
24 | #' make_markov_series(10, matrix(c(0.6,0.4,0.9,0.1), nrow=2, byrow=TRUE))
25 | make_markov_series <- function(n, tm, burnin = NULL, ...){
26 |   stopifnot(is.matrix(tm))
27 |   stopifnot(n > 0)
28 |   if(any(tm > 1)){
29 |     warning("TM elements exceed 1, adjusting by dividing by rowSums")
30 |     tm <- tm / rowSums(tm)
31 |   }
32 |   mc <- new("markovchain", transitionMatrix = tm)
33 |   if(!is.null(burnin)){
34 |     series <- markovchainSequence(n+burnin, mc, ...)
35 |     series <- series[burnin:(n+burnin)]
36 |   } else{
37 |     series <- markovchainSequence(n, mc, ...)
38 |   }
39 |   return(series)
40 | }
41 | 
42 | #' Create an autocorrelated binary series
43 | #'
44 | #' @param n integer, length of series to generate
45 | #' @param mean numeric between 0 and 1, proportion of cases with value 1
46 | #' @param corr numeric between -1 and 1, how correlated should series be?
47 | #' @description Generate a binary series that is autocorrelated using the
48 | #' Markov method from \code{\link{make_markov_series}}
49 | #'
50 | #' @return A binary series
51 | #' @export
52 | #' @examples
53 | #' make_binary_series(n=12,mean=0.5,corr=0.9)
54 | #' make_binary_series(n=100,mean=0.5,corr=0.1)
55 | make_binary_series <- function(n = 100, mean = 0.5, corr = 0){
56 |   p01 <- corr * (1 - mean) / mean
57 |   tm <- matrix(c(1 - p01, p01, corr, 1 - corr), nrow=2, byrow=T)
58 |   tm <- matrix(c(0.2, 1.24, 1.3, -0.4), nrow=2, byrow=T)
59 |   if(any(tm > 1 | tm < 0)){
60 |     tm[1, ] <- prop.table(sqrt(tm[1, ]^2))
61 |     tm[2, ] <- prop.table(sqrt(tm[1, ]^2))
62 |   }
63 |   tm <- t(apply(tm, 1, function(x)(x-min(x))/(max(x)-min(x))))
64 |   make_markov_series(n, tm)
65 | }
66 | 
67 | 
68 | #' Identify the parameters that define a series of binary outcomes
69 | #'
70 | #' @param series a vector of 0 and 1 values
71 | #' @param return a character with two options, matrix returns a transition
72 | #' matrix, "fit" returns a \code{\link{markovchain}} object
73 | #' @param ... additional arguments to pass to \code{\link{markovchainFit}}
74 | #' @return Either a transition matrix or a list with parameters mean and cor
75 | #' defining the transitions in the vector
76 | #' @importFrom markovchain markovchainFit
77 | #' @export
78 | #' @examples
79 | #' series <- make_markov_series(10, matrix(c(0.3, 0.7, 0.25, 0.75),
80 | #'                        nrow = 2, byrow =TRUE))
81 | #' fit_series(series)
82 | fit_series <- function(series, return = c("matrix", "fit"), ...){
83 |   if(missing(return)){
84 |     return <- "matrix"
85 |   }
86 |   # TODO check to ensure this coerces into a true transition matrix
87 |   # seqMat <- createSequenceMatrix(series)
88 |   out <- markovchainFit(series,  ...)
89 |   if(return == "matrix"){
90 |     return(out$estimate)
91 |   } else if(return == "fit") {
92 |     return(out)
93 |   }
94 | }
95 | 
96 | 
97 | 


--------------------------------------------------------------------------------
/tests/testthat/test-markov.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # make_markov_series
 3 | 
 4 | context("Test Markov Generator")
 5 | 
 6 | test_that("Markov Chain works", {
 7 |   statesNames <- c("No", "Yes")
 8 |   tm <- matrix(c(800, 10, 200, 200), nrow = 2, byrow = TRUE,
 9 |                dimnames = list(statesNames, statesNames))
10 |   expect_warning(make_markov_series(10, tm = tm))
11 |   expect_error(make_markov_series(0, tm = tm))
12 |   tm <- tm / rowSums(tm)
13 |   expect_equal(length(make_markov_series(10, tm = tm)), 10)
14 |   expect_equal(length(make_markov_series(10, tm = tm, t0 = "Yes")), 10)
15 |   expect_equal(length(make_markov_series(10, tm = tm, t0 = "Yes",
16 |                                          include.t0 = TRUE)), 11)
17 |   expect_equal(length(make_markov_series(10, tm = tm, t0 = "No",
18 |                                          include.t0 = TRUE)), 11)
19 |   expect_true(all(make_markov_series(10, tm = tm,
20 |                                      t0 = "Yes", include.t0 = TRUE) %in% statesNames))
21 | })
22 | 
23 | 
24 | # Test accuracy
25 | 
26 | set.seed(458397)
27 | 
28 | test_fun <- function(basetm, runLength, ...){
29 |   zzz <- make_markov_series(runLength, tm = basetm)
30 |   out <- fit_series(zzz, return = "fit", ...)
31 |   return(all(c(all(out$lowerEndpointMatrix < basetm),
32 |                all(basetm < out$upperEndpointMatrix))))
33 | }
34 | 
35 | test_that("Markov series are correct", {
36 |   dumbTM <- structure(c(0.5, 0.5, 0.5, 0.5), .Dim = c(2L, 2L), .Dimnames = list(
37 |     c("No", "Yes"), c("No", "Yes")))
38 |   zzz <- replicate(500, test_fun(basetm = dumbTM, runLength = 75))
39 |   per_true <- length(zzz[zzz == TRUE]) / length(zzz)
40 |   expect_true(0.87 < per_true)
41 |   expect_true(0.92 > per_true)
42 |   statesNames <- c("No", "Yes")
43 |   dumbTM2 <-  matrix(c(0.5, 0.5, 0.5, 0.5), nrow = 2, byrow = TRUE,
44 |                      dimnames = list(statesNames, statesNames))
45 |   zzy <-  replicate(500, test_fun(basetm = dumbTM2, runLength = 10,
46 |                         possibleStates = statesNames))
47 |   per_true2 <- length(zzy[zzy == TRUE]) / length(zzy)
48 |   expect_true(per_true2 < per_true)
49 | })
50 | 
51 | test_that("fit series respects user input", {
52 |   dumbTM <- structure(c(0.5, 0.5, 0.5, 0.5), .Dim = c(2L, 2L), .Dimnames = list(
53 |     c("No", "Yes"), c("No", "Yes")))
54 |   zzz <- make_markov_series(20, tm = dumbTM)
55 |   out <- fit_series(zzz, return = "fit")
56 |   expect_is(out$estimate, "markovchain")
57 |   expect_is(out, "list")
58 |   expect_equal(out$confidenceLevel, 0.95)
59 |   out <- fit_series(zzz, return = "fit", confidencelevel = 0.9)
60 |   expect_equal(out$confidenceLevel, 0.9)
61 |   out <- fit_series(zzz, return = "matrix")
62 |   expect_is(out, "markovchain")
63 | })
64 | 
65 | # fit_series
66 | 
67 | # outList <- replicate(25, createAutocorBinSeries(n=10,mean=0.4,corr=0.8),
68 | #                      simplify = "array")
69 | #
70 | # outList <- apply(outList, 2, function(x) table(paste0(head(x, -1), tail(x,-1))))
71 | #
72 | # map(outList, function(x) sum(x[names(x) == "00"])) %>% reduce_right(sum)
73 | # map(outList, function(x) sum(x[names(x) == "10"])) %>% reduce_right(sum)
74 | # map(outList, function(x) sum(x[names(x) == "01"])) %>% reduce_right(sum)
75 | # map(outList, function(x) sum(x[names(x) == "11"])) %>% reduce_right(sum)
76 | #
77 | # apply(outList, 2, findTransitions)
78 | 
79 | #
80 | # out <- findTransitions(series)
81 | #
82 | # p01=corr*(1-mean)/mean
83 | # createSeries(n,matrix(c(1-p01,p01,corr,1-corr),nrow=2,byrow=T))
84 | #
85 | # series <- createAutocorBinSeries(n=5000, mean=0.1, corr=0.8)
86 | # out <- findTransitions(series)
87 | # out <- findTransitions(series, return = "simple")
88 | # mapply(function(x, y) (x*(1-y) / y), seq(0, 1, 0.1), seq(0, 1, 0.1))
89 | # mapply(function(x, y) (x*(1-y) / y), seq(1, 0, -0.1), seq(1, 0, -0.1))
90 | # mapply(function(x, y) (x*(1-y) / y), seq(0, 1, 0.1), seq(1, 0, -0.1))
91 | # mapply(function(x, y) (x*(1-y) / y), seq(1, 0, -0.1), seq(0, 1, 0.1))
92 | # p01=corr*(1-mean)/mean
93 | #
94 | # testdf <- expand.grid(mean = seq(0.01, 0.99, 0.1), corr = seq(0.01, 0.99, 0.1))
95 | # testdf$p01 <- mapply(function(x, y) (x*(1-y) / y), testdf$corr, testdf$mean)
96 | #
97 | # createSeries(100 , matrix(c(1-90, 90, 0.91, .09),nrow=2,byrow=T))
98 | 


--------------------------------------------------------------------------------
/tests/testthat/test-mapCEDS.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # map_CEDS
 4 | # recode_options
 5 | 
 6 | # Gen fake data
 7 | 
 8 | context("Test CEDS variable mapping")
 9 | 
10 | test_that("Error is appropriate when supplying dataframe...", {
11 |   SDP <- data.frame("sid" = NA, "ell" = NA, "iep" = NA, "frpl" = NA)
12 |   expect_error(map_CEDS(SDP), "passing or reassigning names(object)", fixed=TRUE)
13 |   SDP <- as.tbl(SDP)
14 |   expect_error(map_CEDS(SDP), "Please supply a character vector of names to be converted.", fixed=TRUE)
15 | })
16 | 
17 | test_that("Check names return dimensions", {
18 |   allSDPnames <- c("dob", "white", "asian", "amerind", "black", "hispanic",
19 |                    "multiracial", "hawaiian_pi", "other", "ell", "disab_type",
20 |                    "gifted", "frpl", "iep", "sid", "sex", "age")
21 |   cleaned <- map_CEDS(allSDPnames)
22 |   expect_equal(length(allSDPnames), length(cleaned))
23 |   allCEDSnames <- c("Birthdate", "White", "Asian", "American Indian or Alaska Native",
24 |                     "Black or African American", "Hispanic or Latino Ethnicity",
25 |                     "Demographic Race Two or More Races",
26 |                     "Native Hawaiian or Other Pacific Islander",
27 |                     "", "Limited English Proficiency Status",
28 |                     "Primary Disability Type",
29 |                     "Gifted and Talented Indicator", "Economic Disadvantage Status",
30 |                     "IDEA Indicator", "Student Identifier", "Sex")
31 |   cleaned <- map_CEDS(allCEDSnames)
32 |   expect_equal(length(allCEDSnames), length(cleaned))
33 | })
34 | 
35 | test_that("Errors make sense when column names are mixed", {
36 |   mixNames <- c("Birthdate", "white", "asian", "black", "hispanic",
37 |                 "Primary Disability Type")
38 |   expect_message(map_CEDS(mixNames), "Not all names successfully matched.", fixed = TRUE)
39 |   expect_message(map_CEDS(mixNames), "Returning NA for those not matched.", fixed = TRUE)
40 |   expect_message(map_CEDS(mixNames), "Majority of names match CEDS names", fixed = TRUE)
41 |   expect_message(map_CEDS(mixNames), "returning CEDS names", fixed = TRUE)
42 |   expect_message(map_CEDS(mixNames), "Both SDP and CEDS names detected.", fixed = TRUE)
43 |   cleaned <- map_CEDS(mixNames)
44 |   expect_equal(length(cleaned[is.na(cleaned)]), 2)
45 |   expect_equal(length(cleaned[!is.na(cleaned)]), 4)
46 |   expect_identical(cleaned[!is.na(cleaned)], c("White", "Asian", "Black or African American",
47 |                                                "Hispanic or Latino Ethnicity"))
48 |   mixNames <- c("Birthdate", "White", "Asian", "black", "hispanic",
49 |                 "Primary Disability Type")
50 |   expect_message(map_CEDS(mixNames), "Not all names successfully matched.", fixed = TRUE)
51 |   expect_message(map_CEDS(mixNames), "Returning NA for those not matched.", fixed = TRUE)
52 |   expect_message(map_CEDS(mixNames), "Majority of names match SDP names", fixed = TRUE)
53 |   expect_message(map_CEDS(mixNames), "returning SDP names", fixed = TRUE)
54 |   expect_message(map_CEDS(mixNames), "Both SDP and CEDS names detected.", fixed = TRUE)
55 |   cleaned <- map_CEDS(mixNames)
56 |   expect_equal(length(cleaned[is.na(cleaned)]), 2)
57 |   expect_equal(length(cleaned[!is.na(cleaned)]), 4)
58 |   expect_identical(cleaned[!is.na(cleaned)], c("dob", "white", "asian", "disab_type"))
59 | })
60 | 
61 | test_that("get_code_values extracts values", {
62 |   test_string <- c("Yes - Yes;No - No;NotSelected - Not Selected")
63 |   codes <- get_code_values(test_string)
64 |   expect_equal(length(codes), 2)
65 |   expect_identical(names(codes), c("values", "labels"))
66 |   expect_null(names(codes$values))
67 |   expect_null(names(codes$labels))
68 |   expect_equal(length(codes$values), 3)
69 |   expect_equal(length(codes$labels), 3)
70 |   expect_identical(codes$labels, c("Yes", "No", "Not Selected"))
71 |   expect_identical(codes$values, c("Yes", "No", "NotSelected"))
72 |   bad_string <- c("Yes - Yes,No - No,NotSelected - Not Selected")
73 |   codes <- get_code_values(bad_string)
74 | 
75 | })
76 | 
77 | test_that("make_inds makes indicator variables", {
78 |   testDat <- data.frame(id = 1:30,
79 |                         group = sample(letters[1:5], 30, replace=TRUE))
80 |   outDat <- make_inds(testDat, col = "group")
81 |   expect_true(ncol(outDat) > ncol(testDat))
82 |   expect_equal(ncol(outDat), 7)
83 |   expect_equal(nrow(testDat), nrow(outDat))
84 |   expect_identical(names(outDat)[3:7], letters[1:5])
85 |   expect_identical(names(outDat)[1:2], names(testDat)[1:2])
86 | })
87 | #
88 | # test_that("recode_options successfully recodes", {
89 | #
90 | #
91 | #
92 | # })
93 | 
94 | # recode_options
95 | 


--------------------------------------------------------------------------------
/man/sim_control.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/sim_control.R
  3 | \name{sim_control}
  4 | \alias{sim_control}
  5 | \title{Set control parameters for simulated data}
  6 | \usage{
  7 | sim_control(nschls = 2L, best_school = NULL, race_groups = NULL,
  8 |   race_prob = NULL, ses_list = NULL, minyear = 2002,
  9 |   maxyear = 2017, n_cohorts = 8L, gifted_list = NULL,
 10 |   iep_list = NULL, ell_list = NULL, ps_transfer_list = NULL,
 11 |   grade_levels = NULL, school_means = NULL, school_cov_mat = NULL,
 12 |   school_names = NULL, postsec_names = NULL,
 13 |   gpa_sim_parameters = NULL, grad_sim_parameters = NULL,
 14 |   ps_sim_parameters = NULL, assess_sim_par = NULL,
 15 |   assessment_adjustment = NULL, grad_adjustment = NULL,
 16 |   ps_adjustment = NULL, gpa_adjustment = NULL, assess_grades = NULL,
 17 |   n_postsec = 35L, postsec_method = "scorecard")
 18 | }
 19 | \arguments{
 20 | \item{nschls}{integer- number of schools to create, default is 2}
 21 | 
 22 | \item{best_school}{character, format is a number, padded with a leading 0,
 23 | indicates the school that will be the highest performing}
 24 | 
 25 | \item{race_groups}{character - vector of labels for race groups}
 26 | 
 27 | \item{race_prob}{numerics - same length as \code{race_groups}}
 28 | 
 29 | \item{ses_list}{a probability list defining probabilities of being low
 30 | socioeconomic status, see Details}
 31 | 
 32 | \item{minyear}{integer - the first year student records are observed}
 33 | 
 34 | \item{maxyear}{integer - the last year student records are observed}
 35 | 
 36 | \item{n_cohorts}{integer - the number of birth-year cohorts to produce}
 37 | 
 38 | \item{gifted_list}{a probability list defining the probability of being in
 39 | a gifted and talented program}
 40 | 
 41 | \item{iep_list}{a probability list defining probabilities of being on an
 42 | individualized education plan}
 43 | 
 44 | \item{ell_list}{a probability list defining probabilities for being an English
 45 | language learner}
 46 | 
 47 | \item{ps_transfer_list}{a probability list for transferring postsecondary
 48 | institutions after enrolling}
 49 | 
 50 | \item{grade_levels}{a probability list for grade promotion and retention}
 51 | 
 52 | \item{school_means}{numeric - a named vector of means for school level attributes}
 53 | 
 54 | \item{school_cov_mat}{matrix - a covariance matrix for the school level attributes}
 55 | 
 56 | \item{school_names}{character - a vector to draw school names from}
 57 | 
 58 | \item{postsec_names}{character - a vector to draw postsecondary institution
 59 | names from}
 60 | 
 61 | \item{gpa_sim_parameters}{list - parameters to pass to \code{gen_outcome_model}}
 62 | 
 63 | \item{grad_sim_parameters}{list - parameters to pass to \code{gen_outcome_model}}
 64 | 
 65 | \item{ps_sim_parameters}{list - parameters to pass to \code{gen_outcome_model}}
 66 | 
 67 | \item{assess_sim_par}{list - parameters to pass to \code{gen_outcome_model}}
 68 | 
 69 | \item{assessment_adjustment}{list - parameters to adjust assessment scores by
 70 | for bias}
 71 | 
 72 | \item{grad_adjustment}{list - parameters to adjust graduation probabilities by
 73 | for bias}
 74 | 
 75 | \item{ps_adjustment}{list - parameters to adjust postsecondary enrollment
 76 | probabilities by for bias}
 77 | 
 78 | \item{gpa_adjustment}{list - parameters to adjust gpa for bias}
 79 | 
 80 | \item{assess_grades}{character - grade levels to generate assessment scores for}
 81 | 
 82 | \item{n_postsec}{numeric - number of postsecondary schools to assign to}
 83 | 
 84 | \item{postsec_method}{character - options "scorecard" or NULL}
 85 | }
 86 | \value{
 87 | a named list
 88 | }
 89 | \description{
 90 | Set control parameters for simulated data
 91 | }
 92 | \details{
 93 | This function has a full set of default values that are designed to
 94 | produce realistic data. These defaults can be overridden by specifying any
 95 | of the arguments to be overridden as an option to the function call.
 96 | 
 97 | There are two unique data structures that are used to set options for simulations.
 98 | The first is a \code{probability_list}, a list which defines a grouping factor, and
 99 | for each level of the grouping factor a function and parameters to generate
100 | probability distribution from.
101 | 
102 | The \code{sim_parameters} data structure defines the parameters for the outcome
103 | simulation. Outcomes are simulated using a simulated multilevel model structure
104 | and this data structure contains the parameters that describe the model and the
105 | error structure of data generated from that model.
106 | 
107 | To modify either of these elements, use the \code{\link{validate_probability_list}}
108 | or \code{\link{validate_sim_parameter}} helper functions to ensure that all
109 | of the parameters are defined with valid values.
110 | }
111 | 


--------------------------------------------------------------------------------
/R/CEDS.R:
--------------------------------------------------------------------------------
  1 | #' Map to and from CEDS column names
  2 | #'
  3 | #' @param user character vector of variable names to match
  4 | #' @param category character, a category of CEDS data to match
  5 | #' @param CEDS optional
  6 | #'
  7 | #' @return mapped CEDS names
  8 | #' @export
  9 | map_CEDS <- function(user, category = NULL, CEDS = NULL){
 10 |   if(!all(class(user) %in% c("factor", "character"))){ # is all right conditional?
 11 |     msg <- "Please supply a character vector of names to be converted. Consider
 12 |     passing or reassigning names(object)"
 13 |     stop(msg)
 14 |   }
 15 |   # TODO: Create a crosswalk from CEDS to SDP and SDP to CEDS with clearer lookup
 16 |   sdp <- xwalk$sdp_name[match(user, xwalk$CEDS_name)]
 17 |   ceds <- xwalk$CEDS_name[match(user, xwalk$sdp_name)]
 18 |   checkLength <- function(x, y){
 19 |     x_l <- length(x[!is.na(x)])
 20 |     y_l <- length(y[!is.na(y)])
 21 |     if(x_l > 0 & y_l > 0){
 22 |       msg <- ("Both SDP and CEDS names detected.")
 23 |     } else{
 24 |       msg <- " " # empty space to allow compound messages below
 25 |     }
 26 |     if(x_l > y_l){
 27 |       message("Majority of names match SDP names, returning SDP names. ", msg)
 28 |       return(x)
 29 |     } else if(y_l > x_l){
 30 |       message("Majority of names match CEDS names, returning CEDS names. ", msg)
 31 |       return(y)
 32 |     } else {
 33 |       stop("Equal number of CEDS and SDP names identified, check names.")
 34 |     }
 35 |   }
 36 |   out <- checkLength(sdp, ceds)
 37 |   if(any(is.na(out))){
 38 |     message("Not all names successfully matched. Returning NA for those not matched.")
 39 |   }
 40 |   if(all(is.na(out))){
 41 |     warning("No names could be matched, please check names for typos.")
 42 |     out <- NULL
 43 |   } else{
 44 |     return(out)
 45 |   }
 46 | }
 47 | 
 48 | #' Get values of codes from the CEDS Crosswalk List
 49 | #'
 50 | #' @param x the data.frame character element that contains the codes
 51 | #' @importFrom purrr map
 52 | #' @importFrom magrittr %>%
 53 | #' @export
 54 | #' @return a list with the labels and levels properly formatted
 55 | get_code_values <- function(x){
 56 |   tmp <- unlist(strsplit(x, split = ";"))
 57 |   tmp <- sapply(tmp, strsplit, split = "[[:punct:]]")
 58 |   values <- map(tmp, 1) %>% unlist
 59 |   labels <- map(tmp, 2) %>% unlist
 60 |   labels <- trimws(labels)
 61 |   names(labels) <- NULL
 62 |   values <- trimws(values)
 63 |   names(values) <- NULL
 64 |   return(list(values = values, labels = labels))
 65 | }
 66 | 
 67 | # TODO: This should always append
 68 | #' Append indicator variables to a data frame based on a single factor variable
 69 | #'
 70 | #' @param data a data frame
 71 | #' @param col character, name of the factor column to generate indicators for
 72 | #' @source \url{http://stackoverflow.com/questions/35943455/creating-indicator-variable-columns-in-dplyr-chain}
 73 | #' @return a data frame with the factor levels appended as columns
 74 | #' @export
 75 | make_inds <- function(data, col) {
 76 |   for(i in col) {
 77 |     idx <- which(names(data)==i)
 78 |     v <- data[[idx]]
 79 |     if(class(v) != "factor"){
 80 |       warning("Factor not supplied, coercing...")
 81 |       v <- factor(v)
 82 |     }
 83 |     # stopifnot(class(v)=="factor")
 84 |     m <- matrix(0, nrow=nrow(data), ncol=nlevels(v))
 85 |     m[cbind(seq_along(v), as.integer(v))]<-1
 86 |     colnames(m) <- paste(levels(v))
 87 |     r <- data.frame(m)
 88 |     # Only need this if you want to drop original column
 89 |     # if (idx > 1) {
 90 |     #   r <- cbind(data[1:(idx-1)],r)
 91 |     # }
 92 |     # if (idx < ncol(data)) {
 93 |     #   r <- cbind(r, data[(idx+1):ncol(data)])
 94 |     # }
 95 |     data <- cbind(data, r)
 96 |   }
 97 |   data
 98 | }
 99 | 
100 | #' Recode options
101 | #'
102 | #' @param data a data.frame to recode the variables to CEDS from
103 | #' @param from the data definitions you are recoding from
104 | #'
105 | #' @return the data object, but with all values matching CEDS specification
106 | #' recoded to meet the CEDS specification
107 | #' @export
108 | recode_options <- function(data, from = c("SDP", "CEDS")){
109 |   if(missing(from)){
110 |     from <- "CEDS"
111 |   }
112 |   stopifnot(all(names(data) %in% xwalk$CEDS_name) |
113 |               all(names(data) %in% xwalk$sdp_name))
114 |   recode_ceds_value <- function(var, options){
115 |     var <- options$labels[match(var, options$values)]
116 |     return(var)
117 |   }
118 |   for(i in names(data)){
119 |     if(from == "SDP"){
120 |       codes <- get_code_values(xwalk$SDP_option_match[xwalk$sdp_name == i])
121 |     } else{
122 |       CEDS <- xwalk$schema[xwalk$CEDS_name == i][[1]]
123 |     }
124 |     data[, i] <- recode_ceds_value(data[, i], codes)
125 |   }
126 |   return(data)
127 | }
128 | 
129 | # # x and y are vectors of labels
130 | # reconcile_labels <- function(x, y){
131 | #   list(x, y[pmatch(tolower(x), tolower(y))])
132 | # }
133 | #
134 | # try_label <- function(x, y, value){
135 | #   lookup <- reconcile_labels(x, y)
136 | #   if(is.na(y[value])){
137 | #     out <- lookup[[2]][which(lookup[[1]] == value)]
138 | #     if(length(out) != 0){
139 | #       return(out)
140 | #     } else{
141 | #       return(NA)
142 | #     }
143 | #   } else{
144 | #     return(y[value])
145 | #   }
146 | # }
147 | 


--------------------------------------------------------------------------------
/tests/testthat/test-baseline.R:
--------------------------------------------------------------------------------
  1 | # Test baselines
  2 | 
  3 | context("Test baseline extraction")
  4 | 
  5 | test_that("ELL baseline works", {
  6 |   out <- get_baseline("ell")
  7 |   expect_is(out, "list")
  8 |   expect_equal(length(out), 3)
  9 |   expect_identical(names(out), c("data", "keys", "fun"))
 10 |   expect_equal(out$fun(1), 1)
 11 |   expect_is(out$data, "data.frame")
 12 |   expect_true(all(out$keys %in% names(out$data)))
 13 | })
 14 | 
 15 | test_that("Error messages are informative", {
 16 |   expect_error(get_baseline("iep"), "Baseline not currently defined. Maybe you can write your own?")
 17 | })
 18 | 
 19 | 
 20 | context("Test assigning baseline values")
 21 | 
 22 | ex_data <- data.frame(id = 100:116, age = c(3:19),
 23 |                   race = c(rep(c("amerind", "asian", "black", "hispanic",
 24 |                                "multiracial", "white", "hawaiian_pi"), 2),
 25 |                            "other", "bad", "error"))
 26 | 
 27 | ex_data$ell <- assign_baseline(baseline = "ell", data = ex_data)
 28 | 
 29 | test_that("Baseline returns the proper object type", {
 30 |   zzz <- assign_baseline(baseline = "ell", data = ex_data)
 31 |   expect_is(zzz, "integer")
 32 | })
 33 | 
 34 | 
 35 | test_that("Baseline assignment works for ELL example", {
 36 |   expect_true(table(is.na(ex_data$ell))[[2]] == 3)
 37 |   expect_true(table(is.na(ex_data$ell))[[1]] == 14)
 38 |   expect_true(all(ex_data$ell %in% c(0L, 1L, NA)))
 39 | })
 40 | 
 41 | # Check for error reporting
 42 | 
 43 | 
 44 | ex_data <- data.frame(id = 100:116, age = c(3:19),
 45 |                       race = c(rep(c("amerind", "asian", "black", "hispanic",
 46 |                                      "multiracial", "white", "hawaiian_pi"), 2),
 47 |                                "other", "bad", "error"))
 48 | 
 49 | test_that("Baseline returns the proper object type for program", {
 50 |   zzz <- assign_baseline(baseline = "program", data = ex_data)
 51 |   expect_is(zzz, "data.frame")
 52 |   expect_identical(names(zzz),
 53 |                    c("id", "age", "race", "ell", "iep", "frpl"))
 54 | })
 55 | 
 56 | test_that("Baseline assignment works for program example", {
 57 |   zzz <- assign_baseline(baseline = "program", data = ex_data)
 58 |   expect_true(all(zzz$ell %in% c(0L, 1L)))
 59 |   expect_true(all(zzz$iep %in% c(0L, 1L)))
 60 |   expect_true(all(zzz$frpl %in% c(0L, 1L)))
 61 | })
 62 | 
 63 | test_that("Baseline for program is well defined", {
 64 |   bl <- get_baseline("program")
 65 |   expect_is(bl, "list")
 66 |   expect_identical(names(bl), c("data", "keys", "fun"))
 67 |   expect_is(bl$data, "data.frame")
 68 |   expect_null(bl$keys)
 69 |   expect_is(bl$fun, "function")
 70 |   out <- bl$fun()
 71 |   expect_is(out, "data.frame")
 72 |   expect_identical(names(out), c("ell", "iep", "frpl"))
 73 | })
 74 | 
 75 | # grade baseline
 76 | test_that("Baseline returns the proper object type for grade", {
 77 |   zzz <- assign_baseline("grade", ex_data)
 78 |   expect_is(zzz, "data.frame")
 79 |   expect_identical(names(zzz),
 80 |                    c("id", "age", "race", "grade"))
 81 | })
 82 | 
 83 | test_that("Baseline assignment works for program example", {
 84 |   zzz <- assign_baseline("grade", ex_data)
 85 |   expect_is(zzz$grade, "character")
 86 |   expect_true(all(unique(zzz$grade) %in%
 87 |                     c("1", "10", "11", "12", "2", "3", "4", "5", "6",
 88 |                       "7", "8", "9", "KG", "PK")))
 89 | })
 90 | 
 91 | test_that("Baseline for grade is well defined", {
 92 |   bl <- get_baseline("grade")
 93 |   expect_is(bl, "list")
 94 |   expect_identical(names(bl), c("data", "keys", "fun"))
 95 |   expect_is(bl$data, "data.frame")
 96 |   expect_identical(bl$keys, "age")
 97 |   expect_is(bl$fun, "function")
 98 |   out <- bl$fun(2)
 99 |   expect_is(out, "character")
100 | })
101 | 
102 | # cond_prob
103 | ses_list <- list(
104 |   "white" = list(f = runif,
105 |                  pars = list(min = 10, max = 20)),
106 |   "hispanic" = list(f = runif,
107 |                     pars = list(min = 25, max = 30)),
108 |   "black" = list(f = runif,
109 |                  pars = list(min = 35, max = 40)),
110 |   "asian" = list(f = runif,
111 |                  pars = list(min = 45, max = 50)),
112 |   "multiracial" = list(f = runif,
113 |                        pars = list(min = 100, max = 105)),
114 |   "amerind" = list(f = runif,
115 |                    pars = list(min = 0, max = 5)),
116 |   "other" = list(f = runif,
117 |                  pars = list(min = -10, max = -5)),
118 |   "hawaiian_pi" = list(f = runif,
119 |                        pars = list(min = -20, max = -15))
120 | )
121 | 
122 | 
123 | context("Test conditional probability")
124 | 
125 | test_that("warnings but functioning", {
126 |   test_1 <- ex_data[1:5, ]
127 |   expect_warning(cond_prob(test_1, factor = "race",
128 |                    newvar = "ses", prob_list = ses_list))
129 |   zzz <- cond_prob(test_1, factor = "race",
130 |                    newvar = "ses", prob_list = ses_list)
131 |   expect_is(zzz, "data.frame")
132 |   expect_identical(names(zzz), c("id", "age", "race", "ses"))
133 |   expect_identical(zzz[, 1:3], ex_data[1:5, ])
134 |   expect_gt(zzz$ses[zzz$race == "amerind"], 0)
135 |   expect_lt(zzz$ses[zzz$race == "amerind"], 5)
136 |   expect_gt(zzz$ses[zzz$race == "asian"], 45)
137 |   expect_lt(zzz$ses[zzz$race == "asian"], 50)
138 |   expect_gt(zzz$ses[zzz$race == "black"], 35)
139 |   expect_lt(zzz$ses[zzz$race == "black"], 40)
140 |   expect_gt(zzz$ses[zzz$race == "hispanic"], 25)
141 |   expect_lt(zzz$ses[zzz$race == "hispanic"], 30)
142 |   expect_gt(zzz$ses[zzz$race == "multiracial"], 100)
143 |   expect_lt(zzz$ses[zzz$race == "multiracial"], 105)
144 | })
145 | 
146 | 
147 | ## Test ses baseline
148 | ## Test baseline assignment and conditional probability
149 | ## Test school assignment
150 | ## Test school transition assignment
151 | 
152 | 
153 | 


--------------------------------------------------------------------------------
/vignettes/sim_control.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Controlling the Simulation in OpenSDPsynthR"
  3 | author: "Jared E. Knowles"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Controlling the Data Simulation}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ## Introduction
 13 | 
 14 | The `OpenSDPsynthR` package provides a set of functions to allow users to 
 15 | generate synthetic, but authentic, student-level data. Synthetic data is 
 16 | intended to make it easy to collaborate with analysts across the country 
 17 | tackling similar problems but using a shared vocabulary. The synthetic data 
 18 | will allow users to collaborate directly on code and analyses and verify that 
 19 | their analysis is working on synthetic data before translating it to live 
 20 | local data. 
 21 | 
 22 | This vignette explains how the package is structured so that it can be modified 
 23 | to meet the needs of users. 
 24 | 
 25 | ```{r}
 26 | library(OpenSDPsynthR)
 27 | default_sim <- sim_control()
 28 | ```
 29 | 
 30 | ```{r}
 31 | names(default_sim)
 32 | ```
 33 | 
 34 | There are over 25 user parameters that can be modified to control the simulation. 
 35 | The goal of these parameters is to allow the simulated student population to 
 36 | reflect a variety of possible educational environments ranging from small 
 37 | rural communities to large urban school districts. 
 38 | 
 39 | These parameters can have complex structures to allow for conditional and random 
 40 | generation of data. Parameters fall into four categories:
 41 | 
 42 | - **vectors:** a single list of parameters like school names, category names, or 
 43 | school IDs
 44 | - **conditional probability list:** an R list that contains a variable to group by, 
 45 | a function to generate data with, and a list of parameters for that function for 
 46 | each group in the grouping variable
 47 | - **outcome simulation parameters:** an R list of arguments to pass to the `simglm` 
 48 | function
 49 | - **outcome adjustments:** an R list of lists, with functions that modify a variable 
 50 | in an existing data set
 51 | 
 52 | ### Vectors
 53 | 
 54 | The following vectors can be modified by the users:
 55 | 
 56 | - `nschls`: integer, number of high schools to assign students to
 57 | - `best_schl`: character, length 1, school ID for the highest performing school,
 58 | e.g. ("01")
 59 | - `race_groups`: character, length ?, names of racial subgroups to create in the 
 60 | simulation, defaults to US Census Groups
 61 | - `race_prob`: numeric, length = length(race_groups), proportion of population 
 62 | in each racial group
 63 | - `minyear`: integer, length 1, the first year of student data available
 64 | - `maxyear`: integer, length 1, the last year of student data available
 65 | - `n_cohorts`: integer, length 1, the number of graduation cohorts to create
 66 | - `school_names`: character, length = `nschls`, names of schools
 67 | - `assess_grades`: character, grade levels to simulate assessment scores for
 68 | - `postsec_names`: character, length = `n_postsec`, names of postsecondary schools
 69 | - `postsec_method`: character, length = 1, name of method to draw postsecondary 
 70 | schools from 
 71 | 
 72 | ### Conditional Probability List
 73 | 
 74 | A conditional probability list is a list of lists in R. The `GROUPVARS` element 
 75 | specifies the grouping variable to conditionally assign probabilities. For 
 76 | example, if students are assigned gifted and talented status differently based 
 77 | on their sex, then this would specify `Sex`. The other elements of the list 
 78 | will be a separate list for each valid value of `Sex` -- in this case `Male` 
 79 | and `Female`. 
 80 | 
 81 | `Male` and `Female` are both lists that have two elements: `f` and `pars`. `f`
 82 | defines a function that is used to generate the variable, and `pars` contains 
 83 | all of the parameters for that function. 
 84 | 
 85 | ```{r}
 86 | str(default_sim$gifted_list)
 87 | ```
 88 | 
 89 | - `gifted_list`: a list defining how students are assigned to gifted and talented 
 90 | programs
 91 | - `iep_list`: a list defining how students are assigned to special education 
 92 | programs
 93 | - `ses_list`: a list deifning how students are assigned to free and reduced 
 94 | price lunch status
 95 | - `ell_list`: a list defining how students are assigned to English Language 
 96 | Learner status
 97 | - `ps_transfer_list`: a list defining the likelihood a student transfers 
 98 | postsecondary institutions
 99 | 
100 | 
101 | ### Outcome Simulation Controls
102 | 
103 | Outcome simulation controls are lists with parameters to pass to the `simreg` 
104 | function in the `simglm` package, which simulates hierarchical data and 
105 | outcomes. 
106 | 
107 | Each of these simulations requires the user to specify:
108 | 
109 | - `fixed`: a RHS formula of the format `~ 1 + var1 + var2` defining the level 
110 | 1 variables for the simulation
111 | - `random_var`: a numeric, length 1, specifying the variance in the second level 
112 | - `cov_param`: a list, length of variables in `fixed` + 1 for the intercept, 
113 | defines the function and parameters to generate the X values 
114 | - `cor_vars`: a matrix of the variance between the X variables in `fixed`
115 | - `fixed_param`: a vector of numerics, the lenth of `fixed` + 1, represent the 
116 | beta coefficients
117 | - `ngrps`: numeric, length of 1, number of second-level grouping terms
118 | - `unbalanceRange`: numeric, length of 2, representing the minimum and maximum 
119 | number of observations in each second-level cluster
120 | - `type`: character, either "linear" or NULL
121 | 
122 | There are several of these parameters:
123 | 
124 | - `gpa_sim_parameters`: simulation parameters for the GPA simulation
125 | - `grad_sim_parameters`: simulation parameters for high school graduation
126 | - `ps_sim_parameters`: simulation parameters for postsecondary enrollment
127 | - `assess_sim_par`: simulation parameters for student assessment data
128 | 
129 | ### Outcome Simulation Adjustments
130 | 
131 | If we only rely on the simulation controls above, the data will be too predictable 
132 | to be realistic, and structural inequalities along economic, racial, and gender 
133 | lines will be underrepresented. To address this, it is possible to do post-simulation 
134 | adjustments to introduce more variance to the outcomes. 
135 | 
136 | - `race_list`: 
137 | - `perturb_race`: function, 
138 | - `frl_list`: 
139 | - `perturb_frl`: function, 
140 | 
141 | - `assessment_adjustment`: adjustments to the assessment score
142 | - `grad_adjustment`: adjustments to the graduation probability
143 | - `ps_adjustment`: adjustments to the postsecondary probability
144 | - `gpa_adjustment`: adjustments to the grade point average
145 | 
146 | 
147 | ### Baselines
148 | 
149 | Currently there are two special parameters that are set based on baseline data 
150 | built into the package. These are the initial grade distribution of students, 
151 | and the initial program participation of students in `ell`, `iep`, and `frpl` 
152 | programs. 
153 | 
154 | These set some of the simulation requirements, but others are set using the 
155 | `baseline` function family.
156 | 
157 | ```{r, eval=FALSE}
158 | get_baseline("program")
159 | get_baseline("grade")
160 | ```
161 | 
162 | Currently, baseline values cannot be modified by the user, but this will come in 
163 | a future release. 
164 | 
165 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  6 | 
  7 | ```{r, echo = FALSE}
  8 | knitr::opts_chunk$set(
  9 |   collapse = TRUE,
 10 |   comment = "#>",
 11 |   fig.path = "tools/figs/README-",
 12 |   message = FALSE,
 13 |   warning = FALSE
 14 | )
 15 | ```
 16 | 
 17 | # OpenSDPsynthR
 18 | 
 19 | ![](tools/figs/open_sdp_logo_red.png)
 20 | 
 21 | 
 22 | A project to generate realistic synthetic unit-level longitudinal education data
 23 | to empower collaboration in education analytics.
 24 | 
 25 | ## Design Goals
 26 | 
 27 | 1. Generate synthetic education data that is realistic for use by analysts
 28 | across the education sector. Realistic means messy, and reflective of the
 29 | general pattern of relationships found in the U.S. education sector.
 30 | 2. Synthetic data should be able to be generated on-demand and responsive to
 31 | inputs from the user. These inputs should allow the user to configure the
 32 | process to produce data that resembles the patterns of data in their agency.
 33 | 3. The package should be modular and extendable allowing new data topics to be
 34 | generated as needed so synthetic data coverage can grow.
 35 | 
 36 | ## Structure
 37 | 
 38 | The package is organized into the following functions:
 39 | 
 40 | - `simpop()` is the overall function that runs the simulation, this function calls
 41 | many subfunctions to simulate different elements of the student data
 42 | - `cleaners` are functions which take the output from the `simpop` function and
 43 | reshape it into data formats for different analyses. Currently only two cleaners
 44 | are supported -- `CEDS` and `sdp_cleaner()` which prepare the data into a CEDS
 45 | like format and into the Strategic Data Project college-going analysis file
 46 | specification respectively.
 47 | - `sim_control()` -- a function that controls all of the parameters of the `simpop`
 48 | simulation. The details of this function are covered in the vignettes.
 49 | 
 50 | # Get Started
 51 | 
 52 | To use `OpenSDPsynthR`, follow the instructions below:
 53 | 
 54 | ## Install Package
 55 | 
 56 | The development version of the package is able to be installed using the
 57 | `install_github()`. To use this command you will need to install the `devtools`
 58 | package.
 59 | 
 60 | ```{r eval=FALSE}
 61 | devtools::install_github("opensdp/OpenSDPsynthR")
 62 | ```
 63 | 
 64 | ## Make some data
 65 | 
 66 | Load the package
 67 | 
 68 | ```{r, message=TRUE}
 69 | library(OpenSDPsynthR)
 70 | ```
 71 | 
 72 | The main function of the package is `simpop` which generates a list of data
 73 | elements corresponding to simulated educational careers, K-20, for a user
 74 | specified number of students. In R, a list is a data structure that can contain
 75 | multiple data elements of different structures. This can be used to emulate
 76 | the multiple tables of a Student Information System (SIS).
 77 | 
 78 | 
 79 | 
 80 | ```{r, message=TRUE}
 81 | out <- simpop(nstu = 500, seed = 213, control = sim_control(nschls = 3))
 82 | ```
 83 | 
 84 | Currently ten tables are produced:
 85 | 
 86 | ```{r}
 87 | names(out)
 88 | ```
 89 | 
 90 | 
 91 | Data elements produced include:
 92 | 
 93 | - **Student demographics:**  age, race, and sex
 94 | - **Student participation:** grade advancement, ELL status, IEP status,
 95 | FRPL status, gifted and talented status, attendance
 96 | - **Student enrollment status:** exit type, enrollment type, transfer, graduation,
 97 | dropout, etc.
 98 | - **School attributes:** name, school category, school size, Title I and Title III status, etc.
 99 | - **Student assessment:** math assessment, reading assessment, grade level assessed
100 | - **High school outcomes:** graduation, cumulative GPA, graduation type, cohort,
101 | class rank, postsecondary enrollment
102 | - **High school progression:** annual class rank, cumulative credits earned, credits
103 | earned, credits by English Language Arts and by Mathematics, credits attempted, 
104 | ontrack status
105 | - **Postsecondary enrollment:** year of enrollment, transfer indicator, name and ID of
106 | postsecondary institution, type of institution
107 | - **Postsecondary institution:** name, city, state, online only, average net price,
108 | Pell grant rate, retention four year full time, share of part time enrollment,
109 | enrollment by race, SAT and ACT score distribution for admitted students
110 | 
111 | There are two tables of metadata about the assessment data above to be used in
112 | cases where multiple types of student assessment are analyzed together.
113 | 
114 | - **Assessment information:** grade, subject, ID, type, and name of assessment
115 | - **Proficiency information:** mean score, error of score, number of students tested
116 | 
117 | 
118 | ```{r, echo=FALSE, message=FALSE, warning=FALSE, include=FALSE}
119 | table_names <- data.frame(table = NULL, column = NULL)
120 | for(i in seq_along(out)){
121 |   table_name <- names(out)[[i]]
122 |   columns <- names(out[[i]])
123 |   tmp <- data.frame(table = table_name, column = columns,
124 |                     stringsAsFactors = FALSE)
125 |   table_names <- bind_rows(table_names, tmp)
126 | }
127 | 
128 | ```
129 | 
130 | 
131 | ```{r, inclue=FALSE}
132 | head(out$demog_master %>% arrange(sid) %>% select(1:4))
133 | head(out$stu_year, 10)
134 | ```
135 | 
136 | ## Cleaners
137 | 
138 | You can reformat the synthetic data for use in specific types of projects.
139 | Currently two functions exist to format the simulated data into an analysis
140 | file matching the SDP College-going data specification and a CEDS-like
141 | data specification. More of these functions are planned in the future.
142 | 
143 | ```{r eval=FALSE}
144 | cgdata <- sdp_cleaner(out)
145 | ceds <- ceds_cleaner(out)
146 | ```
147 | 
148 | 
149 | ## Control Parameters
150 | 
151 | By default, you only need to specify the number of students to simulate to the
152 | `simpop` command. The package has default simulation parameters that will result
153 | in creating a small school district with two schools.
154 | 
155 | 
156 | ```{r demonstrateOptionList}
157 | names(sim_control())
158 | ```
159 | 
160 | These parameters can have complex structures to allow for conditional and random
161 | generation of data. Parameters fall into four categories:
162 | 
163 | - **vectors:** a single list of parameters like school names, category names, or
164 | school IDs
165 | - **conditional probability list:** an R list that contains a variable to group by,
166 | a function to generate data with, and a list of parameters for that function for
167 | each group in the grouping variable
168 | - **outcome simulation parameters:** an R list of arguments to pass to the `simglm`
169 | function
170 | - **outcome adjustments:** an R list of lists, with functions that modify a variable
171 | in an existing data set
172 | 
173 | For more details, see the simulation control vignette.
174 | 
175 | ```{r, eval=FALSE}
176 | vignette("Controlling the Data Simulation", package = "OpenSDPsynthR")
177 | ```
178 | 
179 | 
180 | ## Package Dependencies
181 | 
182 | - `dplyr`
183 | - `lubridate`
184 | - [wakefield](https://www.github.com/trinker/wakefield)
185 | - [simglm](https://www.github.com/lebebr01/simglm)
186 | 
187 | ## OpenSDP
188 | 
189 | `OpenSDPsynthR` is part of the OpenSDP project.
190 | 
191 | [OpenSDP](https://opensdp.github.io) is an online, public repository of analytic
192 | code, tools, and training intended to foster collaboration among education
193 | analysts and researchers in order to accelerate the improvement of our school
194 | systems. The community is hosted by the
195 | [Strategic Data Project](https://sdp.cepr.harvard.edu), an initiative of the
196 | [Center for Education Policy Research at Harvard University](https://cepr.harvard.edu).
197 | We welcome contributions and feedback.
198 | 
199 | These materials were originally authored by the Strategic Data Project.
200 | 


--------------------------------------------------------------------------------
/R/baselines.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' Function to load in SDP default baseline data
  4 | #'
  5 | #' @param bl a character naming the type of baseline available
  6 | #'
  7 | #' @return the restored baseline object
  8 | #' @export
  9 | get_baseline <- function(bl){
 10 |   if(bl == "ell"){
 11 |     #data(sysdata, envir=environment())
 12 |     data <- ell
 13 |     keys <- c("race", "age")
 14 |     fun <- function(x) rbinom(1, 1, x)
 15 |   } else if(bl == "ses"){
 16 |     data <- ses
 17 |     keys <- c("race")
 18 |     fun <- function(x) rbinom(1, 1, x)
 19 |   } else if(bl == "program"){
 20 |     data <- prog_baseline
 21 |     keys <- NULL
 22 |     fun <- function() {prog_baseline[sample(rownames(prog_baseline), 1,
 23 |                                             prob = prog_baseline$prob), 1:3]}
 24 |   } else if(bl == "grade"){
 25 |     data <- age_grade
 26 |     keys <- "age"
 27 |     fun <- function(x){
 28 |       if(x %in% age_grade$age){
 29 |         probs <- age_grade[which(age_grade$age == x),][-1]
 30 |         out <-sample(names(age_grade)[-1], 1, prob = probs)
 31 |         out <- convert_grade(out)
 32 |         return(out)
 33 |       } else{
 34 |         return(NA)
 35 |       }
 36 |     }
 37 |   } else{
 38 |     stop("Baseline not currently defined. Maybe you can write your own?")
 39 |   }
 40 |   return(list(data = data, keys = keys, fun = fun))
 41 | }
 42 | 
 43 | #' Append baseline data to initial data
 44 | #'
 45 | #' @param baseline character value of the default baseline to assign
 46 | #' @param data a data.frame to append the baseline to
 47 | #'
 48 | #' @return the data.frame passed by the user with an additional variable appended
 49 | #' @importFrom dplyr left_join
 50 | #' @export
 51 | assign_baseline <- function(baseline = NULL, data){
 52 |   bl_data <- get_baseline(baseline)
 53 |   if(baseline == "grade"){
 54 |     data <- as.data.frame(data)
 55 |     out <- sapply(data[, bl_data$keys], bl_data$fun)
 56 |     out <- as.character(out)
 57 |     data <- cbind(data, out)
 58 |     names(data)[ncol(data)] <- baseline
 59 |     data[,ncol(data)] <- as.character(data[,ncol(data)])
 60 |     return(data)
 61 |   }
 62 |   if(is.null(bl_data$keys)){
 63 |     out <- replicate(nrow(data), bl_data$fun(), simplify=FALSE) %>%
 64 |       Reduce("rbind", .)
 65 |     out <- cbind(data, out)
 66 |   } else {
 67 |     if(any(!bl_data$keys %in% names(data))){
 68 |       msg <- paste("Data supplied does not have right keys to merge. Please use columns:",
 69 |                    paste(bl_data$keys, collapse = ", "), sep = " \n ")
 70 |       stop(msg)
 71 |     }
 72 |     data <- as.data.frame(left_join(data, bl_data$data, by = bl_data$keys))
 73 |     var <- names(bl_data$data)[!names(bl_data$data) %in% bl_data$keys]
 74 |     if(length(var) > 1){
 75 |       stop("Variables are labeled wrong in data.")
 76 |     }
 77 |     # Avoid binomial NA warning in ELL baseline
 78 |     suppressWarnings({
 79 |       out <- sapply(data[, var], bl_data$fun)
 80 |     })
 81 |   }
 82 |   return(out)
 83 | }
 84 | 
 85 | # Function to generate conditional probabilities and append them to data
 86 | # Input is data frame, output is a data frame
 87 | # prob_list needs to have the levels of the factor variable and be the same length
 88 | 
 89 | # TODO: UNIT TESTS
 90 | # Document prob_list structure and verify
 91 | 
 92 | #' Generate conditional probabilities by group
 93 | #'
 94 | #' @param data dataframe to add variable to
 95 | #' @param factor grouping variable that probability of \code{newvar} is conditional on
 96 | #' @param newvar name, character, of new variable defined by \code{prob_list}
 97 | #' @param prob_list a list, defining the way \code{newvar} should be generated
 98 | #'
 99 | #' @return data.frame with \code{newvar} appended to dataframe
100 | #' @export
101 | cond_prob <- function(data, factor, newvar, prob_list){
102 |   # Work around tbls
103 |   if(any(class(data) != "data.frame")){
104 |     data <- as.data.frame(data)
105 |   }
106 |   if(!factor %in% names(data)){
107 |     stop("Factor not found in data. Did you forget to create it?")
108 |   }
109 |   data[, newvar] <- NA
110 |   # Error checking, if not all factors are defined in prob_list, issue error
111 |   if(!all(unique(data[, factor]) %in% names(prob_list))){
112 |     missingLevels <-
113 |       as.character(unique(data[, factor])[!unique(data[, factor]) %in%
114 |                                             names(prob_list)])
115 |     msg <- paste0("Probability list does not specify all possible factor levels.",
116 |                   "\n", "Missing levels: \n",
117 |                   paste0(missingLevels, collapse = ",\n"))
118 |     stop(msg)
119 |   }
120 |   # If not all prob_list elements are listed in the data, issue a warning
121 |   if(!all(names(prob_list) %in% unique(data[, factor]))){
122 |     missingLevels <-
123 |       as.character(names(prob_list)[!names(prob_list) %in%
124 |                                       unique(data[, factor])])
125 |     msg <- paste0("Probability list elements not found in data.",
126 |                   "\n", "Missing levels: \n",
127 |                   paste0(missingLevels, collapse = ",\n"))
128 |     warning(msg)
129 |   }
130 | 
131 |   for(i in unique(data[, factor])){
132 |     N <- nrow(data[data[,  factor] == i, ])
133 |     data[data[,  factor] == i, newvar] <- do.call(prob_list[[i]]$f,
134 |                                                   c(list(n = N),
135 |                                                     prob_list[[i]]$pars))
136 | 
137 |   }
138 |   return(data)
139 | }
140 | 
141 | #' Assign student a grade
142 | #'
143 | #' @param age age of the student in years
144 | #' @param ability a modifier that signifies student ability?
145 | #'
146 | #' @return a vector of grade levels
147 | #' @importFrom wakefield level
148 | #' @export
149 | #'
150 | #' @examples
151 | #' age <- c(9, 10, 11, 12)
152 | #' assign_grade(age = age)
153 | assign_grade <- function(age, ability){
154 |   baseGrade <- floor(age - 7)
155 |   maxGrade <- floor(age - 4)
156 |   out <- wakefield::level(1, x = baseGrade:maxGrade, prob = c(0.01, 0.03, 0.9, 0.06))
157 |   return(out)
158 | }
159 | 
160 | 
161 | 
162 | #' Generate a random transition matrix for school enrollments
163 | #'
164 | #' @param nschls integer, the number of schools available, default = 15
165 | #' @param diag_limit a numeric between 0 and 1 that sets the minimum probability that a student
166 | #' will stay schools
167 | #'
168 | #' @return a transition matrix nschls X nschls of transition probabilities
169 | #' @export
170 | #'
171 | #' @examples
172 | #' out <- school_transitions(12)
173 | #' out
174 | school_transitions <- function(nschls = 15L, diag_limit = 0.9){
175 |   stopifnot(class(nschls) %in% c("numeric", "integer"))
176 |   stopifnot(diag_limit > 0 & diag_limit < 1)
177 |   empty <- matrix(rep(0, nschls^2), nrow = nschls, ncol = nschls)
178 |   diag(empty) <- 1
179 |   for(i in 1:nschls){
180 |     diag(empty)[i] <- runif(1, diag_limit, 1)
181 |   }
182 |   for(i in 1:nschls){
183 |     empty[i, ][empty[i,] ==0] <- rand_vect_cont(nschls-1, 1-diag(empty)[i])
184 |   }
185 |   school_tm <- empty
186 |   if(nschls < 100){
187 |     sch_dims <- sprintf("%02d", 1:nschls)
188 |   } else{
189 |     sch_dims <- sprintf("%03d", 1:nschls)
190 |   }
191 |   dimnames(school_tm) <- list(sch_dims, sch_dims)
192 |   return(school_tm)
193 | }
194 | 
195 | #' Generate a grade advancement transition matrix
196 | #'
197 | #' @param ngrades integer, the number of grade levels to simulate
198 | #' @param diag_limit the minimum probability of a student advancing to the next grade
199 | #'
200 | #' @return a matrix, ngrades x ngrades of grade transition probabilities
201 | #' @export
202 | grade_transitions <- function(ngrades=15L, diag_limit = 0.975){
203 |   stopifnot(class(ngrades) %in% c("numeric", "integer"))
204 |   stopifnot(diag_limit > 0 & diag_limit < 1)
205 |   empty <- matrix(rep(0, ngrades^2), nrow = ngrades, ncol = ngrades)
206 |   pool <- sort(runif(ngrades, diag_limit, 1))
207 |   if(ngrades > 9){
208 |     diag(empty) <- pool[c(1:3, 8:(ngrades-1), 4, 5, 6, 7, ngrades)]
209 |   } else{
210 |     diag(empty) <- pool
211 |   }
212 |   empty <- diag_offset(empty, 1L)
213 |   for(g in 1:ngrades){
214 |     i <- g + 1
215 |     elem <- (i-2):(i+2)
216 |     if(any(elem < 1) | any(elem > ngrades)){
217 |       elem <- elem[elem >= 1]
218 |       elem <- elem[elem <= ngrades]
219 |     }
220 |     elem <- elem[-which(elem == i)]
221 |     if(i > ngrades){
222 |       fill <- round(rand_vect_cont(N = length(elem), runif(1, diag_limit, 1),
223 |                                    sd = 4), 4)
224 |     } else{
225 |       fill <- round(rand_vect_cont(N = length(elem), 1-empty[g, i], sd = 4), 4)
226 |     }
227 |     fill <- sort(fill, decreasing = TRUE)
228 |     fill <- na.omit(fill[c(2, 1, 4, 3)])
229 |     attributes(fill) <- NULL # strip NA info
230 |     empty[g, ][elem] <- fill
231 |   }
232 |   empty[ngrades, ngrades] <- 1
233 |   grades_tm <- empty
234 |   dimms <- convert_grade(paste0("g", -1:(ngrades-2)))
235 |   dimnames(grades_tm) <- list(dimms, dimms)
236 |   grades_tm <- tm_convert(grades_tm)
237 |   return(grades_tm)
238 | }
239 | 
240 | 
241 | 
242 | #' Offset the diagonal values of a matrix
243 | #'
244 | #' @param matrix a square matrix
245 | #' @param offset an integer value to offset the diagonal by
246 | #'
247 | #' @return a square matrix
248 | #' @export
249 | diag_offset <- function(matrix, offset = 1L){
250 |   for(i in 1:nrow(matrix)){
251 |     matrix[i, ] <- c(rep(0, offset), matrix[i, ][1:(length(matrix[i, ])-offset)])
252 |   }
253 |   return(matrix)
254 | }
255 | 
256 | 
257 | 
258 | 
259 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
  3 | OpenSDPsynthR
  4 | =============
  5 | 
  6 | ![](tools/figs/open_sdp_logo_red.png)
  7 | 
  8 | A project to generate realistic synthetic unit-level longitudinal education data to empower collaboration in education analytics.
  9 | 
 10 | Design Goals
 11 | ------------
 12 | 
 13 | 1.  Generate synthetic education data that is realistic for use by analysts across the education sector. Realistic means messy, and reflective of the general pattern of relationships found in the U.S. education sector.
 14 | 2.  Synthetic data should be able to be generated on-demand and responsive to inputs from the user. These inputs should allow the user to configure the process to produce data that resembles the patterns of data in their agency.
 15 | 3.  The package should be modular and extendable allowing new data topics to be generated as needed so synthetic data coverage can grow.
 16 | 
 17 | Structure
 18 | ---------
 19 | 
 20 | The package is organized into the following functions:
 21 | 
 22 | -   `simpop()` is the overall function that runs the simulation, this function calls many subfunctions to simulate different elements of the student data
 23 | -   `cleaners` are functions which take the output from the `simpop` function and reshape it into data formats for different analyses. Currently only two cleaners are supported -- `CEDS` and `sdp_cleaner()` which prepare the data into a CEDS like format and into the Strategic Data Project college-going analysis file specification respectively.
 24 | -   `sim_control()` -- a function that controls all of the parameters of the `simpop` simulation. The details of this function are covered in the vignettes.
 25 | 
 26 | Get Started
 27 | ===========
 28 | 
 29 | To use `OpenSDPsynthR`, follow the instructions below:
 30 | 
 31 | Install Package
 32 | ---------------
 33 | 
 34 | The development version of the package is able to be installed using the `install_github()`. To use this command you will need to install the `devtools` package.
 35 | 
 36 | ``` r
 37 | devtools::install_github("opensdp/OpenSDPsynthR")
 38 | ```
 39 | 
 40 | Make some data
 41 | --------------
 42 | 
 43 | Load the package
 44 | 
 45 | ``` r
 46 | library(OpenSDPsynthR)
 47 | #> Loading required package: dplyr
 48 | #> 
 49 | #> Attaching package: 'dplyr'
 50 | #> The following objects are masked from 'package:stats':
 51 | #> 
 52 | #>     filter, lag
 53 | #> The following objects are masked from 'package:base':
 54 | #> 
 55 | #>     intersect, setdiff, setequal, union
 56 | #> Loading required package: lme4
 57 | #> Loading required package: Matrix
 58 | ```
 59 | 
 60 | The main function of the package is `simpop` which generates a list of data elements corresponding to simulated educational careers, K-20, for a user specified number of students. In R, a list is a data structure that can contain multiple data elements of different structures. This can be used to emulate the multiple tables of a Student Information System (SIS).
 61 | 
 62 | ``` r
 63 | out <- simpop(nstu = 500, seed = 213, control = sim_control(nschls = 3))
 64 | #> Preparing student identities for 500 students...
 65 | #> Creating annual enrollment for 500 students...
 66 | #> Assigning 500 students to initial FRPL, IEP, and ELL status
 67 | #> Assigning initial grade levels...
 68 | #> Organizing status variables for you...
 69 | #> Assigning 500 students longitudinal status trajectories...
 70 | #> Sorting your records
 71 | #> Cleaning up...
 72 | #> Creating 3 schools for you...
 73 | #> Assigning 6946 student-school enrollment spells...
 74 | #> Simulating assessment table... be patient...
 75 | #> Simulating high school outcomes... be patient...
 76 | #> Simulating annual high school outcomes... be patient...
 77 | #> Simulating postsecondary outcomes... be patient...
 78 | #> Success! Returning you student and student-year data in a list.
 79 | ```
 80 | 
 81 | Currently ten tables are produced:
 82 | 
 83 | ``` r
 84 | names(out)
 85 | #>  [1] "demog_master" "stu_year"     "schools"      "stu_assess"  
 86 | #>  [5] "hs_outcomes"  "hs_annual"    "nsc"          "ps_enroll"   
 87 | #>  [9] "assessments"  "proficiency"
 88 | ```
 89 | 
 90 | Data elements produced include:
 91 | 
 92 | -   **Student demographics:** age, race, and sex
 93 | -   **Student participation:** grade advancement, ELL status, IEP status, FRPL status, gifted and talented status, attendance
 94 | -   **Student enrollment status:** exit type, enrollment type, transfer, graduation, dropout, etc.
 95 | -   **School attributes:** name, school category, school size, Title I and Title III status, etc.
 96 | -   **Student assessment:** math assessment, reading assessment, grade level assessed
 97 | -   **High school outcomes:** graduation, cumulative GPA, graduation type, cohort, class rank, postsecondary enrollment
 98 | -   **High school progression:** annual class rank, cumulative credits earned, credits earned, credits by English Language Arts and by Mathematics, credits attempted, ontrack status
 99 | -   **Postsecondary enrollment:** year of enrollment, transfer indicator, name and ID of postsecondary institution, type of institution
100 | -   **Postsecondary institution:** name, city, state, online only, average net price, Pell grant rate, retention four year full time, share of part time enrollment, enrollment by race, SAT and ACT score distribution for admitted students
101 | 
102 | There are two tables of metadata about the assessment data above to be used in cases where multiple types of student assessment are analyzed together.
103 | 
104 | -   **Assessment information:** grade, subject, ID, type, and name of assessment
105 | -   **Proficiency information:** mean score, error of score, number of students tested
106 | 
107 | ``` r
108 | head(out$demog_master %>% arrange(sid) %>% select(1:4))
109 | #>   sid    Sex  Birthdate                         Race
110 | #> 1 001   Male 2000-01-30                        White
111 | #> 2 002   Male 1998-01-25                        White
112 | #> 3 003 Female 2000-10-22    Black or African American
113 | #> 4 004 Female 2003-10-05                        White
114 | #> 5 005 Female 2001-05-27 Hispanic or Latino Ethnicity
115 | #> 6 006 Female 1998-12-16                        White
116 | head(out$stu_year, 10)
117 | #> Source: local data frame [10 x 17]
118 | #> Groups: sid [10]
119 | #> 
120 | #>       sid  year   age grade  frpl   ell   iep gifted grade_advance
121 | #>    <fctr> <dbl> <dbl> <chr> <chr> <chr> <chr>  <chr>         <chr>
122 | #> 1     002  2002     5    KG     0     0     0      0          <NA>
123 | #> 2     009  2002     6     1     1     0     0      1          <NA>
124 | #> 3     014  2002     5    KG     1     0     0      0          <NA>
125 | #> 4     017  2002     4    PK     0     0     0      0          <NA>
126 | #> 5     023  2002     5    KG     0     0     0      0          <NA>
127 | #> 6     024  2002     4    PK     0     0     0      0          <NA>
128 | #> 7     028  2002     5    KG     0     0     0      1          <NA>
129 | #> 8     030  2002     5    KG     0     0     0      0          <NA>
130 | #> 9     031  2002     6     1     1     0     0      0          <NA>
131 | #> 10    034  2002     5    KG     1     0     0      0          <NA>
132 | #> # ... with 8 more variables: cohort_year <dbl>, cohort_grad_year <dbl>,
133 | #> #   exit_type <lgl>, enrollment_status <chr>, ndays_possible <dbl>,
134 | #> #   ndays_attend <dbl>, att_rate <dbl>, schid <chr>
135 | ```
136 | 
137 | Cleaners
138 | --------
139 | 
140 | You can reformat the synthetic data for use in specific types of projects. Currently two functions exist to format the simulated data into an analysis file matching the SDP College-going data specification and a CEDS-like data specification. More of these functions are planned in the future.
141 | 
142 | ``` r
143 | cgdata <- sdp_cleaner(out)
144 | ceds <- ceds_cleaner(out)
145 | ```
146 | 
147 | Control Parameters
148 | ------------------
149 | 
150 | By default, you only need to specify the number of students to simulate to the `simpop` command. The package has default simulation parameters that will result in creating a small school district with two schools.
151 | 
152 | ``` r
153 | names(sim_control())
154 | #>  [1] "nschls"                "best_schl"            
155 | #>  [3] "race_groups"           "race_prob"            
156 | #>  [5] "minyear"               "maxyear"              
157 | #>  [7] "gifted_list"           "iep_list"             
158 | #>  [9] "ses_list"              "ell_list"             
159 | #> [11] "ps_transfer_list"      "grade_levels"         
160 | #> [13] "n_cohorts"             "school_means"         
161 | #> [15] "school_cov_mat"        "school_names"         
162 | #> [17] "postsec_names"         "gpa_sim_parameters"   
163 | #> [19] "grad_sim_parameters"   "ps_sim_parameters"    
164 | #> [21] "assess_sim_par"        "assessment_adjustment"
165 | #> [23] "grad_adjustment"       "ps_adjustment"        
166 | #> [25] "gpa_adjustment"        "assess_grades"        
167 | #> [27] "n_postsec"             "postsec_method"
168 | ```
169 | 
170 | These parameters can have complex structures to allow for conditional and random generation of data. Parameters fall into four categories:
171 | 
172 | -   **vectors:** a single list of parameters like school names, category names, or school IDs
173 | -   **conditional probability list:** an R list that contains a variable to group by, a function to generate data with, and a list of parameters for that function for each group in the grouping variable
174 | -   **outcome simulation parameters:** an R list of arguments to pass to the `simglm` function
175 | -   **outcome adjustments:** an R list of lists, with functions that modify a variable in an existing data set
176 | 
177 | For more details, see the simulation control vignette.
178 | 
179 | ``` r
180 | vignette("Controlling the Data Simulation", package = "OpenSDPsynthR")
181 | ```
182 | 
183 | Package Dependencies
184 | --------------------
185 | 
186 | -   `dplyr`
187 | -   `lubridate`
188 | -   [wakefield](https://www.github.com/trinker/wakefield)
189 | -   [simglm](https://www.github.com/lebebr01/simglm)
190 | 
191 | OpenSDP
192 | -------
193 | 
194 | `OpenSDPsynthR` is part of the OpenSDP project.
195 | 
196 | [OpenSDP](https://opensdp.github.io) is an online, public repository of analytic code, tools, and training intended to foster collaboration among education analysts and researchers in order to accelerate the improvement of our school systems. The community is hosted by the [Strategic Data Project](https://sdp.cepr.harvard.edu), an initiative of the [Center for Education Policy Research at Harvard University](https://cepr.harvard.edu). We welcome contributions and feedback.
197 | 
198 | These materials were originally authored by the Strategic Data Project.
199 | 


--------------------------------------------------------------------------------
/vignettes/sim_diagnostics.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Structuring the OpenSDPsynthR Package"
  3 | author: "Jared E. Knowles"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{Simulation Diagnostics}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ## Introduction
 13 | 
 14 | 
 15 | ### Diagnostics
 16 | 
 17 | How do we know it worked? We can look at the patterns of ELL enrollment that 
 18 | are observed and see what patterns are the most common. To do this, let's compute 
 19 | the frequency of transition states observed per student. 
 20 | 
 21 | ```{r genData}
 22 | library(OpenSDPsynthR)
 23 | simouts <- simpop(nstu = 1000, control = sim_control(nschls = 8L))
 24 | stu_year <- simouts$stu_year
 25 | 
 26 | ```
 27 | 
 28 | 
 29 | 
 30 | ```{r ellDiagnostic}
 31 | library(ggplot2)
 32 | library(tidyr)
 33 | plotdf <- stu_year %>% arrange(sid, year) %>% group_by(sid) %>% 
 34 |   do(tidy_sequence(.$ell, states = c(1, 0)))
 35 | 
 36 | plotdf$total <- rowSums(plotdf[, -1])
 37 | plotdf <- plotdf %>% gather(-sid, key = "Transition", value = "Count")
 38 | 
 39 | # plotdf %>% group_by(Transition) %>% filter(Transition != "total") %>%
 40 | #   summarize(sum(Count))
 41 | 
 42 | plotdf <- plotdf %>% filter(Transition != "total")  %>% 
 43 |   group_by(sid) %>% 
 44 |   mutate(total = sum(Count)) %>% 
 45 |   mutate(per = Count / total) %>% filter(Transition != "total")  %>% 
 46 |   separate(Transition, into = c("From", "To"), sep = "-") 
 47 | 
 48 | ggplot(plotdf, aes(Count)) + geom_histogram() + 
 49 |   scale_x_continuous(breaks = c(0:25)) + 
 50 |   facet_grid(From~To, labeller = label_both, switch = "y") + 
 51 |   theme_bw() + 
 52 |   labs(title = "Frequency of Transition States by Student - ELL", 
 53 |        y = "Count", x = "Times per Student State Observed")
 54 | 
 55 | ```
 56 | 
 57 | Looking at this chart we can see that most students went from the No state to 
 58 | a No state -- as would be expected when there are few ELLs. 
 59 | 
 60 | Through this process we've gained students in the ELL status who were not 
 61 | initially ELL. Depending on our application this may not be desirable and we 
 62 | may want to modify the transition matrix to avoid this. Otherwise, later, 
 63 | this becomes an exercise in data cleaning. 
 64 | 
 65 | Two other visual diagnostics are below. 
 66 | 
 67 | ```{r visualdiagnostics}
 68 | # Other plots
 69 | 
 70 | # ggplot(plotdf, aes(per)) + geom_density() + 
 71 | #   facet_grid(From ~ To, labeller = label_both, switch = "y") + 
 72 | #   theme_bw() + labs(title = "By Student Densities of Transitions")
 73 | 
 74 | # Heatmap
 75 | plotdf %>% group_by(From, To) %>% 
 76 |   summarise(Count = sum(Count)) %>% 
 77 |   ungroup %>% 
 78 |   mutate(total = sum(Count)) %>%
 79 |   mutate(per = Count/total) %>%
 80 | ggplot(aes(x = From, y = To, fill = per)) + 
 81 |   geom_tile(color= I("black")) + 
 82 |   geom_text(aes(label = round(per, digits = 2))) + 
 83 |   theme_minimal() +
 84 |   coord_cartesian() + labs(title = "Heatmap of ELL Transition States")
 85 | 
 86 | ```
 87 | 
 88 | 
 89 | We can also do a comparative diagnostic. Given the relatively short length of 
 90 | our sequence per student, it will be hard to estimate fit from a short sequence. 
 91 | 
 92 | 
 93 | ```{r testbyStudent}
 94 | # series <- stu_year$ell[stu_year$ID == "1705"]
 95 | # series <- stu_year$ell[stu_year$ID == "0001"]
 96 | 
 97 | test_fit <- function(series, expected){
 98 |   if(dim(table(series)) == 1){
 99 |     return(TRUE)
100 |   } else {
101 |   out <- fit_series(series, return = "fit", confidencelevel = 0.99, 
102 |                     possibleStates = rownames(expected))
103 |   low <- out$lowerEndpointMatrix < expected
104 |   hi <- out$upperEndpointMatrix > expected
105 |   return(all(low, hi))
106 |   }
107 | }
108 | 
109 | defaultFit <- sim_control()$ell_list$ALL$pars$tm
110 | 
111 | test_res <- stu_year %>% group_by(sid) %>% 
112 |   summarize(fit_ok = test_fit(ell, expected = defaultFit))
113 | 
114 | table(test_res$fit_ok)
115 | ```
116 | 
117 | 
118 | Let's look at co-occurrence of status over time. 
119 | 
120 | ```{r CrossTabs}
121 | # Look at by year patterns of relationships by student year
122 | table(FRL = stu_year$frpl, GIFTED = stu_year$gifted)
123 | table(FRL = stu_year$frpl, IEP = stu_year$iep)
124 | table(GIFTED = stu_year$gifted, IEP = stu_year$iep)
125 | 
126 | ```
127 | 
128 | Let's check polychoric correlations:
129 | 
130 | ```{r}
131 | gamma_GK(stu_year$gifted, stu_year$iep)
132 | gamma_GK(stu_year$frpl, stu_year$iep)
133 | gamma_GK(stu_year$frpl, stu_year$ell)
134 | ```
135 | 
136 | 
137 | Finally, let's see who winds up "ever" in each category
138 | 
139 | ```{r collapseEver}
140 | 
141 | test_df <- stu_year %>% group_by(sid) %>% 
142 |   summarize(iep_ever = if_else(any(iep == 1), "Yes", "No"), 
143 |             ell_ever = if_else(any(ell == 1), "Yes", "No"), 
144 |             frpl_ever = if_else(any(frpl == 1), "Yes", "No"), 
145 |             gifted_ever = if_else(any(gifted == 1), "Yes", "No"))
146 | 
147 | table(IEP_EVER = test_df$iep_ever)
148 | table(ELL_EVER = test_df$ell_ever)
149 | table(FRPL_EVER = test_df$frpl_ever)
150 | table(GIFTED_EVER = test_df$gifted_ever)
151 | 
152 | ```
153 | 
154 | 
155 | ## Assigning Schools and Outcomes
156 | 
157 | Students move through grades, schools, and outcomes. 
158 | 
159 | ```{r, include=FALSE, eval=FALSE}
160 | # Outcome assignment, outcomes are assigned in order
161 | ## sat_act
162 | ## ps_enroll
163 | 
164 | # TODO: Consider including diploma attainment...
165 | 
166 | out <- simpop(nstu = 1250, seed = 1241, sim_control(nschls = 9))
167 | final_data <- sdp_cleaner(out)
168 | 
169 | ggplot(out$assessment, aes(x = age, y = math_ss, group = sid)) + 
170 |   geom_line(alpha = I(0.2)) + 
171 |   # geom_smooth(method = 'lm', se=FALSE, color = I("black"), alpha = I(0.2)) +
172 |   facet_wrap(~schid) 
173 | 
174 | 
175 | score_table <- assess %>% group_by(year, age) %>% 
176 |   summarize(read_mean = mean(rdg_ss), 
177 |             read_sd = sd(rdg_ss), 
178 |             math_mean = mean(math_ss), 
179 |             math_sd = sd(math_ss))
180 | 
181 | assess <- left_join(assess, score_table)
182 | assess$math_std <- (assess$math_ss - assess$math_mean) / assess$math_sd
183 | assess$read_std <- (assess$rdg_ss - assess$read_mean) / assess$read_sd
184 | cor(assess$math_std, assess$read_std, use = "pairwise")
185 | 
186 | ggplot(assess, aes(x = age, y = math_std, group = sid)) +
187 |   facet_wrap(~schid) + geom_line(alpha = I(0.2)) +
188 |   geom_smooth(aes(group=1), se = FALSE)
189 | 
190 | 
191 | 
192 | ggplot(assess, aes(x = math_ss, y = rdg_ss)) + geom_point(alpha = I(0.2))
193 | ggplot(assess, aes(x = math_std, y = read_std)) + geom_point(alpha = I(0.2))
194 | 
195 | idx <- sample(unique(assess$sid), 12)
196 | ggplot(assess[assess$sid %in% idx, ], aes(x = age, y = math_std)) + 
197 |   facet_wrap(~sid) + geom_line() + geom_smooth(method = 'lm', se=FALSE) + 
198 |   geom_hline(yintercept = 0, color = I("red")) + geom_point()
199 | 
200 | ggplot(assess[assess$sid %in% idx, ], aes(x = age, y = math_ss)) + 
201 |   facet_wrap(~sid) + geom_line() + geom_smooth(se=FALSE) 
202 | 
203 | 
204 | 
205 | 
206 | g12_cohort <- out$stu_year[out$stu_year$grade == "12", ]
207 | g12_cohort <- na.omit(g12_cohort)
208 | g12_cohort <- left_join(g12_cohort, out$demog_master[, 1:4], by = "sid")
209 | g12_cohort$male <- ifelse(g12_cohort$Sex == "Male", 1, 0)
210 | hs_outcomes <- OpenSDPsynthR:::assign_hs_outcomes(g12_cohort, 
211 |                                                  control = sim_control())
212 | 
213 | zzz <- out$hs_outcomes
214 | 
215 | dddff <- do.call(gen_outcome_model, ps_sim_parameters)
216 | 
217 | 
218 | 
219 | 
220 | df <- sim_glm(fixed = fixed, random = random,
221 |                   fixed_param = fixed_param, random_param = random_param,
222 |                   random3 = NULL,
223 |                   random_param3 = NULL,
224 |                   cov_param = cov_param,
225 |                   fact_vars = fact_vars, k = NULL,
226 |                   n = ngrps, p = NULL,
227 |                   cor_vars = cor_vars, data_str = "cross", unbal = TRUE,
228 |                   unbalCont = unbalanceRange)
229 |     mod <- glmer(update(fixed, "sim_data ~ . - math_ss + (1|clustID)"),
230 |                  data = df, family = "binomial")
231 | 
232 | # out <- simpop(nstu = 400, seed = 32231, 
233 | #           control = sim_control(nschls = 3L))
234 | 
235 | # Student	Student Year	Outcome
236 | # sid	assessment	hs_diploma
237 | # race_ethnicity	school_id	cum_gpa_final
238 | # sex	on_track	sat_act
239 | # frpl_ever	frpl	ps_enroll
240 | # ell_ever	ell	dropout
241 | # gifted_ever	gifted	transfer
242 | # iep_ever	iep	disappear
243 | # grade_level	still_enroll
244 |     random <- ~ 1
245 |   random_param <- list(random_var = random_var, rand_gen = "rnorm")
246 | 
247 |  library(simglm)
248 |   
249 | assess_sim_par <- list(
250 |   fixed = ~ 1 + time + gifted + iep + frpl + ell + male,
251 |   random = ~ 1 + time,
252 |   random3 = ~ 1 + time,
253 |   cor_vars = c(-0.276, -0.309, -0.046, -0.033,
254 |               -0.03, -0.029, -0.003, 0.06, 0.007, 0.001),
255 |   fixed_param = c(0.0024, 0.75, 0.10, -0.161388, -0.075, -0.056, 0.007),
256 |   fact_vars = NULL,
257 |   # intercept + any slopes in length
258 |   random_param = list(random_var = c(0.2, 0.1), cor_vars = c(0.4), rand_gen = 'rnorm'),
259 |   random_param3 = list(random_var = c(0.3, 0.025), rand_gen = 'rnorm'), # intercept + any slopes in length
260 |   cov_param = list(
261 |                  dist_fun = c("rbinom", "rbinom", "rbinom", "rbinom", "rbinom"),
262 |                  var_type = rep("lvl1", 5), 
263 |                  opts = list(
264 |                    list(size = 1, prob = 0.1), 
265 |                    list(size = 1, prob = 0.2), 
266 |                    list(size = 1, prob = 0.45), 
267 |                    list(size = 1, prob = 0.1), 
268 |                    list(size = 1, prob = 0.52)
269 |                   )
270 |                ),
271 |   unbalCont = c(2, 16),
272 |   unbalCont3 = c(100, 800),
273 |   unbal = TRUE, 
274 |   # Total number of level 2 groups = k * n
275 |   k = 15, # level 3 groups
276 |   n = 200, # obs per group level 2 group
277 |   p = 400, # obs per group?
278 |   error_var = 1,
279 |   with_err_gen = 'rnorm',
280 |   lvl1_err_params = list(mean = 0, sd = 1),
281 |   data_str = "long"
282 | )
283 |   
284 |  assess_table <- do.call(sim_reg, assess_sim_par, quote = TRUE)
285 |  # needs to be the length of all correlations between predictors
286 | temp_three <- sim_reg(fixed = fixed, random = random, random3 = random3,
287 |                       fixed_param = fixed_param, random_param = random_param,
288 |                       random_param3 = random_param3, cov_param = cov_param,
289 |                       fact_vars = fact_vars, k = k,n = n, p = p,
290 |                       lvl1_err_params = lvl1_err_params,
291 |                       error_var= error_var, with_err_gen = with_err_gen,
292 |                       cor_vars = cor_vars, data_str = "long", unbal = TRUE,
293 |                       unbalCont = unbalCont, unbalCont3 = unbalCont3)
294 |  
295 | 
296 | library(ggplot2)
297 | ggplot(temp_three, aes(x = time, y = sim_data, group = clustID)) +
298 |   geom_line(alpha = I(0.2)) + facet_wrap(~clust3ID)
299 | 
300 | names(temp_three)[1:6] <- c("intercept", "age", "gifted", "iep", "frpl",
301 |                        "ell")
302 | names(temp_three)[14] <- "math_ss"
303 | names(temp_three)[15:17] <- c("time", "sid", "schid")
304 | 
305 | ggplot(temp_three, aes(x = age, y = math_ss, group = sid)) +
306 |   geom_line(alpha = I(0.2)) + facet_wrap(~schid)
307 | 
308 | #witihnID = time, nested w/in level 2
309 | 
310 | library(lme4)
311 | proof <- lmer(math_ss ~ 1 + age + gifted +
312 |                 iep + frpl + ell +
313 |                 (1 + age | sid) +
314 |                 (1 | schid), data = temp_three)   
315 | 
316 | ```
317 | 
318 | 
319 | 


--------------------------------------------------------------------------------
/data-raw/build_data.R:
--------------------------------------------------------------------------------
  1 | # Build data
  2 | library(purrr)
  3 | library(tidyr)
  4 | 
  5 | ## Build example ELL baseline data
  6 | ell <- read.csv("data-raw/ellDist.csv")
  7 | ell <- na.omit(ell)
  8 | ell <-  ell %>% gather(-age, key = "race", value = "prob")
  9 | 
 10 | ## Build example SDP/CEDS crosswalk
 11 | xwalk <- read.csv("data-raw/CEDS_SDP_map.csv", stringsAsFactors = FALSE)
 12 | xwalk$schema <- NA
 13 | for(i in 1:nrow(xwalk)){
 14 |   xwalk$schema[i] <- I(list(OpenSDPsynthR:::get_code_values(xwalk$CEDS_Option_set[i])))
 15 | }
 16 | 
 17 | # SES data
 18 | ses <- data.frame(race = c("black", "asian", "hispanic", "amerind", "white",
 19 |                            "other", "multiracial", "hawaiian_pi"),
 20 |                   prob = c(0.65, 0.375, 0.6, 0.4, 0.4, 0.4, 0.4, 0.4))
 21 | ses$race <- as.character(ses$race)
 22 | 
 23 | # Program Baseline
 24 | prog_baseline <- read.csv("data-raw/program_baseline.csv")
 25 | prog_baseline[, 1] <- as.character(prog_baseline[, 1])
 26 | prog_baseline[, 2] <- as.character(prog_baseline[, 2])
 27 | prog_baseline[, 3] <- as.character(prog_baseline[, 3])
 28 | names(prog_baseline) <- tolower(names(prog_baseline))
 29 | prog_baseline$frpl[prog_baseline$frpl == "2"] <- "1"
 30 | prog_baseline <- prog_baseline %>% group_by(ell, iep, frpl) %>%
 31 |   summarize(count = sum(count), prob = sum(prob)) %>% as.data.frame()
 32 | # Map CEDS names and option names
 33 | # prog_baseline[, 1:3] <- recode_options(prog_baseline[, 1:3], from = "SDP")
 34 | # names(prog_baseline)[1:3] <- map_CEDS(names(prog_baseline)[1:3])
 35 | 
 36 | ## Pull in age_grade baseline
 37 | age_grade <- read.csv("data-raw/age_grade_baseline.csv")
 38 | names(age_grade) <- c("age", paste0("g", -1:12), "total")
 39 | age_grade[, 2:15] <- round(age_grade[, 2:15] / rowSums(age_grade[, 2:15]), 5)
 40 | age_grade$total <- NULL
 41 | 
 42 | sch_names <- c("Jackson", "Wallaby", "Kendrick", "Willow Creek", "Cypress",
 43 |                     "Dover Hills", "Milton South", "Spring Port", "Dogwood", "Topshire",
 44 |                     "Hillside", "Donohue", "Upper Falls", "George", "Avondale", "East Valley",
 45 |                     "Township", "Clark", "Pollock", "Duvall", "Santiago", "Hickory",
 46 |                     "Robin", "Warren", "Alvin", "Allen", "Haskin", "Hawthorne", "Woodcliff",
 47 |                     "Wright", "Buchanan", "Oriole", "Diamond Lake", "Walnut", "Central",
 48 |                     "Arbor", "Gail Hill", "Southbridge", "Silver Oak", "Marsden",
 49 |                     "Crossroads", "Meridian", "Linden", "Success", "Jefferson", "Columbia",
 50 |                     "Falcon", "Apollo", "Hampshire", "Madison", "Briarfield", "Jubilee",
 51 |                     "Magnolia", "Bauer", "Stone Street", "Angelea", "Riva Ridge",
 52 |                     "Perez", "Lincoln", "Goldeneye", "Hazel", "Vidalia", "Lewis",
 53 |                     "Freedman", "Venice", "Wintergreen", "Holden", "Beebe", "Red Hills",
 54 |                     "Floral", "Pressley", "Winchester", "Oak Tree", "Laurel", "Kingfisher",
 55 |                     "Verndale", "Kennedy", "Flagstaff", "Laughlin", "East Harley",
 56 |                     "Rolling Knoll", "Elizabeth", "Meadow", "Wakeland", "Rockford",
 57 |                     "Hummingbird", "Castle Rock", "Sugarplum", "London Lane", "Yawkey",
 58 |                     "Sparrow", "Watercress", "Lowell", "Mountain", "Bilford", "Maple",
 59 |                     "Ritchie", "Dalton", "Homer", "Lower Falls", "Midlands", "Pike",
 60 |                     "Tucker", "Whitebridge", "Pinkney", "Athena", "Acorn", "Wilson",
 61 |                     "Common Way", "Eastbridge", "Blue Hills", "Onassis", "Capital City",
 62 |                     "Wildwood", "Noble", "John Brown", "Ferdinand", "Weeks", "Majestic",
 63 |                     "Starlight", "Hemingway", "Spellman", "Olivero", "Cornerstone",
 64 |                     "Van Dusen", "Emblem", "Peyton", "Pleasant Lake", "West Harley",
 65 |                     "Einstein", "Frederick", "Middlewood", "Burton", "Graham", "Lone Pine",
 66 |                     "Tyler", "Yellow Creek", "Sargent", "Cathedral", "Moseley", "Justice",
 67 |                     "Avocet", "Anderson", "James", "Barclay", "Marie Curie", "Hunt Circle",
 68 |                     "Eastbourne", "Sunset", "Seaside", "Paisley", "Jupiter", "Prentice",
 69 |                     "Lockland", "Canyon", "Mayer", "Westerley", "Sweetville", "Lindgren",
 70 |                     "Carmen", "Hawking", "Lindsey", "Hunnewell", "Eagle", "Holly",
 71 |                     "Reigh Count", "Valley Way", "Navigation", "Olsen", "Daisy Hill",
 72 |                     "Hope", "Eliot", "Coleman", "Milton North", "Sage", "Lang", "Sierra Lane",
 73 |                     "Commander", "Rosa Parks", "Cicero", "Chickamee", "Labyrinth",
 74 |                     "Calvin Lee", "Nightland", "Forbes", "Sterling", "Triumph", "Wellington",
 75 |                     "MacDonald", "Englander", "Tupelo", "Beckham", "Franklin", "Astro",
 76 |                     "Aristides", "Morris", "Shortleaf", "Cardinal", "Little Valley",
 77 |                     "Applewood", "Roosevelt", "Marvel", "Juniper Hill", "Esperanza",
 78 |                     "Touchstone", "Blakeville", "Shafer", "Sebastian", "Polaris",
 79 |                     "Ponderosa", "Mount Lyon", "Lafayette", "New Beacon", "Chesterfield",
 80 |                     "Harmony", "Rosebud", "Tremont", "Fortuna", "North Star", "Everett",
 81 |                     "Basswood", "Islander", "Martin", "Peabody", "Northbridge", "Hanover",
 82 |                     "Rainbow", "Bicknell", "Courtdale", "Miller", "Mulberry", "Hadley",
 83 |                     "Greenwich", "Ridley", "Birch", "Albert", "Culver", "Chelsea",
 84 |                     "Unity", "Brookfield", "Irving", "Acacia", "Alliance", "Douglass",
 85 |                     "Brennan", "Clivedale", "Shaheen", "Tesla", "Young Oak", "Duncan",
 86 |                     "Selwyn", "Kumar", "Woodland", "Cedar", "Goldfinch", "Sullivan",
 87 |                     "Buffalo Way", "Danehill", "Simpson", "Nyquist", "Chan", "Alewife",
 88 |                     "Galaxy", "Highland", "Marlowe", "Tower", "Alonzo", "Prairie",
 89 |                     "Eureka", "King", "Humphrey", "Knightwood", "Gallant", "Boundbrook",
 90 |                     "Cherry Hill", "Kirkland", "Nobscot", "Suarez", "Floyd", "Harrison",
 91 |                     "Greenfield", "Crane", "Rhodes", "Sea Glass", "Westminster",
 92 |                     "Horan", "Copper Cove", "Fairbanks", "Marshfield", "Ashmont",
 93 |                     "Heron", "Henderson", "Ridge Park", "Kent", "Friendship", "Pegasus",
 94 |                     "Garden", "Adler", "Shuster", "Dickens", "Liberty", "Edison",
 95 |                     "Blanchard", "Camino", "Baker", "Cilian", "Speedwell", "Bootes",
 96 |                     "Darwin", "Quincy", "Cold Springs", "Chestnut", "Secretariat",
 97 |                     "Gateway", "Dudley", "Horace Mann", "Glenbrook", "Pompano", "Park",
 98 |                     "Maverick", "Hyde", "Leeds", "Percival", "Davis", "Shirley",
 99 |                     "Redwood", "Murphy", "Poplar", "Fillmore", "Thompson", "Pharaoh",
100 |                     "Bennett", "Kinney", "Gilbert", "Marshall", "Monarch", "Doyle",
101 |                     "Walker", "Clarion", "Spinelli", "Inspiration", "Neptune", "Gibson",
102 |                     "Independence", "Josephine", "Finnley", "Summit", "Bloomfield",
103 |                     "Zola", "Ventura", "Lakeshore", "Phoenix", "Honeypot", "Chateau",
104 |                     "Carter ", "Velocidad", "Utopia", "Owen", "Turcotte", "Sorrel Ridge",
105 |                     "Fawzi", "Woodpecker", "Chisholm", "Hoffman", "Lever", "Ace Hall",
106 |                     "Davidson", "Weston", "Forest Heights", "Finch", "Monroe", "Caldwell",
107 |                     "Hamilton", "Vision", "Elk Grove", "Eleanor", "Quicksilver",
108 |                     "Longleaf", "Chiswick", "Buckeye", "Winthrop", "Austen", "Reyes",
109 |                     "Juneberry", "Waldron", "Adams", "Westbridge", "Fraser", "Truman",
110 |                     "Old Steeple", "Turnstone", "Memorial", "Sycamore", "Sequoia",
111 |                     "Sylvan", "Sandy Beach", "Hathaway", "Mandalay", "Deer Corner",
112 |                     "Clearview", "Corcoran", "Cabot", "Lookout Point")
113 | 
114 | # ps_list <- ps_list %>% select(ope8_id, city, state, name,
115 | #                               degrees_awarded_predominant,
116 | #                               size, online_only, avg_net_price_pub, pell_grant_rate,
117 | #                               retention_four_year_full_time, operating,
118 | #                               part_time_share, act_25th_pctl_cumulative,
119 | #                               act_75th_pctl_cumulative, sat_average_all,
120 | #                               sat_25th_pctl_math, sat_75th_pctl_math,
121 | #                               sat_25th_pctl_reading, sat_75th_pctl_reading,
122 | #                               sat_25th_pctl_writing, sat_75th_pctl_writing,
123 | #                               race_ethn_white, race_ethn_black, race_ethn_hispanic,
124 | #                               race_ethn_asian, race_ethn_two_or_more)
125 | # ps_list <- zap_formats(ps_list)
126 | # ps_list <- as_factor(ps_list)
127 | # ps_list <- ps_list %>% filter(!degrees_awarded_predominant %in% c("Not Classified",
128 | #                                                                  "Entirely graduate-degree granting"))
129 | # ps_list <- ps_list %>% filter(size > 0) %>% filter(size < 100000)
130 | # ps_list <- ps_list %>% filter(operating == 1) %>% select(-operating)
131 | 
132 | college_scorecard <- read.csv("data-raw/college_scorecard_2013.csv",
133 |                               stringsAsFactors = FALSE)
134 | 
135 | 
136 | ps_names <- c("COMMUNITY COLLEGE 400", "DEF COMMUNITY COLLEGE", "D COMMUNITY COLLEGE",
137 |               "UVW COMMUNITY COLLEGE", "COMMUNITY COLLEGE C", "A COMMUNITY COLLEGE",
138 |               "COMMUNITY COLLEGE 4", "E COMMUNITY COLLEGE", "COMMUNITY COLLEGE 500",
139 |               "COMMUNITY COLLEGE B", "HKL COMMUNITY COLLEGE", "XYZ COMMUNITY COLLEGE",
140 |               "COMMUNITY COLLEGE A", "COMMUNITY COLLEGE 1", "B COMMUNITY COLLEGE",
141 |               "MNO COMMUNITY COLLEGE", "C COMMUNITY COLLEGE", "COMMUNITY COLLEGE 6",
142 |               "PRIVATE TECHNICAL INSTITUTE", "COMMUNITY COLLEGE 100", "COMMUNITY COLLEGE 7",
143 |               "COMMUNITY COLLEGE 2", "COMMUNITY COLLEGE 200", "GHI COMMUNITY COLLEGE",
144 |               "COMMUNITY COLLEGE 3", "PQR COMMUNITY COLLEGE", "COMMUNITY COLLEGE 5",
145 |               "ABC COMMUNITY COLLEGE", "COMMUNITY COLLEGE 300", "STU COMMUNITY COLLEGE",
146 |               "UNIVERSITY XYZ", "COLLEGE OF XYZ", "UNIVERSITY OF C", "ABC STATE UNIVERSITY",
147 |               "STATE UNIVERSITY - NORTH CAMPUS", "PRIVATE COLLEGE A", "UNIVERSITY A",
148 |               "UNIVERSITY - MAIN CAMPUS", "PUBLIC UNIVERSITY 1", "STATE UNIVERSITY - EAST CAMPUS",
149 |               "PRIVATE COLLEGE B", "UNIVERSITY OF B", "UNIVERSITY - CAMPUS 2",
150 |               "STATE UNIVERSITY - WEST CAMPUS", "AB COLLEGE", "UNIVERSITY OF GH",
151 |               "COLLEGE OF ABC", "COLLEGE OF DEFGH", "PUBLIC UNIVERSITY 2",
152 |               "STATE UNIVERSITY - SOUTH CAMPUS", "COMMUNITY COLLEGE 400",
153 |               "DEF COMMUNITY COLLEGE", "COLLEGE OF XYZ", "D COMMUNITY COLLEGE",
154 |               "UVW COMMUNITY COLLEGE", "UNIVERSITY XYZ", "A COMMUNITY COLLEGE",
155 |               "COMMUNITY COLLEGE 4", "E COMMUNITY COLLEGE", "COMMUNITY COLLEGE 500",
156 |               "ABC STATE UNIVERSITY", "COMMUNITY COLLEGE B", "STATE UNIVERSITY - NORTH CAMPUS",
157 |               "HKL COMMUNITY COLLEGE", "PRIVATE COLLEGE A", "UNIVERSITY A",
158 |               "UNIVERSITY - MAIN CAMPUS", "PUBLIC UNIVERSITY 1", "STATE UNIVERSITY - EAST CAMPUS",
159 |               "PRIVATE COLLEGE B", "UNIVERSITY OF B", "UNIVERSITY - CAMPUS 2",
160 |               "COMMUNITY COLLEGE 1", "STATE UNIVERSITY - WEST CAMPUS", "XYZ COMMUNITY COLLEGE",
161 |               "AB COLLEGE", "B COMMUNITY COLLEGE", "MNO COMMUNITY COLLEGE",
162 |               "UNIVERSITY OF GH", "C COMMUNITY COLLEGE", "COMMUNITY COLLEGE 6",
163 |               "COMMUNITY COLLEGE A", "COLLEGE OF ABC", "COLLEGE OF DEFGH",
164 |               "PUBLIC UNIVERSITY 2", "PRIVATE TECHNICAL INSTITUTE", "COMMUNITY COLLEGE 7",
165 |               "UNIVERSITY OF C", "COMMUNITY COLLEGE 2", "COMMUNITY COLLEGE 200",
166 |               "GHI COMMUNITY COLLEGE", "PQR COMMUNITY COLLEGE", "COMMUNITY COLLEGE C",
167 |               "COMMUNITY COLLEGE 5", "COMMUNITY COLLEGE 100", "ABC COMMUNITY COLLEGE",
168 |               "COMMUNITY COLLEGE 3", "STATE UNIVERSITY - SOUTH CAMPUS", "COMMUNITY COLLEGE 300",
169 |               "STU COMMUNITY COLLEGE")
170 | 
171 | # saveRDS(xwalk, "data/sdp_ceds_map.rds")
172 | devtools::use_data(ell, xwalk, ses, prog_baseline, age_grade, sch_names,
173 |                    college_scorecard,
174 |                    ps_names,
175 |                    internal = TRUE, overwrite = TRUE)
176 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | # Expand DF into grid, from SO
  2 | #
  3 | #' Expand dataframe into a complete grid
  4 | #'
  5 | #' @param ... dataframe
  6 | #' @details From SO {\url{http://stackoverflow.com/questions/11693599/alternative-to-expand-grid-for-data-frames}}
  7 | #' @return an expanded data frame
  8 | #' @export
  9 | expand_grid_df <- function(...){
 10 |   Reduce(function(...) merge(..., by=NULL), list(...))
 11 | }
 12 | 
 13 | ##' Function to calculate age from date of birth.
 14 | ##' @description his function calculates age in days, months, or years from a
 15 | ##' date of birth to another arbitrary date. This returns a numeric vector in
 16 | ##' the specified units.
 17 | ##' @param dob a vector of class \code{Date} representing the date of birth/start date
 18 | ##' @param enddate a vector of class Date representing the when the observation's
 19 | ##' age is of interest, defaults to current date.
 20 | ##' @param units character, which units of age should be calculated? allowed values are
 21 | ##' days, months, and years
 22 | ##' @param precise logical indicating whether or not to calculate with leap year
 23 | ##' and leap second precision
 24 | ##' @return A numeric vector of ages the same length as the dob vector
 25 | ##' @source This function was developed in part from this response on the R-Help mailing list.
 26 | ##' @seealso See also \code{\link{difftime}} which this function uses and mimics
 27 | ##' some functionality but at higher unit levels.
 28 | ##' @author Jason P. Becker
 29 | ##' @export
 30 | ##' @examples
 31 | ##' a <- as.Date(seq(as.POSIXct('1987-05-29 018:07:00'), len=26, by="21 day"))
 32 | ##' b <- as.Date(seq(as.POSIXct('2002-05-29 018:07:00'), len=26, by="21 day"))
 33 | ##'
 34 | ##' age <- age_calc(a, units='years')
 35 | ##' age
 36 | ##' age <- age_calc(a, units='months')
 37 | ##' age
 38 | ##' age <- age_calc(a, as.Date('2005-09-01'))
 39 | ##' age
 40 | age_calc <- function (dob, enddate = Sys.Date(),
 41 |                       units = "months", precise = TRUE){
 42 |   if (!inherits(dob, "Date") |
 43 |       !inherits(enddate, "Date")) {
 44 |     stop("Both dob and enddate must be Date class objects")
 45 |   }
 46 |   if (any(enddate < dob)) {
 47 |     stop("End date must be a date after date of birth")
 48 |   }
 49 |   start <- as.POSIXlt(dob)
 50 |   end <- as.POSIXlt(enddate)
 51 |   if (precise) {
 52 |     start_is_leap <-
 53 |       ifelse(start$year %% 400 == 0,
 54 |              TRUE,
 55 |              ifelse(
 56 |                start$year %% 100 ==
 57 |                  0,
 58 |                FALSE,
 59 |                ifelse(start$year %% 4 == 0, TRUE, FALSE)
 60 |              ))
 61 |     end_is_leap <-
 62 |       ifelse(end$year %% 400 == 0,
 63 |              TRUE,
 64 |              ifelse(end$year %% 100 ==
 65 |                       0, FALSE, ifelse(end$year %%
 66 |                                          4 == 0, TRUE, FALSE)))
 67 |   }
 68 |   if (units == "days") {
 69 |     result <- difftime(end, start, units = "days")
 70 |   }
 71 |   else if (units == "months") {
 72 |     months <- sapply(mapply(
 73 |       seq,
 74 |       as.POSIXct(start),
 75 |       as.POSIXct(end),
 76 |       by = "months",
 77 |       SIMPLIFY = FALSE
 78 |     ),
 79 |     length) - 1
 80 |     if (precise) {
 81 |       month_length_end <- ifelse(end$mon == 1 & end_is_leap,
 82 |                                  29,
 83 |                                  ifelse(end$mon == 1, 28, ifelse(end$mon %in%
 84 |                                                                    c(3, 5, 8, 10), 30, 31)))
 85 |       month_length_prior <- ifelse((end$mon - 1) == 1 &
 86 |                                      start_is_leap,
 87 |                                    29,
 88 |                                    ifelse((end$mon - 1) == 1,
 89 |                                           28, ifelse((end$mon - 1) %in% c(3, 5, 8, 10),
 90 |                                                      30, 31)))
 91 |       month_frac <- ifelse(
 92 |         end$mday > start$mday,
 93 |         (end$mday -
 94 |            start$mday) / month_length_end,
 95 |         ifelse(
 96 |           end$mday <
 97 |             start$mday,
 98 |           (month_length_prior - start$mday) / month_length_prior +
 99 |             end$mday /
100 |             month_length_end,
101 |           0
102 |         )
103 |       )
104 |       result <- months + month_frac
105 |     }
106 |     else {
107 |       result <- months
108 |     }
109 |   }
110 |   else if (units == "years") {
111 |     years <- sapply(mapply(
112 |       seq,
113 |       as.POSIXct(start),
114 |       as.POSIXct(end),
115 |       by = "years",
116 |       SIMPLIFY = FALSE
117 |     ),
118 |     length) - 1
119 |     if (precise) {
120 |       start_length <- ifelse(start_is_leap, 366, 365)
121 |       end_length <- ifelse(end_is_leap, 366, 365)
122 |       start_day <- ifelse(start_is_leap & start$yday >=
123 |                             60, start$yday - 1, start$yday)
124 |       end_day <- ifelse(end_is_leap & end$yday >= 60, end$yday -
125 |                           1, end$yday)
126 |       year_frac <- ifelse(
127 |         start_day < end_day,
128 |         (end_day -
129 |            start_day) / end_length,
130 |         ifelse(
131 |           start_day > end_day,
132 |           (start_length - start_day) /
133 |             start_length + end_day / end_length,
134 |           0
135 |         )
136 |       )
137 |       result <- years + year_frac
138 |     }
139 |     else {
140 |       result <- years
141 |     }
142 |   }
143 |   else {
144 |     stop("Unrecognized units. Please choose years, months, or days.")
145 |   }
146 |   return(result)
147 | }
148 | 
149 | 
150 | #' Generate positive random numbers that sum to a value
151 | #'
152 | #' @param N number of numbers to generate
153 | #' @param M constraint on the sum
154 | #' @param sd standard deviation of values
155 | #'
156 | #' @return a vector of numerics, length N
157 | #' @importFrom stats rlnorm
158 | #' @export
159 | #' @examples
160 | #' out <- rand_vect_cont(N = 10, M = 2, sd = 1)
161 | #' sum(out)
162 | #' length(out)
163 | rand_vect_cont <- function(N, M, sd = 1) {
164 |   vec <- rlnorm(N, M/N, sd)
165 |   vec / sum(vec) * M
166 | }
167 | 
168 | #' Convert auto-generated grades to better grades
169 | #'
170 | #' @param x a vector of grades formatted with g
171 | #'
172 | #' @return a character vector of grades
173 | convert_grade <- function(x){
174 |   x <- as.character(x)
175 |   x[x == "g-1"] <- "PK"
176 |   x[x == "g0"] <- "KG"
177 |   x[x %in% paste0("g", 1:12)] <- sapply(x[x %in% paste0("g", 1:12)],
178 |                                         function(x) gsub("g", "", x))
179 |   x <- as.character(unlist(x))
180 |   return(x)
181 | }
182 | 
183 | #' Generate simulated predictions from a linear model that account for
184 | #' model and parameter error
185 | #'
186 | #' @param object an object of class lm or glm
187 | #' @param nsim number of simulations per observation to generate
188 | #' @param newdata dataframe containing the observations to generate predictions
189 | #' for
190 | #' @param resid_error should the model residual error be added to predictions,
191 | #' default is FALSE
192 | #' @importFrom mvtnorm rmvnorm
193 | #'
194 | #' @return a matrix of predictions nrow(newdata) x nsim columns
195 | #' @export
196 | better_sim.lm <- function(object, nsim, newdata, resid_error = FALSE){
197 |   newdata <- as.matrix(newdata)
198 |   se <- vcov(object)
199 |   eff <- coef(object)
200 |   coefs <- mvtnorm::rmvnorm(nsim, mean = eff, sigma = se)
201 |   if(ncol(newdata) == ncol(coefs)-1){
202 |     warning("One too few variables in newdata, appending intercept to front")
203 |     newdata <- cbind("Intercept" = 1, newdata)
204 |   } else if(ncol(newdata) != ncol(coefs)){
205 |     stop("Wrong dimensions in newdata")
206 |   }
207 |   preds <- as.matrix(newdata) %*% t(coefs)
208 |   error <- rnorm(length(preds), mean = 0, sd = sigma(object))
209 |   if(resid_error){
210 |     preds <- preds + error
211 |   }
212 |   return(preds)
213 | }
214 | 
215 | 
216 | #' Covnert a matrix to a row-wise transition matrix
217 | #'
218 | #' @param matrix a matrix tof counts
219 | #'
220 | #' @return matrix M divided by the sum of its rows
221 | #' @export
222 | #'
223 | #' @examples
224 | #' base_mat <- structure(c(44985, 740, 781, 7640), .Dim = c(2L, 2L),
225 | #' .Dimnames = list(c("0", "1"), c("0", "1")))
226 | #' tm_convert(base_mat)
227 | tm_convert <- function(matrix){
228 |   stopifnot("matrix" %in% class(matrix))
229 |   out <- matrix / rowSums(matrix)
230 |   return(out)
231 | }
232 | 
233 | #' Unscale a scaled variable
234 | #'
235 | #' @param x numeric vector that has been scaled
236 | #' @param mean a numeric, the mean to add to x
237 | #' @param sd a numeric, the standardized factor to divide x by
238 | #'
239 | #' @return x rescaled with mean and sd specified by the user
240 | #' @export
241 | unscale <- function(x, mean, sd) {
242 |   y <- (x / sd) + mean
243 |   return(y)
244 | }
245 | 
246 | #' Reliabily rescale numerics with missingness
247 | #'
248 | #' @param x a numeric vector
249 | #'
250 | #' @return x mean centered and divided by it's standard deviation
251 | #' @details If \code{sd(x)} is undefined, this returns a zero
252 | #' @export
253 | rescale <- function(x){
254 |   y <- (x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE)
255 |   y[is.na(y)] <- 0
256 |   return(y)
257 | }
258 | 
259 | #' Recode numeric data into plausible credit data
260 | #'
261 | #' @param x numeric, vector of numerics to be truncated
262 | #' @param top integer, maximum allowable value
263 | #' @details enforces no negative numbers and truncates data at a user specified
264 | #' maximum
265 | #' @return a vector, length of x, with truncated values
266 | #' @export
267 | recode_credits <- function(x, top = 6){
268 |   y <- num_clip(x, min = 0, max = top)
269 |   return(y)
270 | }
271 | 
272 | 
273 | #' Rescaling variables in groups
274 | #'
275 | #' @param data a dataframe containing variables you wish to rescale
276 | #' @param var name of the variable to be rescaled
277 | #' @param group_var character vector of the grouping terms
278 | #' @param newvar optional character vector for the name of the new rescaled variable
279 | #' @return data with the newvar appended
280 | #' @export
281 | #' @importFrom lazyeval interp
282 | group_rescale <- function(data, var, group_var, newvar=NULL){
283 |   if(is.null(newvar)){
284 |     newvar <- var
285 |   }
286 |   varval <- lazyeval::interp(~ OpenSDPsynthR::rescale(z), z = as.name(var))
287 |   data <- data %>% group_by_(group_var) %>%
288 |     mutate_(.dots = setNames(list(varval), newvar))
289 |   return(data)
290 | }
291 | 
292 | #' Convert a character representation of grades to numeric
293 | #'
294 | #' @param grade a character vector of grades with character labels
295 | #'
296 | #' @return a numeric vector, length of grade, representing grade levels as numbers
297 | #' @export
298 | num_grade <- function(grade){
299 |   grade[grade == "g13"] <- "13"
300 |   grade[grade == "KG"] <- "0"
301 |   grade[grade == "PK"] <- "-1"
302 |   grade <- as.numeric(grade)
303 |   return(grade)
304 | }
305 | 
306 | #' Convert NA values to 0 in a vector
307 | #'
308 | #' @param x a numeric vector containing NAs
309 | #'
310 | #' @return a numeric vector where all NA values are 0
311 | #' @export
312 | zeroNA <- function(x){
313 |   x[is.na(x)] <- 0
314 |   return(x)
315 | }
316 | 
317 | #' Clip a vector to be between a minimum and a maximum
318 | #'
319 | #' @param x a numeric vector
320 | #' @param min numeric, a floor
321 | #' @param max numeric, a ceiling
322 | #'
323 | #' @return x, truncated to be between min and max
324 | #' @export
325 | num_clip <- function(x, min, max){
326 |   x <- ifelse(x > max, max, x)
327 |   x <- ifelse(x < min, min, x)
328 |   return(x)
329 | }
330 | 
331 | #' Validate probability list formatting and structure
332 | #'
333 | #' @param list list that is to be passed as a probability list to simulation functions
334 | #'
335 | #' @return Logical TRUE if list is valid, error if list is not valid.
336 | #' @export
337 | validate_probability_list <- function(list){
338 |   stopifnot(class(list) == "list")
339 |   if(!"GROUPVARS" %in% names(list)){
340 |     stop("List must contain a named element GROUPVARS that defines the variable
341 |          in the data to assign different probability functions, e.g. Sex or Race")
342 |   }
343 |   if(!class(list$GROUPVARS) %in% c("character", "factor")){
344 |     stop("List element GROUPVARS must be a character or factor which can be
345 |          coerced to a character which defines the levels of a grouping term.")
346 |   }
347 |   if(!all(unlist(lapply(gifted_list[-which("GROUPVARS" == names(gifted_list))], class)) == "list")){
348 |     stop("List elements not GROUPVARS must each be a list.")
349 |   }
350 |   if(!all(unlist(
351 |     lapply(
352 |       lapply(
353 |         gifted_list[-which("GROUPVARS" == names(gifted_list))], names),
354 |       function(x) x == c("f", "pars"))))){
355 |     stop("All list elements except GROUPVARS must contain exactly two elements,
356 |          'f' and 'pars'.")
357 |   }
358 |   message("List validated.")
359 |   return(TRUE)
360 | }
361 | 
362 | 
363 | 
364 | #' Validate outcome simulation list formatting and structure
365 | #'
366 | #' @param list list that is to be passed as a parameter list to outcome simulation functions
367 | #'
368 | #' @return Logical TRUE if list is valid, error if list is not valid.
369 | #' @export
370 | validate_sim_parameter <- function(list){
371 |   stopifnot(class(list) == "list")
372 |   if(length(names(list)) > 9){
373 |     if(any(!c("fixed", "random", "random3", "cor_vars", "fixed_param", "fact_vars",
374 |               "random_param", "random_param3", "cov_param", "unbalCont", "unbalCont3",
375 |               "unbal", "k", "n", "p", "error_var", "with_err_gen", "lvl1_err_params",
376 |               "data_str") %in% names(list))){
377 |       stop("A three level sim list must have named elements: fixed, random,
378 |             random3, cor_vars, fixed_param, fact_vars, random_param, random_param3,
379 |             cov_param, unbalCont, unbalCont3, unbal, k, n, p, error_var,
380 |             with_err_gen, lvl1_err_params, and data_str")
381 |     }
382 |   } else{
383 |     if(any(!c("fixed", "random_var", "cov_param", "cor_vars",
384 |               "fixed_param", "ngrps", "unbalanceRange") %in% names(list))){
385 |       stop("List must have named elements 'fixed', 'random_var', 'cov_param',
386 |          'cor_vars', 'fixed_param', 'ngrps', and 'unbalanceRange'")
387 |     }
388 |   }
389 |   if(class(list$fixed) != "formula"){
390 |     stop("Fixed must be a formula object defining the fixed component of the
391 |          outcome formula.")
392 |   }
393 |   if(attr(terms(list$fixed), "response") > 0){
394 |     stop("Formula must not have a response or LHS variable defined.")
395 |   }
396 |   K <- length(attr(terms(list$fixed), "term.labels"))
397 |   if(length(list$random_var > 0)){
398 |     if(class(list$random_var) != "numeric"){
399 |       stop("Random variance must be expressed as a numeric value. Consider a value
400 |          in the range of 0.01 to 0.2")
401 |     }
402 |   }
403 |    if(length(list$random_var) > 1){
404 |     stop("Random variance element of the list can only be length 1.")
405 |   }
406 |   if(class(list$cov_param) != "list"){
407 |     stop("cov_param element must be a list that defines the parameters of the
408 |          covariate simulation.")
409 |   }
410 |   if(!all(c("dist_fun", "var_type", "opts") %in% names(list$cov_param))){
411 |     stop("cov_param list must have three elements, dist_fun, var_type, and opts")
412 |   }
413 |   if("time" %in% attr(terms(list$fixed), "term.labels")){
414 |     if(length(list$cov_param$dist_fun) != K-1){
415 |       stop("cov_param$dist_fun must have one fewer element than terms in list$fixed")
416 |     }
417 |     K_star <- K-1 # adjust for time variable being dropped out in simglm
418 |     if(length(list$cor_vars) != (K_star^2 - K_star)/2){
419 |       cor_elements <- (K_star^2 - K_star) /2
420 |       stop("Correlation vector cor_vars must specify ", cor_elements, " correlation
421 |          matrix elements.")
422 |     }
423 |   } else{
424 |     if(length(list$cov_param$dist_fun) != K){
425 |       stop("cov_param$dist_fun must have the same length as the terms in list$fixed")
426 |     }
427 |     if(length(list$cor_vars) != (K^2 - K)/2){
428 |       cor_elements <- (K^2 - K) /2
429 |       stop("Correlation vector cor_vars must specify ", cor_elements, " correlation
430 |          matrix elements.")
431 |     }
432 |   }
433 |   if(length(list$fixed_param) != K+1){
434 |     stop("List element fixed_param must define beta coefficient values for ", K+1,
435 |          " parameters specified in the fixed formula and one intercept.")
436 |   }
437 |   if(class(list$fixed_param) != "numeric"){
438 |     stop("Please define fixed parameters as numeric.")
439 |   }
440 |   # For two level simulation this is the specification
441 |   if("ngrps" %in% names(list)){
442 |     if(length(list$ngrps) != 1){
443 |       stop("Element ngrps must be length 1 numeric or integer")
444 |     }
445 |     if(!class(list$ngrps) %in% c("numeric", "integer")){
446 |       stop("Element ngrps must be an integer or numeric")
447 |     }
448 |   }
449 |   if("unbalanceRange" %in% names(list)){
450 |     if(length(list$unbalanceRange) != 2){
451 |       stop("Element unbalanceRange must be length 2, numeric or integer")
452 |     }
453 |     if(!class(list$unbalanceRange) %in% c("numeric", "integer")){
454 |       stop("unbalanceRange must be numeric or integer")
455 |     }
456 |   }
457 |   message("Simulation parameters validated.")
458 |   return(TRUE)
459 | }
460 | 
461 | 
462 | #' Simple error assertions
463 | #'
464 | #' @param expr logical expression
465 | #' @param error character, error message
466 | #'
467 | #' @return An error message, or nothing
468 | assert <- function (expr, error) {
469 |   if (! expr) stop(error, call. = FALSE)
470 | }
471 | 
472 | 
473 | #' Sum matrix elements in a list
474 | #'
475 | #' @param l a list
476 | #'
477 | #' @return a list of summed matrix elements
478 | #' @importFrom magrittr %>%
479 | m_sum <- function(l) {
480 |   Reduce(l, `+`) %>% list()
481 | }
482 | 
483 | #' Tidy a two-state markov sequence for output
484 | #'
485 | #' @param seq a vector of sequence elements, with only two states
486 | #' @param states a vector of length two naming both possible states
487 | #'
488 | #' @return a data.frame
489 | #' @importFrom markovchain createSequenceMatrix
490 | #' @export
491 | #' @examples
492 | #' tidy_sequence(seq = c("Yes", "No", "No", "No", "Yes", "Yes"),
493 | #'              states = c("Yes", "No"))
494 | tidy_sequence <- function(seq, states){
495 |   stopifnot(length(states) == 2)
496 |   tmp <- markovchain::createSequenceMatrix(seq, possibleStates = states)
497 |   newNames <- outer(dimnames(tmp)[[1]], dimnames(tmp)[[2]], paste, sep = "-")
498 |   dim(newNames) <- NULL
499 |   dim(tmp) <- c(1, 4)
500 |   tmp <- as.data.frame(tmp)
501 |   colnames(tmp) <- newNames
502 |   return(tmp)
503 | }
504 | 
505 | #' Compute conditional Markov sequences in a pipeline
506 | #'
507 | #' @param groupname variable that identifies the list element to use
508 | #' @param n number of elements to generate, usually defined by \link[dplyr]{n} in dplyr
509 | #' @param lst probability list that contains function and function parameters
510 | #' @param ... additional arguments passed into the function
511 | #' @return A sequence generated by functions in the probability list
512 | #' @export
513 | markov_cond_list <- function(groupname, n, lst, ...){
514 |   # TODO: add checks to lst
515 |   do.call(lst[[groupname]]$f,
516 |           c(list(n = n), lst[[groupname]]$pars, ...),
517 |           quote = FALSE)
518 | }
519 | 
520 | 


--------------------------------------------------------------------------------
/inst/CG_Analyze_Example.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "SDP College Going Analyze Example"
  3 | author: "Jared E. Knowles"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{CG Analyze}
  8 |   %\VignetteEncoding{UTF-8}
  9 |   %\VignetteEngine{knitr::rmarkdown}
 10 | editor_options: 
 11 |   chunk_output_type: console
 12 | ---
 13 | 
 14 | ```{r, echo = FALSE}
 15 | knitr::opts_chunk$set(
 16 |   collapse = TRUE,
 17 |   comment = "#>",
 18 |   fig.path = "tools/figs/README-",
 19 |   message = FALSE,
 20 |   warning = FALSE,
 21 |   fig.width =8, fig.height = 6.25
 22 | )
 23 | ```
 24 | 
 25 | ## Open SDP Data
 26 | 
 27 | First, we need to generate data suitable to conducting a college-going analysis. 
 28 | 
 29 | ```{r echo=FALSE, warning=FALSE, message=FALSE}
 30 | library(OpenSDPsynthR)
 31 | library(ggplot2)
 32 | library(tidyr)
 33 | library(scales)
 34 | library(magrittr)
 35 | 
 36 | 
 37 | simouts <- simpop(nstu = 40000, seed = 8763434, 
 38 |                   control = sim_control(nschls = 12L, minyear = 1996,
 39 |                                         n_postsec = 50L,
 40 |                                         n_cohorts = 3,
 41 |                                         maxyear = 2017)) 
 42 | # Check hs_outcomes and g12_cohort
 43 | 
 44 | # 
 45 | # // Ninth grade cohorts you can observe persisting to the second year of college
 46 | # global chrt_ninth_begin_persist_yr2 = 2008
 47 | # global chrt_ninth_end_persist_yr2 = 2010
 48 | #  
 49 | # // Ninth grade cohorts you can observe graduating high school on time
 50 | # global chrt_ninth_begin_grad = 2013
 51 | # global chrt_ninth_end_grad = 2015
 52 | #  
 53 | # // Ninth grade cohorts you can observe graduating high school one year late
 54 | # global chrt_ninth_begin_grad_late = 2012
 55 | # global chrt_ninth_end_grad_late = 2014
 56 | #  
 57 | # // High school graduation cohorts you can observe enrolling in college the fall after graduation
 58 | # global chrt_grad_begin = 2009
 59 | # global chrt_grad_end = 2011
 60 | #  
 61 | # // High school graduation cohorts you can observe enrolling in college two years after hs graduation
 62 | # global chrt_grad_begin_delayed = 2008
 63 | # global chrt_grad_end_delayed = 2010
 64 | 
 65 | ```
 66 | 
 67 | 
 68 | ## Attainment
 69 | 
 70 | ### Overall Progression
 71 | 
 72 | ```{r}
 73 | plotdf <- filter(cg_data, chrt_ninth >= 2004 &
 74 |                    chrt_ninth <= 2008) %>% 
 75 |   filter(!is.na(ontime))
 76 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0)
 77 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0)
 78 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 &
 79 |                                              plotdf$enroll_any_yr2 == 1)
 80 | 
 81 | agencyData <- plotdf %>%
 82 |   summarize(grad = mean(grad),
 83 |             seamless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE),
 84 |             second_year_persisters = mean(second_year_persisters, na.rm=TRUE),
 85 |             N = n())
 86 | agencyData$school_name <- "AGENCY AVERAGE"
 87 | # // 2. Calculate the mean of each outcome variable by first high school attended
 88 | schoolData <- plotdf %>% group_by(first_hs_code) %>%
 89 |   summarize(grad = mean(grad),
 90 |             seamless_transitioners_any = mean(seamless_transitioners_any,
 91 |                                               na.rm=TRUE),
 92 |             second_year_persisters = mean(second_year_persisters, na.rm=TRUE),
 93 |             N = n())
 94 | ## high school attended
 95 | names(schoolData)[1] <- "school_name"
 96 | # // 3. Identify the agency maximum values for each of the three outcome variables
 97 | maxSchool <- schoolData %>% summarize_all(.funs = funs("max"))
 98 | maxSchool$school_name <- "AGENCY MAX HS"
 99 | # // 4. Identify the agency minimum values for each of the three outcome variables
100 | minSchool <- schoolData %>% summarize_all(.funs = funs("min"))
101 | minSchool$school_name <- "AGENCY MIN HS"
102 | # // 5. Append the three tempfiles to the school-level file loaded into R
103 | schoolData <- bind_rows(schoolData, agencyData,
104 | minSchool, maxSchool)
105 | rm(agencyData, minSchool, maxSchool)
106 | # // Step 6: Prepare to graph the results
107 | library(tidyr)
108 | schoolData$cohort <- 1
109 | schoolData <- schoolData %>% gather(key = outcome,
110 |                                     value = measure, -N, -school_name)
111 | schoolData$subset <- grepl("AGENCY", schoolData$school_name)
112 | library(ggplot2)
113 | library(scales)
114 | schoolData$outcome[schoolData$outcome == "cohort"] <- "Ninth Graders"
115 | schoolData$outcome[schoolData$outcome == "grad"] <- "On-time Graduates"
116 | schoolData$outcome[schoolData$outcome == "seamless_transitioners_any"] <-
117 |   "Seamless College Transitioner"
118 | schoolData$outcome[schoolData$outcome == "second_year_persisters"] <-
119 |   "Second Year Persisters"
120 | 
121 | ```
122 | 
123 | ```{r}
124 | ggplot(schoolData[schoolData$subset,],
125 | aes(x = outcome, y = measure, group = school_name,
126 | color = school_name, linetype = school_name)) +
127 | geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) +
128 | geom_text(aes(label = round(measure * 100, 1)), vjust = -0.8, hjust = -0.25,
129 | color = I("black")) +
130 | scale_y_continuous(limits = c(0, 1), label = percent) +
131 | theme_bw() + theme(legend.position = c(0.825, 0.825)) +
132 | guides(color = guide_legend("", keywidth = 6,
133 | label.theme = element_text(face = "bold",
134 | size = 8,
135 | angle = 0)),
136 | linetype = "none") +
137 | labs(y = "Percent of Ninth Graders",
138 | title = "Student Progression from 9th Grade Through College",
139 | subtitle = "Agency Average", x = "",
140 | caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n",
141 | "Postsecondary enrollment outcomes from NSC matched records. \n",
142 | "All other data from Agency administrative records."))
143 | ```
144 | 
145 | ### Progression by Race and Ethnicity
146 | 
147 | ```{r}
148 | plotdf <- filter(cg_data, chrt_ninth >= 2004 &
149 |                    chrt_ninth <= 2008)%>% 
150 |   filter(!is.na(ontime))
151 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0)
152 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0)
153 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 &
154 |                                              plotdf$enroll_any_yr2 == 1)
155 | 
156 | # // Step 3: Create agency-level average outcomes
157 | progressRace <- plotdf %>% group_by(race_ethnicity) %>%
158 |   summarize(grad = mean(grad),
159 |             seamless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE),
160 |             second_year_persisters = mean(second_year_persisters, na.rm=TRUE), N = n())
161 | # // Step 4: Reformat the data for plotting
162 | progressRace$cohort <- 1
163 | progressRace <- progressRace %>% gather(key = outcome,
164 |                                         value = measure, -N, -race_ethnicity)
165 | # // Step 5: Recode variables for plot-friendly labels
166 | progressRace$outcome[progressRace$outcome == "cohort"] <- "Ninth Graders"
167 | progressRace$outcome[progressRace$outcome == "grad"] <- "On-time Graduates"
168 | progressRace$outcome[progressRace$outcome == "seameless_transitioners_any"] <-
169 |   "Seamless College Transitioner"
170 | progressRace$outcome[progressRace$outcome == "second_year_persisters"] <-
171 |   "Second Year Persisters"
172 | progressRace$subset <- ifelse(progressRace$race_ethnicity %in% 
173 |               c("Black or African American", "White", "Asian", "Hispanic or Latino Ethnicity"),
174 |                               TRUE, FALSE)
175 | 
176 | ```
177 | 
178 | ```{r}
179 | ggplot(progressRace[progressRace$subset,],
180 |        aes(x = outcome, y = measure, group = race_ethnicity,
181 |            color = race_ethnicity, linetype = race_ethnicity)) +
182 |   geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) +
183 |   geom_text(aes(label = round(measure * 100, 1)), vjust = -0.8, 
184 |             hjust = -0.25, color = I("black")) +
185 | scale_y_continuous(limits = c(0, 1), label = percent) +
186 | theme_bw() + theme(legend.position = c(0.825, 0.825)) +
187 | guides(color = guide_legend("", keywidth = 6,
188 |                             label.theme =element_text(face = "bold", size = 8,
189 |                                                       angle = 0)), linetype = "none") +
190 | labs(y = "Percent of Ninth Graders",
191 | title = "Student Progression from 9th Grade Through College",
192 | subtitle = "By Student Race/Ethnicity", x = "",
193 | caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n",
194 | "Postsecondary enrollment outcomes from NSC matched records. \n",
195 | "All other data from Agency administrative records."))
196 | ```
197 | 
198 | ### Progression by Race/Ethnicity by FRL
199 | 
200 | 
201 | ```{r}
202 | plotdf <- filter(cg_data, chrt_ninth >= 2004 &
203 |                    chrt_ninth <= 2008)%>% 
204 |   filter(!is.na(ontime))
205 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0)
206 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0)
207 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 &
208 |                                              plotdf$enroll_any_yr2 == 1)
209 | plotdf <- plotdf %>% filter(frpl_ever_hs == 0)
210 | 
211 | progressRaceFRL <- plotdf %>% group_by(race_ethnicity) %>%
212 |   summarize(grad = mean(grad),
213 |             seameless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE),
214 |             second_year_persisters = mean(second_year_persisters, na.rm=TRUE),
215 | N = n())
216 | # // Step 5: Reformat the data file so that one variable contains all the
217 | # outcomes of interest
218 | progressRaceFRL %<>% filter(N >= 20)
219 | # // Step 6: Prepare to graph the results
220 | ## Reshape the data
221 | progressRaceFRL$cohort <- 1
222 | progressRaceFRL <- progressRaceFRL %>% 
223 |   gather(key = outcome,value = measure, -N, -race_ethnicity)
224 | 
225 | ## Recode the variables for plot friendly labels
226 | # // Step 5: Recode variables for plot-friendly labels
227 | progressRaceFRL$outcome[progressRaceFRL$outcome == "cohort"] <- "Ninth Graders"
228 | progressRaceFRL$outcome[progressRaceFRL$outcome == "grad"] <- "On-time Graduates"
229 | progressRaceFRL$outcome[progressRaceFRL$outcome == "seameless_transitioners_any"] <-
230 |   "Seamless College Transitioner"
231 | progressRaceFRL$outcome[progressRaceFRL$outcome == "second_year_persisters"] <-
232 |   "Second Year Persisters"
233 | progressRaceFRL$subset <- ifelse(progressRaceFRL$race_ethnicity %in% 
234 |               c("Black or African American", "White", "Asian", "Hispanic or Latino Ethnicity"),
235 |                               TRUE, FALSE)
236 | ```
237 | 
238 | 
239 | 
240 | ```{r}
241 | ggplot(
242 |   progressRaceFRL[progressRaceFRL$subset, ],
243 |   aes(
244 |   x = outcome,
245 |   y = measure,
246 |   group = race_ethnicity,
247 |   color = race_ethnicity,
248 |   linetype = race_ethnicity
249 |   )
250 |   ) +
251 |   geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) +
252 |   geom_text(
253 |   aes(label = round(measure * 100, 1)),
254 |   vjust = -0.8,
255 |   hjust = -0.25,
256 |   color = I("black")
257 |   ) +
258 |   scale_y_continuous(limits = c(0, 1), label = percent) +
259 |   theme_bw() + theme(legend.position = c(0.825, 0.825)) +
260 |   guides(color = guide_legend(
261 |   "",
262 |   keywidth = 6,
263 |   label.theme = element_text(face = "bold", size = 8, angle = 0)
264 |   ),
265 |   linetype = "none") +
266 |   labs(
267 |   y = "Percent of Ninth Graders",
268 |   title = "Student Progression from 9th Grade Through College",
269 |   subtitle = paste0(
270 |   c(
271 |   "Among Students Qualifying for Free or Reduced Price Lunch \n",
272 |   "By Student Race/Ethnicity"
273 |   )
274 |   ),
275 |   x = "",
276 |   caption = paste0(
277 |   "Sample: 2004-2005 Agency first-time ninth graders. \n",
278 |   "Postsecondary enrollment outcomes from NSC matched records.\n",
279 |   "All other data from Agency administrative records."
280 |   )
281 |   )
282 | ```
283 | 
284 | ### Progression by On-Track Status
285 | 
286 | ```{r}
287 | plotdf <- filter(cg_data, chrt_ninth >= 2004 &
288 |                    chrt_ninth <= 2008)
289 | 
290 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0)
291 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0)
292 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 &
293 |                                              plotdf$enroll_any_yr2 == 1)
294 | 
295 | # // Step 3: Generate on track indicators that take into account students’ GPAs
296 | # upon completion of their first year in high school
297 | plotdf$ot <- NA
298 | plotdf$ot[plotdf$ontrack_yr1 == 0] <- "Off-Track to Graduate"
299 | # Check for correctness
300 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 < 3 &
301 |             !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA < 3.0"
302 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 >= 3 &
303 |             !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA >= 3.0"
304 | # // Step 4: Calculate aggregates for the Agency by on track status
305 | progressTrack <- plotdf %>% group_by(ot) %>%
306 |   summarize(grad = mean(grad),
307 |     seamless_transitioners_any = mean(seamless_transitioners_any, na.rm=TRUE),
308 |     second_year_persisters = mean(second_year_persisters, na.rm=TRUE),
309 |     N = n())
310 | 
311 | # of interest
312 | progressTrack$cohort <- 1
313 | progressTrack <- progressTrack %>% gather(key = outcome,
314 | value = measure, -N, -ot)
315 | progressTrack$outcome[progressTrack$outcome == "cohort"] <- "Ninth Graders"
316 | progressTrack$outcome[progressTrack$outcome == "grad"] <- "On-time Graduates"
317 | progressTrack$outcome[progressTrack$outcome == "seamless_transitioners_any"] <-
318 | "Seamless College Transitioner"
319 | progressTrack$outcome[progressTrack$outcome == "second_year_persisters"] <-
320 | "Second Year Persisters"
321 | 
322 | ```
323 | 
324 | ```{r}
325 | ann_txt <- data.frame(outcome = rep("Second Year Persisters", 3),
326 | measure = c(0.22, 0.55, 0.85),
327 | textlabel = c("Off-Track \nto Graduate",
328 | "On-Track to Graduate,\n GPA < 3.0",
329 | "On-Track to Graduate,\n GPA >= 3.0"))
330 | ann_txt$ot <- ann_txt$textlabel
331 | ggplot(progressTrack,
332 | aes(x = outcome, y = measure, group = ot,
333 | color = ot, linetype = ot)) +
334 | geom_line(size = 1.1) + geom_point(aes(group = 1), color = I("black")) +
335 | geom_text(aes(label = round(measure * 100, 1)), vjust = -0.8, hjust = -0.25,
336 | color = I("black")) +
337 | geom_text(data = ann_txt, aes(label = textlabel)) +
338 | scale_y_continuous(limits = c(0, 1), label = percent) +
339 | theme_bw() + theme(legend.position = c(0.825, 0.825)) +
340 | scale_color_brewer(type = "qual", palette = 2) +
341 | guides(color = "none",
342 | linetype = "none") +
343 | labs(y = "Percent of Ninth Graders",
344 | title = "Student Progression from 9th Grade Through College",
345 | subtitle = "By Course Credits and GPA after First High School Year", x = "",
346 | caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n",
347 | "Postsecondary enrollment outcomes from NSC matched records. \n",
348 | "All other data from Agency administrative records."))
349 | ```
350 | 
351 | ## Ninth to Tenth Grade Transition by On-Track Status
352 | 
353 | 
354 | ### Proportion of Students On-Track by High School
355 | 
356 | ```{r}
357 | plotdf <- filter(cg_data, chrt_ninth >= 2005 &
358 |                    chrt_ninth <= 2008) %>% 
359 |   filter(!is.na(ontime))
360 | 
361 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0)
362 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0)
363 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 &
364 |                                              plotdf$enroll_any_yr2 == 1)
365 | 
366 | plotdf$ot <- NA
367 | plotdf$ot[plotdf$ontrack_yr1 == 0] <- "Off-Track to Graduate"
368 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 < 3 &
369 |             !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA < 3.0"
370 | plotdf$ot[plotdf$ontrack_yr1 == 1 & plotdf$cum_gpa_yr1 >= 3 &
371 |             !is.na(plotdf$cum_gpa_yr1)] <- "On-Track to Graduate, GPA >= 3.0"
372 | 
373 | progressBars <- bind_rows(
374 |   plotdf %>% group_by(ot) %>% tally() %>% ungroup %>%
375 |     mutate(count = sum(n), first_hs_code = "Agency Average"),
376 |   plotdf %>% group_by(first_hs_code, ot) %>% tally() %>% ungroup %>%
377 |     group_by(first_hs_code) %>%
378 |     mutate(count = sum(n))
379 | )
380 | 
381 | # replace first_hs_name = subinstr(first_hs_name, " High School", "", .)
382 | # progressBars$first_hs_name <- gsub(" High School", "", progressBars$first_hs_name)
383 | # // Step 5: For students who are off-track upon completion of their first year
384 | # of high school, convert the values to be negative for ease of
385 | # visualization in the graph
386 | progressBars$n[progressBars$ot == "Off-Track to Graduate"] <- 
387 |   -progressBars$n[progressBars$ot == "Off-Track to Graduate"]
388 | 
389 | ```
390 | 
391 | ```{r}
392 | # // Step 6: Plot
393 | ggplot(progressBars, aes(x = reorder(first_hs_code, n/count),
394 |                          y = n/count, group = ot)) +
395 |   geom_bar(aes(fill = ot), stat = 'identity', color = I("black")) +
396 |   geom_text(aes(label = round(100* n/count, 0)),
397 |             position = position_stack(vjust=0.3)) +
398 |   theme_bw() +
399 |   scale_y_continuous(limits = c(-0.8,1), label = percent,
400 |                      name = "Percent of Ninth Graders",
401 |                      breaks = seq(-0.8, 1, 0.2)) +
402 |   scale_fill_brewer(name = "", type = "qual", palette = 6) +
403 |   theme(axis.text.x = element_text(angle = 30, color = "black", vjust = 0.5),
404 |         legend.position = c(0.15, 0.875)) +
405 |   labs(title = "Proportion of Students On-Track to Graduate by School",
406 |        subtitle = "End of Ninth Grade On-Track Status \n By High School", x = "",
407 |        caption = paste0("Sample: 2004-2005 and 2005-20065 Agency first-time ninth
408 | graders. \n",
409 | "Postsecondary enrollment outcomes from NSC matched records. \n",
410 | "All other data from Agency administrative records."))
411 | ```
412 | 
413 | 
414 | ### Ninth to Tenth Grade Transition by On-Track Status
415 | 
416 | 
417 | ## High School Graduation
418 | 
419 | ### High School Completion Rates by School
420 | 
421 | ```{r}
422 | plotdf <- filter(cg_data, chrt_ninth >= 2005 &
423 |                    chrt_ninth <= 2008) %>% 
424 |   filter(!is.na(ontime))
425 | 
426 | plotdf$grad <- ifelse(!is.na(plotdf$chrt_grad) & plotdf$ontime ==1, 1, 0)
427 | plotdf$seamless_transitioners_any <- ifelse(plotdf$ps == 1, 1, 0)
428 | plotdf$second_year_persisters <- as.numeric(plotdf$ps == 1 &
429 |                                              plotdf$enroll_any_yr2 == 1)
430 | 
431 | schoolLevel <- bind_rows(
432 |   plotdf %>% group_by(first_hs_code) %>%
433 |     summarize(ontime_grad = mean(ontime, na.rm=TRUE),
434 |               late_grad = mean(late, na.rm=TRUE),
435 |               count = n()),
436 |   plotdf %>% ungroup %>%
437 |     summarize(first_hs_code = "Agency AVERAGE",
438 |               ontime_grad = mean(ontime, na.rm=TRUE),
439 |               late_grad = mean(late, na.rm=TRUE),
440 |               count = n())
441 | )
442 | # // Step 3: Reshape the data wide
443 | schoolLevel <- schoolLevel %>% gather(key = outcome,
444 |                                       value = measure, -count, -first_hs_code)
445 | schoolLevel$outcome[schoolLevel$outcome == "ontime_grad"] <- "On-Time HS Graduate"
446 | schoolLevel$outcome[schoolLevel$outcome == "late_grad"] <- "Graduate in 4+ Years"
447 | ```
448 | 
449 | ```{r}
450 | ggplot(schoolLevel, aes(x = reorder(first_hs_code, measure), y = measure,
451 | group = first_hs_code, fill = outcome)) +
452 |   geom_bar(aes(fill = outcome), stat = 'identity', color = I("black")) +
453 |   geom_text(aes(label = round(100 * measure, 0)),
454 |             position = position_stack(vjust = 0.8)) +
455 |   theme_bw() + theme(panel.grid = element_blank(), axis.ticks.x = element_blank()) +
456 |   scale_y_continuous(limits = c(0, 1), label = percent, 
457 |                      name = "Percent of Ninth Graders") + 
458 |   scale_fill_brewer(name = "", type = "qual", palette = 7) +
459 |   theme(axis.text.x = element_text(color = "black", angle = 30, vjust = 0.5),
460 |         legend.position = c(0.15, 0.825)) +
461 |   labs(title = "High School Graduation Rates by High School",
462 |        x = "",
463 |        caption = paste0("Sample: 2004-2005 Agency first-time ninth graders. \n",
464 |                         "Data from Agency administrative records."))
465 | 
466 | ```
467 | 
468 | 
469 | ### High School Completion Rates by Average 8th Grade Achievement
470 | 
471 | 
472 | ```{r}
473 | plotdf <- filter(cg_data, chrt_ninth >= 2005 &
474 |                    chrt_ninth <= 2008) %>% 
475 |   filter(!is.na(ontime)) %>% filter(!is.na(test_math_8_std))
476 | 
477 | 
478 | schoolLevel <- bind_rows(
479 | plotdf %>% group_by(first_hs_code) %>%
480 | summarize(ontime_grad = mean(ontime, na.rm=TRUE),
481 | std_score = mean(test_math_8_std, na.rm=TRUE),
482 | count = n()),
483 | plotdf %>% ungroup %>%
484 | summarize(first_hs_code = "Agency AVERAGE",
485 | ontime_grad = mean(ontime, na.rm=TRUE),
486 | std_score = mean(test_math_8_std, na.rm=TRUE),
487 | count = n())
488 | )
489 | 
490 | 
491 | 
492 | ```
493 | 
494 | ```{r}
495 | ggplot(schoolLevel[schoolLevel$first_hs_code != "Agency AVERAGE", ],
496 |        aes(x = std_score, y = ontime_grad)) +
497 |   geom_vline(xintercept = as.numeric(schoolLevel[schoolLevel$first_hs_code ==
498 |                                                    "Agency AVERAGE", "std_score"]),
499 |              linetype = 4, color = I("goldenrod"), size = 1.1) +
500 |   geom_hline(yintercept = as.numeric(schoolLevel[schoolLevel$first_hs_code ==
501 |                                                    "Agency AVERAGE", "ontime_grad"]),
502 |              linetype = 4, color = I("purple"), size = 1.1) +
503 |   geom_point(size = I(2)) +
504 |   theme_bw() + theme(panel.grid = element_blank()) +coord_cartesian() +
505 |   annotate(geom = "text", x = -.85, y = 0.025,
506 |            label = "Below average math scores & \n below average graduation rates",
507 |            size = I(2.5)) +
508 |   annotate(geom = "text", x = .85, y = 0.025,
509 |            label = "Above average math scores & \n below average graduation rates",
510 |            size = I(2.5)) +
511 |   annotate(geom = "text", x = .85, y = 0.975,
512 |            label = "Above average math scores & \n above average graduation rates",
513 |            size = I(2.5)) +
514 |   annotate(geom = "text", x = -.85, y = 0.975,
515 |            label = "Below average math scores & \n above average graduation rates",
516 |            size = I(2.5)) +
517 |   annotate(geom = "text", x = .205, y = 0.025,
518 |            label = "Agency Average \n Test Score",
519 |            size = I(2.5), color = I("goldenrod")) +
520 |   annotate(geom = "text", x = .85, y = 0.61,
521 |            label = "Agency Average Graduation Rate",
522 |            size = I(2.5)) +
523 |   scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.2)) +
524 |   scale_y_continuous(limits = c(0, 1), label = percent,
525 |                      name = "Percent of Ninth Graders", breaks = seq(0, 1, 0.1)) +
526 |   geom_text(aes(label = first_hs_code), nudge_y = 0.065, vjust = "top", size = I(4),
527 |             nudge_x = 0.01) +
528 |   labs(title = "High School Graduation Rates by High School",
529 |        x = "Average 8th Grade Math Standardized Score",
530 |        subtitle = "By Student Achievement Profile Upon High School Entry",
531 |        caption = paste0("Sample: 2004-2005 through 2005-2006 Agency first-time ",
532 | "ninth graders with eighth grade math test scores. \n",
533 | "Data from Agency administrative records."))
534 | ```
535 | 
536 | 


--------------------------------------------------------------------------------