├── .github
│   ├── .gitignore
│   └── workflows
│       ├── pkgdown.yaml
│       ├── check-standard.yaml
│       └── test-coverage.yaml
├── src
│   ├── Makevars
│   ├── .gitignore
│   ├── RcppExports.cpp
│   └── compression.cpp
├── vignettes
│   ├── .gitignore
│   ├── faq.Rmd
│   └── plotting.Rmd
├── LICENSE
├── data
│   ├── school_ses.rda
│   ├── schools00.rda
│   └── schools05.rda
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test_mutual_total_nested.R
│       ├── test_exposure_isolation.R
│       ├── test_entropy.R
│       ├── test_matrix_to_long.R
│       ├── test_dissimilarity.R
│       ├── test_plots.R
│       ├── test_mutual_local.R
│       ├── test_mutual_local_expected.R
│       ├── test_mutual_total_expected.R
│       ├── test_mutual_within.R
│       ├── test_ipf.R
│       ├── test_compression.R
│       └── test_mutual_total.R
├── man
│   ├── figures
│   │   └── README-segplot-1.png
│   ├── scree_plot.Rd
│   ├── school_ses.Rd
│   ├── isolation.Rd
│   ├── segregation.Rd
│   ├── exposure.Rd
│   ├── schools00.Rd
│   ├── schools05.Rd
│   ├── segcurve.Rd
│   ├── merge_units.Rd
│   ├── get_crosswalk.Rd
│   ├── entropy.Rd
│   ├── matrix_to_long.Rd
│   ├── mutual_total_nested.Rd
│   ├── compress.Rd
│   ├── dissimilarity_expected.Rd
│   ├── segplot.Rd
│   ├── mutual_local_expected.Rd
│   ├── dissimilarity.Rd
│   ├── mutual_total_expected.Rd
│   ├── mutual_local.Rd
│   ├── mutual_within.Rd
│   ├── mutual_total.Rd
│   ├── ipf.Rd
│   └── mutual_difference.Rd
├── .gitignore
├── codecov.yml
├── cran-comments.md
├── .Rbuildignore
├── CITATION.cff
├── R
│   ├── RcppExports.R
│   ├── entropy.R
│   ├── data.R
│   ├── exposure.R
│   ├── dissimilarity.R
│   ├── segregation.R
│   ├── ipf.R
│   ├── plots.R
│   └── mutual_expected.R
├── inst
│   └── CITATION
├── NAMESPACE
├── _pkgdown.yml
├── DESCRIPTION
├── NEWS.md
├── data-raw
│   └── clean_data.R
├── README.Rmd
└── README.md

/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | CXX_STD=CXX17
2 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Benjamin Elbers
--------------------------------------------------------------------------------
/data/school_ses.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/data/school_ses.rda
--------------------------------------------------------------------------------
/data/schools00.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/data/schools00.rda
--------------------------------------------------------------------------------
/data/schools05.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/data/schools05.rda
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
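# (Added comment, not in the original file) Standard testthat entry point;
# note that the individual files under tests/testthat/ additionally skip
# themselves unless the NOT_CRAN environment variable is set to "true".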
1 | library(testthat)
2 | library(segregation)
3 |
4 | test_check("segregation")
5 |
--------------------------------------------------------------------------------
/man/figures/README-segplot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/man/figures/README-segplot-1.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.Rproj
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | data-raw/sc*
7 | inst/doc
8 | docs/
9 | .DS_Store
10 | .vscode/*
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 |   status:
5 |     project:
6 |       default:
7 |         target: auto
8 |         threshold: 1%
9 |     patch:
10 |       default:
11 |         target: auto
12 |         threshold: 1%
13 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Test environments
2 |
3 | * local macOS 13.3.1, R 4.2.0
4 | * tested using GitHub Actions (mac, win, ubuntu)
5 | * tested using win-devel
6 |
7 | ## R CMD check results
8 |
9 | No ERRORs, WARNINGs, or NOTEs.
10 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^CRAN-RELEASE$
2 | ^.*\.Rproj$
3 | ^.*\.rhistory$
4 | ^\.Rproj\.user$
5 | ^data-raw$
6 | ^README\.Rmd$
7 | ^README-.*\.png$
8 | ^cran-comments\.md$
9 | ^codecov\.yml$
10 | ^render_README\.R$
11 | ^docs$
12 | ^\.lintr$
13 | ^\.github$
14 | ^_pkgdown.yml$
15 | ^CITATION.cff$
16 | ^\.vscode$
17 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | preferred-citation:
3 |   type: article
4 |   message: "If you use {segregation} in your research, please cite the following paper."
5 |   authors:
6 |     - family-names: "Elbers"
7 |       given-names: "Benjamin"
8 |       orcid: "https://orcid.org/0000-0001-5392-3448"
9 |   title: "A Method for Studying Differences in Segregation Across Time and Space"
10 |   doi: "10.1177/0049124121986204"
11 |   journal: "Sociological Methods & Research"
12 |   year: 2021
13 |
--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | compress_compute_cpp <- function(neighbors_option, m_neighbors, n_neighbors, m_data, unit_names, max_iter) {
5 |     .Call(`_segregation_compress_compute_cpp`, neighbors_option, m_neighbors, n_neighbors, m_data, unit_names, max_iter)
6 | }
7 |
8 | get_crosswalk_cpp <- function(old_unit, new_unit) {
9 |     .Call(`_segregation_get_crosswalk_cpp`, old_unit, new_unit)
10 | }
11 |
--------------------------------------------------------------------------------
/man/scree_plot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compression.R
3 | \name{scree_plot}
4 | \alias{scree_plot}
5 | \title{Scree plot for segregation compression}
6 | \usage{
7 | scree_plot(compression, tail = Inf)
8 | }
9 | \arguments{
10 | \item{compression}{A "segcompression" object returned by \link{compress}.}
11 |
12 | \item{tail}{Return only the last \code{tail} units (default: \code{Inf})}
13 | }
14 | \value{
15 | Returns a ggplot2 plot.
16 | }
17 | \description{
18 | A plot that visualizes the effect of compression
19 | on mutual information.
20 | }
21 |
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | bibentry(
2 |     header="To cite 'segregation' in publications use:",
3 |     bibtype="Article",
4 |     title="A Method for Studying Differences in Segregation Across Time and Space",
5 |     author=as.person("Benjamin Elbers"),
6 |     journal="Sociological Methods & Research",
7 |     year="2021",
8 |     volume="52",
9 |     number="1",
10 |     pages="5-42",
11 |     doi="10.1177/0049124121986204",
12 |     textVersion=paste(
13 |         "Benjamin Elbers. 2021.",
14 |         "A Method for Studying Differences in Segregation Across Time and Space.",
15 |         "Sociological Methods & Research 52(1): 5-42. doi: 10.1177/0049124121986204"
16 |     )
17 | )
--------------------------------------------------------------------------------
/man/school_ses.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{school_ses}
5 | \alias{school_ses}
6 | \title{Student-level data including SES status}
7 | \format{
8 | A data frame with 5,153 rows and 3 variables:
9 | \describe{
10 | \item{school_id}{school ID}
11 | \item{ethnic_group}{one of A, B, or C}
12 | \item{ses_quintile}{SES of the student (1 = lowest, 5 = highest)}
13 | }
14 | }
15 | \usage{
16 | school_ses
17 | }
18 | \description{
19 | Fake dataset used for examples. This is an individual-level
20 | dataset of students in schools.
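}
\examples{
# Added example (a hedged sketch, not in the original file): school_ses is
# individual-level data, so mutual_total() can be called without a weight.
mutual_total(school_ses, "ethnic_group", "school_id")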
21 | }
22 | \keyword{datasets}
23 |
--------------------------------------------------------------------------------
/tests/testthat/test_mutual_total_nested.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_mutual_total_nested")
7 |
8 | test_that("works both ways around", {
9 |     decomp <- mutual_total_nested(schools00, "race",
10 |         c("state", "district", "school"),
11 |         weight = "n"
12 |     )
13 |     term1 <- mutual_total(schools00, "race", "state", weight = "n")$est
14 |     term2 <- mutual_total(schools00, "race", "district", within = "state", weight = "n")$est
15 |     term3 <- mutual_total(schools00, "race", "school", within = c("state", "district"), weight = "n")$est
16 |     expect_equal(decomp$est, c(term1, term2, term3))
17 | })
18 |
--------------------------------------------------------------------------------
/man/isolation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/exposure.R
3 | \name{isolation}
4 | \alias{isolation}
5 | \title{Calculates isolation indices}
6 | \usage{
7 | isolation(data, group, unit, weight = NULL)
8 | }
9 | \arguments{
10 | \item{data}{A data frame.}
11 |
12 | \item{group}{A categorical variable
13 | contained in \code{data}. Defines the first dimension
14 | over which segregation is computed.}
15 |
16 | \item{unit}{A vector of variables
17 | contained in \code{data}. Defines the second dimension
18 | over which segregation is computed.}
19 |
20 | \item{weight}{Numeric. (Default \code{NULL})}
21 | }
22 | \value{
23 | Returns a data.table with group column and isolation index.
24 | }
25 | \description{
26 | Returns the isolation index of each group
27 | }
28 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(as.dendrogram,segcompression)
4 | S3method(print,segcompression)
5 | export(compress)
6 | export(dissimilarity)
7 | export(dissimilarity_expected)
8 | export(entropy)
9 | export(exposure)
10 | export(get_crosswalk)
11 | export(ipf)
12 | export(isolation)
13 | export(matrix_to_long)
14 | export(merge_units)
15 | export(mutual_difference)
16 | export(mutual_local)
17 | export(mutual_local_expected)
18 | export(mutual_total)
19 | export(mutual_total_expected)
20 | export(mutual_total_nested)
21 | export(mutual_within)
22 | export(scree_plot)
23 | export(segcurve)
24 | export(segplot)
25 | import(RcppProgress)
26 | import(data.table)
27 | importFrom(Rcpp,sourceCpp)
28 | importFrom(stats,as.dendrogram)
29 | useDynLib(segregation, .registration = TRUE)
30 |
--------------------------------------------------------------------------------
/man/segregation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/segregation.R
3 | \docType{package}
4 | \name{segregation}
5 | \alias{segregation-package}
6 | \alias{segregation}
7 | \title{segregation: Entropy-based segregation indices}
8 | \description{
9 | Calculate and decompose entropy-based, multigroup segregation indices, with a focus
10 | on the Mutual Information Index (M) and Theil's Information Index (H).
11 | Provides tools to decompose the measures by groups and units, and by within 12 | and between terms. Includes standard error estimation by bootstrapping. 13 | } 14 | \seealso{ 15 | \url{https://elbersb.com/segregation} 16 | } 17 | \author{ 18 | \strong{Maintainer}: Benjamin Elbers \email{be2239@columbia.edu} (\href{https://orcid.org/0000-0001-5392-3448}{ORCID}) 19 | 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/exposure.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exposure.R 3 | \name{exposure} 4 | \alias{exposure} 5 | \title{Calculates pairwise exposure indices} 6 | \usage{ 7 | exposure(data, group, unit, weight = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data frame.} 11 | 12 | \item{group}{A categorical variable 13 | contained in \code{data}. Defines the first dimension 14 | over which segregation is computed.} 15 | 16 | \item{unit}{A vector of variables 17 | contained in \code{data}. Defines the second dimension 18 | over which segregation is computed.} 19 | 20 | \item{weight}{Numeric. (Default \code{NULL})} 21 | } 22 | \value{ 23 | Returns a data.table with columns "of", "to", and 24 | "exposure". Read results as "exposure of group x to group y". 25 | } 26 | \description{ 27 | Returns the pairwise exposure indices between groups 28 | } 29 | -------------------------------------------------------------------------------- /man/schools00.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{schools00} 5 | \alias{schools00} 6 | \title{Ethnic/racial composition of schools for 2000/2001} 7 | \format{ 8 | A data frame with 8,142 rows and 5 variables: 9 | \describe{ 10 | \item{state}{either A, B, or C} 11 | \item{district}{school agency/district ID} 12 | \item{school}{school ID} 13 | \item{race}{either native, asian, hispanic, black, or white} 14 | \item{n}{n of students by school and race} 15 | } 16 | } 17 | \usage{ 18 | schools00 19 | } 20 | \description{ 21 | Fake dataset used for examples. Loosely based on data provided by 22 | the National Center for Education Statistics, Common Core of Data, 23 | with information on U.S. primary schools in three U.S. states. 24 | The original data can be downloaded at \url{https://nces.ed.gov/ccd/}. 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/schools05.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{schools05} 5 | \alias{schools05} 6 | \title{Ethnic/racial composition of schools for 2005/2006} 7 | \format{ 8 | A data frame with 8,013 rows and 5 variables: 9 | \describe{ 10 | \item{state}{either A, B, or C} 11 | \item{district}{school agency/district ID} 12 | \item{school}{school ID} 13 | \item{race}{either native, asian, hispanic, black, or white} 14 | \item{n}{n of students by school and race} 15 | } 16 | } 17 | \usage{ 18 | schools05 19 | } 20 | \description{ 21 | Fake dataset used for examples. Loosely based on data provided by 22 | the National Center for Education Statistics, Common Core of Data, 23 | with information on U.S. primary schools in three U.S. 
states. 24 | The original data can be downloaded at \url{https://nces.ed.gov/ccd/}. 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/segcurve.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plots.R 3 | \name{segcurve} 4 | \alias{segcurve} 5 | \title{A visual representation of two-group segregation} 6 | \usage{ 7 | segcurve(data, group, unit, weight = NULL, segment = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data frame.} 11 | 12 | \item{group}{A categorical variable contained in \code{data}. 13 | Defines the first dimension over which segregation is computed.} 14 | 15 | \item{unit}{A categorical variable contained in \code{data}. 16 | Defines the second dimension over which segregation is computed.} 17 | 18 | \item{weight}{Numeric. (Default \code{NULL})} 19 | 20 | \item{segment}{A categorical variable contained in \code{data}. (Default \code{NULL}) 21 | If given, several segregation curves will be shown, one for each segment.} 22 | } 23 | \value{ 24 | Returns a ggplot2 object. 25 | } 26 | \description{ 27 | Produces one or several segregation curves, as defined in Duncan and Duncan (1955) 28 | } 29 | -------------------------------------------------------------------------------- /man/merge_units.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compression.R 3 | \name{merge_units} 4 | \alias{merge_units} 5 | \title{Creates a compressed dataset} 6 | \usage{ 7 | merge_units(compression, n_units = NULL, percent = NULL, parts = FALSE) 8 | } 9 | \arguments{ 10 | \item{compression}{A "segcompression" object returned by \link{compress}.} 11 | 12 | \item{n_units}{Determines the number of merges by specifying the number of 13 | units to remain in the compressed dataset. 14 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})} 15 | 16 | \item{percent}{Determines the number of merges by specifying the percentage 17 | of total segregation information retained in the compressed dataset. 18 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})} 19 | 20 | \item{parts}{(default: FALSE)} 21 | } 22 | \value{ 23 | Returns a data.table. 24 | } 25 | \description{ 26 | After running \link{compress}, this function creates a dataset where 27 | units are merged. 
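}
\examples{
\dontrun{
# Added sketch (not in the original file): compress schools00, inspect the
# scree plot, then keep a compressed dataset with 100 school units.
comp <- compress(schools00, "race", "school", weight = "n")
scree_plot(comp)
merged <- merge_units(comp, n_units = 100)
}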
28 | }
29 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://elbersb.com/segregation
2 |
3 | authors:
4 |   Benjamin Elbers:
5 |     href: https://elbersb.com
6 |
7 | template:
8 |   bootstrap: 5
9 |
10 | reference:
11 |   - title: Segregation indices
12 |     contents:
13 |       - dissimilarity
14 |       - exposure
15 |       - isolation
16 |       - mutual_total
17 |       - mutual_total_nested
18 |       - mutual_within
19 |       - mutual_local
20 |   - title: Visualizing segregation
21 |     contents:
22 |       - segcurve
23 |       - segplot
24 |   - title: Debiasing
25 |     contents:
26 |       - mutual_total_expected
27 |       - mutual_local_expected
28 |       - dissimilarity_expected
29 |   - title: Comparing differences
30 |     contents:
31 |       - mutual_difference
32 |       - ipf
33 |   - title: Compressing segregation
34 |     contents:
35 |       - compress
36 |       - merge_units
37 |       - get_crosswalk
38 |       - scree_plot
39 |   - title: Datasets
40 |     contents:
41 |       - school_ses
42 |       - schools00
43 |       - schools05
44 |   - title: Helper functions
45 |     contents:
46 |       - entropy
47 |       - matrix_to_long
48 |
--------------------------------------------------------------------------------
/man/get_crosswalk.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compression.R
3 | \name{get_crosswalk}
4 | \alias{get_crosswalk}
5 | \title{Create crosswalk after compression}
6 | \usage{
7 | get_crosswalk(compression, n_units = NULL, percent = NULL, parts = FALSE)
8 | }
9 | \arguments{
10 | \item{compression}{A "segcompression" object returned by \link{compress}.}
11 |
12 | \item{n_units}{Determines the number of merges by specifying the number of
13 | units to remain in the compressed dataset.
14 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})}
15 |
16 | \item{percent}{Determines the number of merges by specifying the percentage
17 | of total segregation information retained in the compressed dataset.
18 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})}
19 |
20 | \item{parts}{(default: FALSE)}
21 | }
22 | \value{
23 | Returns a data.table.
24 | }
25 | \description{
26 | After running \link{compress}, this function creates a crosswalk table.
27 | Usually it is preferred to call \link{merge_units} directly.
28 | }
29 |
--------------------------------------------------------------------------------
/man/entropy.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/entropy.R
3 | \name{entropy}
4 | \alias{entropy}
5 | \title{Calculates the entropy of a distribution}
6 | \usage{
7 | entropy(data, group, within = NULL, weight = NULL, base = exp(1))
8 | }
9 | \arguments{
10 | \item{data}{A data frame.}
11 |
12 | \item{group}{A categorical variable or a vector of variables
13 | contained in \code{data}.}
14 |
15 | \item{within}{A categorical variable or a vector of variables
16 | contained in \code{data}.}
17 |
18 | \item{weight}{Numeric. (Default \code{NULL})}
19 |
20 | \item{base}{Base of the logarithm that is used in the entropy
21 | calculation. Defaults to the natural logarithm.}
22 | }
23 | \value{
24 | A single number, the entropy.
25 | }
26 | \description{
27 | Returns the entropy of the distribution defined by
28 | \code{group}.
29 | }
30 | \examples{
31 | d <- data.frame(cat = c("A", "B"), n = c(25, 75))
32 | entropy(d, "cat", weight = "n") # => .56
33 | # this is equivalent to -.25*log(.25)-.75*log(.75)
34 |
35 | d <- data.frame(cat = c("A", "B"), n = c(50, 50))
36 | # use base 2 for the logarithm, then entropy is maximized at 1
37 | entropy(d, "cat", weight = "n", base = 2) # => 1
38 | }
39 |
--------------------------------------------------------------------------------
/man/matrix_to_long.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/segregation.R
3 | \name{matrix_to_long}
4 | \alias{matrix_to_long}
5 | \title{Turns a contingency table into long format}
6 | \usage{
7 | matrix_to_long(
8 |   matrix,
9 |   group = "group",
10 |   unit = "unit",
11 |   weight = "n",
12 |   drop_zero = TRUE
13 | )
14 | }
15 | \arguments{
16 | \item{matrix}{A matrix, where the rows represent the units, and the
17 | columns represent the groups.}
18 |
19 | \item{group}{Variable name for group. (Default \code{group})}
20 |
21 | \item{unit}{Variable name for unit. (Default \code{unit})}
22 |
23 | \item{weight}{Variable name for frequency weight. (Default \code{n})}
24 |
25 | \item{drop_zero}{Drop unit-group combinations with zero weight. (Default \code{TRUE})}
26 | }
27 | \value{
28 | A data.table.
29 | }
30 | \description{
31 | Returns a data.table in long form, such that it is suitable
32 | for use in \link{mutual_total}, etc. Colnames and rownames of
33 | the matrix will be respected.
34 | }
35 | \examples{
36 | m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
37 | colnames(m) <- c("Black", "White")
38 | long <- matrix_to_long(m, group = "race", unit = "school")
39 | mutual_total(long, "race", "school", weight = "n")
40 | }
41 |
--------------------------------------------------------------------------------
/tests/testthat/test_exposure_isolation.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_exposure_isolation")
7 |
8 | test_that("two group case", {
9 |     two <- data.table::as.data.table(schools00)
10 |     two <- two[race %in% c("white", "black")]
11 |     exp <- exposure(two, "race", "school", "n")
12 |     # relationship: sum of xPy and yPx = 1
13 |     expect_equal(exp[, .(sum = sum(exposure)), by = .(of)][["sum"]], c(1, 1))
14 |     # relationship: xPy = yPx Y/X
15 |     white_total <- two[race == "white", sum(n)]
16 |     black_total <- two[race == "black", sum(n)]
17 |     expect_equal(
18 |         exp[of == "black" & to == "white", exposure],
19 |         exp[of == "white" & to == "black", exposure] * white_total / black_total
20 |     )
21 | })
22 |
23 | test_that("exposure", {
24 |     exp <- exposure(schools00, "race", "school", "n")
25 |     expect_equal(
26 |         exp[, .(sum = sum(exposure)), by = .(of)][["sum"]],
27 |         rep(1, 5)
28 |     )
29 | })
30 |
31 | test_that("exposure and isolation", {
32 |     exp <- exposure(schools00, "race", "school", "n")[of == to]
33 |     iso <- isolation(schools00, "race", "school", "n")
34 |     comp <- merge(exp, iso, by.x = "of", by.y = "race")
35 |     expect_equal(comp[["isolation"]], comp[["exposure"]])
36 | })
37 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 
| # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }} 24 | permissions: 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - uses: r-lib/actions/setup-pandoc@v2 30 | 31 | - uses: r-lib/actions/setup-r@v2 32 | with: 33 | use-public-rspm: true 34 | 35 | - uses: r-lib/actions/setup-r-dependencies@v2 36 | with: 37 | extra-packages: any::pkgdown, local::., any::tigris 38 | needs: website 39 | 40 | - name: Build site 41 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 42 | shell: Rscript {0} 43 | 44 | - name: Deploy to GitHub pages 🚀 45 | if: github.event_name != 'pull_request' 46 | uses: JamesIves/github-pages-deploy-action@v4.5.0 47 | with: 48 | clean: false 49 | branch: gh-pages 50 | folder: docs 51 | -------------------------------------------------------------------------------- /man/mutual_total_nested.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_total_nested} 4 | \alias{mutual_total_nested} 5 | \title{Calculates a nested decomposition of segregation for M and H} 6 | \usage{ 7 | mutual_total_nested(data, group, unit, weight = NULL, base = exp(1)) 8 | } 9 | \arguments{ 10 | \item{data}{A data frame.} 11 | 12 | \item{group}{A categorical variable or a vector of variables 13 | contained in \code{data}. Defines the first dimension 14 | over which segregation is computed.} 15 | 16 | \item{unit}{A vector of variables 17 | contained in \code{data}. Defines the levels at which 18 | the decomposition should be computed.} 19 | 20 | \item{weight}{Numeric. (Default \code{NULL})} 21 | 22 | \item{base}{Base of the logarithm that is used in the calculation. 23 | Defaults to the natural logarithm.} 24 | } 25 | \value{ 26 | Returns a data.table similar to \code{\link{mutual_total}}, 27 | but with column \code{between} and \code{within} that define 28 | the levels of nesting. 29 | } 30 | \description{ 31 | Returns the between-within decomposition defined by 32 | the sequence of variables in \code{unit}. 
33 | }
34 | \examples{
35 | mutual_total_nested(schools00, "race", c("state", "district", "school"),
36 |     weight = "n"
37 | )
38 | # This is a simpler way to run the following manually:
39 | # mutual_total(schools00, "race", "state", weight = "n")
40 | # mutual_total(schools00, "race", "district", within = "state", weight = "n")
41 | # mutual_total(schools00, "race", "school", within = c("state", "district"), weight = "n")
42 | }
43 |
--------------------------------------------------------------------------------
/man/compress.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compression.R
3 | \name{compress}
4 | \alias{compress}
5 | \title{Compresses a data matrix based on mutual information (segregation)}
6 | \usage{
7 | compress(
8 |   data,
9 |   group,
10 |   unit,
11 |   weight = NULL,
12 |   neighbors = "local",
13 |   n_neighbors = 50,
14 |   max_iter = Inf
15 | )
16 | }
17 | \arguments{
18 | \item{data}{A data frame.}
19 |
20 | \item{group}{A categorical variable
21 | contained in \code{data}. Defines the first dimension
22 | over which segregation is computed.}
23 |
24 | \item{unit}{A categorical variable
25 | contained in \code{data}. Defines the second dimension
26 | over which segregation is computed.}
27 |
28 | \item{weight}{Numeric. Only frequency weights are allowed.
29 | (Default \code{NULL})}
30 |
31 | \item{neighbors}{Either a data frame or a character. If data frame, then
32 | it needs exactly two columns, where each row identifies
33 | a set of "neighbors" that may be merged.
34 | If "local", considers the \code{n_neighbors} closest neighbors
35 | in terms of local segregation.
36 | If "all", all units are considered as possible neighbors. This
37 | may be very time-consuming.}
38 |
39 | \item{n_neighbors}{Only relevant if \code{neighbors} is \code{"local"}.}
40 |
41 | \item{max_iter}{Maximum number of iterations (Default \code{Inf})}
42 | }
43 | \value{
44 | Returns a "segcompression" object.
45 | }
46 | \description{
47 | Given a data set that identifies suitable neighbors for merging,
48 | this function will merge units iteratively, where in each iteration
49 | the neighbors with the smallest reduction in terms of total M will be merged.
50 | }
51 |
--------------------------------------------------------------------------------
/.github/workflows/check-standard.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }} 31 | R_KEEP_PKG_SOURCE: yes 32 | 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - uses: r-lib/actions/setup-pandoc@v2 37 | 38 | - uses: r-lib/actions/setup-r@v2 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | http-user-agent: ${{ matrix.config.http-user-agent }} 42 | use-public-rspm: true 43 | 44 | - uses: r-lib/actions/setup-r-dependencies@v2 45 | with: 46 | extra-packages: any::rcmdcheck 47 | needs: check 48 | 49 | - uses: r-lib/actions/check-r-package@v2 50 | with: 51 | upload-snapshots: true 52 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 53 | -------------------------------------------------------------------------------- /R/entropy.R: -------------------------------------------------------------------------------- 1 | #' Calculates the entropy of a distribution 2 | #' 3 | #' Returns the entropy of the distribution defined by 4 | #' \code{group}. 5 | #' 6 | #' @param data A data frame. 7 | #' @param group A categorical variable or a vector of variables 8 | #' contained in \code{data}. 9 | #' @param within A categorical variable or a vector of variables 10 | #' contained in \code{data}. 11 | #' @param weight Numeric. (Default \code{NULL}) 12 | #' @param base Base of the logarithm that is used in the entropy 13 | #' calculation. Defaults to the natural logarithm. 14 | #' @return A single number, the entropy. 15 | #' @examples 16 | #' d <- data.frame(cat = c("A", "B"), n = c(25, 75)) 17 | #' entropy(d, "cat", weight = "n") # => .56 18 | #' # this is equivalent to -.25*log(.25)-.75*log(.75) 19 | #' 20 | #' d <- data.frame(cat = c("A", "B"), n = c(50, 50)) 21 | #' # use base 2 for the logarithm, then entropy is maximized at 1 22 | #' entropy(d, "cat", weight = "n", base = 2) # => 1 23 | #' @import data.table 24 | #' @export 25 | entropy <- function(data, group, within = NULL, weight = NULL, base = exp(1)) { 26 | # use provided weight 27 | if (!is.null(weight)) { 28 | data[, "freq"] <- data[[weight]] 29 | } else { 30 | data[, "freq"] <- 1 31 | } 32 | data.table::setDT(data) 33 | n_total <- sum(data[, "freq"]) 34 | 35 | if (is.null(within)) { 36 | vars <- group 37 | entropy_within <- 0 38 | } else { 39 | vars <- c(group, within) 40 | p_within <- data[, list(p = sum(freq)), by = within][["p"]] / n_total 41 | entropy_within <- -sum(p_within * logf(p_within, base)) 42 | } 43 | 44 | p <- data[, list(p = sum(freq)), by = vars][["p"]] / n_total 45 | -sum(p * logf(p, base)) - entropy_within 46 | } 47 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Ethnic/racial composition of schools for 2000/2001 2 | #' 3 | #' Fake dataset used for examples. 
#' Loosely based on data provided by
4 | #' the National Center for Education Statistics, Common Core of Data,
5 | #' with information on U.S. primary schools in three U.S. states.
6 | #' The original data can be downloaded at \url{https://nces.ed.gov/ccd/}.
7 | #'
8 | #' @format A data frame with 8,142 rows and 5 variables:
9 | #' \describe{
10 | #' \item{state}{either A, B, or C}
11 | #' \item{district}{school agency/district ID}
12 | #' \item{school}{school ID}
13 | #' \item{race}{either native, asian, hispanic, black, or white}
14 | #' \item{n}{n of students by school and race}
15 | #' }
16 | "schools00"
17 |
18 | #' Ethnic/racial composition of schools for 2005/2006
19 | #'
20 | #' Fake dataset used for examples. Loosely based on data provided by
21 | #' the National Center for Education Statistics, Common Core of Data,
22 | #' with information on U.S. primary schools in three U.S. states.
23 | #' The original data can be downloaded at \url{https://nces.ed.gov/ccd/}.
24 | #'
25 | #' @format A data frame with 8,013 rows and 5 variables:
26 | #' \describe{
27 | #' \item{state}{either A, B, or C}
28 | #' \item{district}{school agency/district ID}
29 | #' \item{school}{school ID}
30 | #' \item{race}{either native, asian, hispanic, black, or white}
31 | #' \item{n}{n of students by school and race}
32 | #' }
33 | "schools05"
34 |
35 | #' Student-level data including SES status
36 | #'
37 | #' Fake dataset used for examples. This is an individual-level
38 | #' dataset of students in schools.
39 | #'
40 | #' @format A data frame with 5,153 rows and 3 variables:
41 | #' \describe{
42 | #' \item{school_id}{school ID}
43 | #' \item{ethnic_group}{one of A, B, or C}
44 | #' \item{ses_quintile}{SES of the student (1 = lowest, 5 = highest)}
45 | #' }
46 | "school_ses"
47 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: segregation
2 | Type: Package
3 | Title: Entropy-Based Segregation Indices
4 | Version: 1.1.0.9000
5 | Authors@R: person("Benjamin", "Elbers", email = "be2239@columbia.edu",
6 |     role = c("aut", "cre"), comment = c(ORCID = "0000-0001-5392-3448"))
7 | Description: Computes segregation indices, including the Index of Dissimilarity,
8 |     as well as the information-theoretic indices developed by
9 |     Theil (1971), namely
10 |     the Mutual Information Index (M) and Theil's Information Index (H).
11 |     The M, further described by Mora and Ruiz-Castillo (2011)
12 |     and Frankel and Volij (2011),
13 |     is a measure of segregation that is highly decomposable. The package provides
14 |     tools to decompose the index by units and groups (local segregation),
15 |     and by within and between terms. The package also provides a method to decompose
16 |     differences in segregation as described by Elbers (2021) <doi:10.1177/0049124121986204>.
17 |     The package includes standard error estimation by bootstrapping, which also corrects for
18 |     small sample bias. The package also contains functions for visualizing segregation patterns.
19 | License: MIT + file LICENSE
20 | Depends: R (>= 3.5.0)
21 | Imports:
22 |     data.table,
23 |     checkmate,
24 |     Rcpp,
25 |     RcppProgress
26 | Encoding: UTF-8
27 | LazyData: true
28 | Suggests:
29 |     testthat,
30 |     covr,
31 |     knitr,
32 |     rmarkdown,
33 |     dplyr,
34 |     ggplot2,
35 |     scales,
36 |     tidycensus,
37 |     tigris,
38 |     rrapply,
39 |     dendextend,
40 |     patchwork
41 | URL: https://elbersb.github.io/segregation/
42 | BugReports: https://github.com/elbersb/segregation/issues
43 | RoxygenNote: 7.3.2
44 | VignetteBuilder: knitr
45 | SystemRequirements: C++17
46 | LinkingTo:
47 |     Rcpp,
48 |     RcppProgress
49 |
--------------------------------------------------------------------------------
/tests/testthat/test_entropy.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_entropy")
7 |
8 | test_that("custom log function", {
9 |     expect_equal(logf(2, exp(1)), log(2))
10 | })
11 |
12 | test_that("correct entropy calculation", {
13 |     expect_equal(entropy(data.frame(x = c(1)), "x"), 0)
14 |     expect_equal(entropy(data.frame(x = c(1, 2)), "x"), log(2))
15 |     expect_equal(entropy(data.frame(x = c(1, 2, 3)), "x"), log(3))
16 |
17 |     expect_equal(
18 |         entropy(data.frame(x = c(1, 2), n = c(10, 10)), "x", weight = "n"),
19 |         log(2)
20 |     )
21 |     expect_equal(
22 |         entropy(data.frame(x = c(1, 2), n = c(10, 10)), "x", weight = "n", base = 2),
23 |         1
24 |     )
25 |     expect_equal(
26 |         entropy(data.frame(x = c(1, 2), n = c(10, 30)), "x", weight = "n"),
27 |         .25 * log(1 / .25) + .75 * log(1 / .75)
28 |     )
29 | })
30 |
31 | test_that("conditional entropy calculation", {
32 |     # complete independence
33 |     df <- data.frame(
34 |         race = c("w", "w", "b", "b"),
35 |         district = c(1, 2, 1, 2),
36 |         n = c(10, 10, 10, 10)
37 |     )
38 |
39 |     expect_equal(entropy(df, "race", weight = "n"), log(2))
40 |     expect_equal(entropy(df, "district", weight = "n"), log(2))
41 |     expect_equal(entropy(df, c("race", "district"), weight = "n"), log(4))
42 |     expect_equal(entropy(df, "race", within = "district", weight = "n"), log(2))
43 |
44 |     # complete dependence
45 |     df2 <- data.frame(
46 |         race = c("w", "w", "b", "b"),
47 |         district = c("1", "2", "1", "2"),
48 |         n = c(10, 0, 0, 10)
49 |     )
50 |
51 |     expect_equal(entropy(df2, "race", weight = "n"), log(2))
52 |     expect_equal(entropy(df2, "district", weight = "n"), log(2))
53 |     expect_equal(entropy(df2, c("race", "district"), weight = "n"), log(2))
54 |     expect_equal(entropy(df2, "race", within = "district", weight = "n"), 0)
55 | })
56 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: test-coverage.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | test-coverage: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::covr, any::xml2 28 | needs: coverage 29 | 30 | - name: Test coverage 31 | run: | 32 | cov <- covr::package_coverage( 33 | quiet = FALSE, 34 | clean = FALSE, 35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 36 | ) 37 | print(cov) 38 | covr::to_cobertura(cov) 39 | shell: Rscript {0} 40 | 41 | - uses: codecov/codecov-action@v5 42 | with: 43 | # Fail if error if not on PR, or if on PR and token is given 44 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} 45 | files: ./cobertura.xml 46 | plugins: noop 47 | disable_search: true 48 | token: ${{ secrets.CODECOV_TOKEN }} 49 | 50 | - name: Show testthat output 51 | if: always() 52 | run: | 53 | ## -------------------------------------------------------------------- 54 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 55 | shell: bash 56 | 57 | - name: Upload test results 58 | if: failure() 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: coverage-test-failures 62 | path: ${{ runner.temp }}/package 63 | -------------------------------------------------------------------------------- /man/dissimilarity_expected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_expected.R 3 | \name{dissimilarity_expected} 4 | \alias{dissimilarity_expected} 5 | \title{Calculates expected values when true segregation is zero} 6 | \usage{ 7 | dissimilarity_expected( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | fixed_margins = TRUE, 13 | n_bootstrap = 100 14 | ) 15 | } 16 | \arguments{ 17 | \item{data}{A data frame.} 18 | 19 | \item{group}{A categorical variable or a vector of variables 20 | contained in \code{data}. Defines the first dimension 21 | over which segregation is computed.} 22 | 23 | \item{unit}{A categorical variable or a vector of variables 24 | contained in \code{data}. Defines the second dimension 25 | over which segregation is computed.} 26 | 27 | \item{weight}{Numeric. (Default \code{NULL})} 28 | 29 | \item{fixed_margins}{Should the margins be fixed or simulated? (Default \code{TRUE})} 30 | 31 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 32 | } 33 | \value{ 34 | A data.table with one row, corresponding to the expected value of 35 | the D index when true segregation is zero. 36 | } 37 | \description{ 38 | When sample sizes are small, one group has a small proportion, or 39 | when there are many units, segregation indices are typically upwardly 40 | biased, even when true segregation is zero. This function simulates 41 | tables with zero segregation, given the marginals of the dataset, 42 | and calculates segregation. If the expected values are large, 43 | the interpretation of index scores might have to be adjusted. 
44 | }
45 | \examples{
46 | # build a smaller table, with 100 students distributed across
47 | # 10 schools, where one racial group has 10\% of the students
48 | small <- data.frame(
49 |     school = c(1:10, 1:10),
50 |     race = c(rep("r1", 10), rep("r2", 10)),
51 |     n = c(rep(1, 10), rep(9, 10))
52 | )
53 | dissimilarity_expected(small, "race", "school", weight = "n")
54 | # with an increase in sample size (n=1000), the values improve
55 | small$n <- small$n * 10
56 | dissimilarity_expected(small, "race", "school", weight = "n")
57 | }
58 |
--------------------------------------------------------------------------------
/tests/testthat/test_matrix_to_long.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_matrix_to_long")
7 |
8 | test_that("accept only matrix", {
9 |     a <- data.frame()
10 |     expect_error(matrix_to_long(a))
11 | })
12 |
13 | test_that("no names", {
14 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
15 |     long <- matrix_to_long(m)
16 |     expect_equal(names(long), c("unit", "group", "n"))
17 |     expect_equal(long$unit, as.character(rep(1:3, 2)))
18 |     expect_equal(long$group, as.character(c(1, 1, 1, 2, 2, 2)))
19 | })
20 |
21 | test_that("colnames only", {
22 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
23 |     colnames(m) <- c("A", "B")
24 |     long <- matrix_to_long(m)
25 |     expect_equal(names(long), c("unit", "group", "n"))
26 |     expect_equal(long$unit, as.character(rep(1:3, 2)))
27 |     expect_equal(long$group, as.character(c("A", "A", "A", "B", "B", "B")))
28 | })
29 |
30 | test_that("rownames only", {
31 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
32 |     rownames(m) <- c("S1", "S2", "S3")
33 |     long <- matrix_to_long(m)
34 |     expect_equal(names(long), c("unit", "group", "n"))
35 |     expect_equal(long$unit, rep(c("S1", "S2", "S3"), 2))
36 |     expect_equal(long$group, as.character(c(1, 1, 1, 2, 2, 2)))
37 | })
38 |
39 | test_that("rownames + colnames", {
40 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
41 |     colnames(m) <- c("A", "B")
42 |     rownames(m) <- c("S1", "S2", "S3")
43 |     long <- matrix_to_long(m)
44 |     expect_equal(names(long), c("unit", "group", "n"))
45 |     expect_equal(long$unit, rep(c("S1", "S2", "S3"), 2))
46 |     expect_equal(long$group, as.character(c("A", "A", "A", "B", "B", "B")))
47 | })
48 |
49 | test_that("arguments", {
50 |     # drop zero
51 |     m <- matrix(c(10, 20, 30, 0, 20, 0), nrow = 3)
52 |     long1 <- matrix_to_long(m)
53 |     expect_equal(nrow(long1), 4)
54 |     long2 <- matrix_to_long(m, drop_zero = FALSE)
55 |     expect_equal(nrow(long2), 6)
56 |
57 |     # change names
58 |     long3 <- matrix_to_long(m, "race", "school", weight = "weight")
59 |     expect_equal(names(long3), c("school", "race", "weight"))
60 | })
61 |
--------------------------------------------------------------------------------
/man/segplot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plots.R
3 | \name{segplot}
4 | \alias{segplot}
5 | \title{A visual representation of segregation}
6 | \usage{
7 | segplot(
8 |   data,
9 |   group,
10 |   unit,
11 |   weight,
12 |   order = "segregation",
13 |   secondary_plot = NULL,
14 |   reference_distribution = NULL,
15 |   bar_space = 0,
16 |   hline = NULL
17 | )
18 | }
19 | \arguments{
20 | \item{data}{A data frame.}
21 |
22 | \item{group}{A categorical variable or a vector of variables
23 | contained in \code{data}. Defines the first dimension
24 | over which segregation is computed.}
25 |
26 | \item{unit}{A categorical variable or a vector of variables
27 | contained in \code{data}. Defines the second dimension
28 | over which segregation is computed.}
29 |
30 | \item{weight}{Numeric. (Default \code{NULL})}
31 |
32 | \item{order}{A character, either
33 | "segregation", "entropy", "majority", or "majority_fixed".
34 | Affects the ordering of the units.
35 | The horizontal ordering of the groups can be changed
36 | by using a factor variable for \code{group}.
37 | The difference between "majority" and "majority_fixed" is that the former
38 | will reorder the groups in such a way that the majority group actually comes first.
39 | If you want to control the ordering yourself, use "majority_fixed" and specify
40 | the \code{group} variable as a factor variable.}
41 |
42 | \item{secondary_plot}{If \code{NULL} (default), no secondary plot is drawn.
43 | If "segregation", a secondary plot is drawn that shows adjusted local segregation
44 | scores for each unit. If "cumulative", a secondary plot is drawn that shows
45 | the cumulative contribution of each unit toward the total H (calculated as the
46 | proportion of each unit times the adjusted local segregation of each unit).}
47 |
48 | \item{reference_distribution}{Specifies the reference distribution, given as
49 | a two-column data frame, to be plotted on the right.
50 | If order is \code{segregation}, then this reference distribution is
51 | also used to compute the local segregation scores.}
52 |
53 | \item{bar_space}{Specifies space between single units.}
54 |
55 | \item{hline}{Default \code{NULL}. If a color is specified,
56 | horizontal lines will be drawn where groups are separated.}
57 | }
58 | \value{
59 | Returns a ggplot2 or patchwork object.
60 | }
61 | \description{
62 | Produces a segregation plot.
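}
\examples{
\dontrun{
# Added sketch (not in the original file): plot segregation in schools00,
# with units ordered by local segregation.
segplot(schools00, "race", "school", weight = "n", order = "segregation")
}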
63 | } 64 | -------------------------------------------------------------------------------- /tests/testthat/test_dissimilarity.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_dissimilarity") 7 | 8 | test_that("correct calculations", { 9 | m0 <- matrix_to_long(matrix(c(100, 100, 100, 100, 100, 100), ncol = 2)) 10 | expect_equal(dissimilarity(m0, "group", "unit", weight = "n")$est[[1]], 0) 11 | 12 | m1 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 13 | m2 <- matrix_to_long(matrix(c(80, 80, 20, 20, 20, 20, 80, 80), ncol = 2)) 14 | expect_equal(dissimilarity(m1, "group", "unit", weight = "n")$est[[1]], .6) 15 | expect_equal(dissimilarity(m2, "group", "unit", weight = "n")$est[[1]], .6) 16 | 17 | m3 <- matrix_to_long(matrix(c(100, 100, 0, 0, 0, 0, 100, 100), ncol = 2)) 18 | expect_equal(dissimilarity(m3, "group", "unit", weight = "n")$est[[1]], 1) 19 | }) 20 | 21 | test_that("alternative calculation", { 22 | tab <- t(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 23 | div <- sweep(tab, 1, rowSums(tab), "/") 24 | d <- 1 / 2 * sum(apply(div, 2, segregation:::abs_diff)) 25 | m1 <- matrix_to_long(t(tab)) 26 | expect_equal(dissimilarity(m1, "group", "unit", weight = "n")$est[[1]], d) 27 | }) 28 | 29 | test_that("SE works", { 30 | m0 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 31 | d <- dissimilarity(m0, "group", "unit", weight = "n", se = TRUE) 32 | expect_equal(dim(d), c(1, 5)) 33 | expect_equal(d$se > 0, TRUE) 34 | expect_equal(dim(attr(d, "bootstrap")), c(100, 2)) 35 | }) 36 | 37 | test_that("names of columns", { 38 | m0 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2), 39 | group = "race", unit = "tract" 40 | ) 41 | 42 | d <- dissimilarity(m0, "race", "tract", weight = "n") 43 | expect_equal(dim(d), c(1, 2)) 44 | 45 | data.table::setDT(m0) 46 | d <- dissimilarity(m0, "race", "tract", weight = "n") 47 | expect_equal(dim(d), c(1, 2)) 48 | }) 49 | 50 | 51 | test_that("bootstrapping fails when sample size is non-integer", { 52 | m0 <- matrix_to_long(matrix(c(100.3, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 53 | expect_error(dissimilarity(m0, "group", "unit", weight = "n", se = TRUE)) 54 | }) 55 | 56 | test_that("gives error when group > 2", { 57 | m0 <- matrix_to_long(matrix(c(100, 60, 40, 10, 20, 40, 60, 100, 50), ncol = 3)) 58 | expect_error(dissimilarity(m0, "group", "unit", weight = "n")) 59 | }) 60 | -------------------------------------------------------------------------------- /man/mutual_local_expected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_expected.R 3 | \name{mutual_local_expected} 4 | \alias{mutual_local_expected} 5 | \title{Calculates expected local segregation scores when true segregation is zero} 6 | \usage{ 7 | mutual_local_expected( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | fixed_margins = TRUE, 13 | n_bootstrap = 100, 14 | base = exp(1) 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{A data frame.} 19 | 20 | \item{group}{A categorical variable or a vector of variables 21 | contained in \code{data}. Defines the first dimension 22 | over which segregation is computed.} 23 | 24 | \item{unit}{A categorical variable or a vector of variables 25 | contained in \code{data}. 
Defines the units for which local
26 | segregation indices are calculated.}
27 |
28 | \item{weight}{Numeric. (Default \code{NULL})}
29 |
30 | \item{fixed_margins}{Should the margins be fixed or simulated? (Default \code{TRUE})}
31 |
32 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})}
33 |
34 | \item{base}{Base of the logarithm that is used in the calculation.
35 | Defaults to the natural logarithm.}
36 | }
37 | \value{
38 | A data.table with two rows, corresponding to the expected values of
39 | segregation when true segregation is zero.
40 | }
41 | \description{
42 | When sample sizes are small, one group has a small proportion, or
43 | when there are many units, segregation indices are typically upwardly
44 | biased, even when true segregation is zero. This function simulates
45 | tables with zero segregation, given the marginals of the dataset,
46 | and calculates local segregation scores. If the expected values are large,
47 | the interpretation of index scores might have to be adjusted.
48 | }
49 | \examples{
50 | \dontrun{
51 | # the schools00 dataset has a large sample size, so expected segregation is close to zero
52 | mutual_local_expected(schools00, "race", "school", weight = "n")
53 |
54 | # but we can build a smaller table, with 100 students distributed across
55 | # 10 schools, where one racial group has 10\% of the students
56 | small <- data.frame(
57 |     school = c(1:10, 1:10),
58 |     race = c(rep("r1", 10), rep("r2", 10)),
59 |     n = c(rep(1, 10), rep(9, 10))
60 | )
61 | mutual_local_expected(small, "race", "school", weight = "n")
62 | # with an increase in sample size (n=1000), the values improve
63 | small$n <- small$n * 10
64 | mutual_local_expected(small, "race", "school", weight = "n")
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | #include <Rcpp.h>
5 |
6 | using namespace Rcpp;
7 |
8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
9 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 |
13 | // compress_compute_cpp
14 | List compress_compute_cpp(std::string neighbors_option, StringMatrix m_neighbors, int n_neighbors, NumericMatrix m_data, std::vector<std::string> unit_names, int max_iter);
15 | RcppExport SEXP _segregation_compress_compute_cpp(SEXP neighbors_optionSEXP, SEXP m_neighborsSEXP, SEXP n_neighborsSEXP, SEXP m_dataSEXP, SEXP unit_namesSEXP, SEXP max_iterSEXP) {
16 | BEGIN_RCPP
17 |     Rcpp::RObject rcpp_result_gen;
18 |     Rcpp::RNGScope rcpp_rngScope_gen;
19 |     Rcpp::traits::input_parameter< std::string >::type neighbors_option(neighbors_optionSEXP);
20 |     Rcpp::traits::input_parameter< StringMatrix >::type m_neighbors(m_neighborsSEXP);
21 |     Rcpp::traits::input_parameter< int >::type n_neighbors(n_neighborsSEXP);
22 |     Rcpp::traits::input_parameter< NumericMatrix >::type m_data(m_dataSEXP);
23 |     Rcpp::traits::input_parameter< std::vector<std::string> >::type unit_names(unit_namesSEXP);
24 |     Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP);
25 |     rcpp_result_gen = Rcpp::wrap(compress_compute_cpp(neighbors_option, m_neighbors, n_neighbors, m_data, unit_names, max_iter));
26 |     return rcpp_result_gen;
27 | END_RCPP
28 | }
29 | // get_crosswalk_cpp
30 | List get_crosswalk_cpp(std::vector<std::string> old_unit, std::vector<std::string> new_unit);
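// (Added comment, not in the generated file) Pattern of the generated glue:
// each exported function gets a SEXP-in/SEXP-out wrapper that converts inputs
// via Rcpp::traits::input_parameter<T> and wraps the C++ return value back
// into a SEXP with Rcpp::wrap(); BEGIN_RCPP/END_RCPP translate C++ exceptions
// into R errors.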
31 | RcppExport SEXP _segregation_get_crosswalk_cpp(SEXP old_unitSEXP, SEXP new_unitSEXP) {
32 | BEGIN_RCPP
33 |     Rcpp::RObject rcpp_result_gen;
34 |     Rcpp::RNGScope rcpp_rngScope_gen;
35 |     Rcpp::traits::input_parameter< std::vector<std::string> >::type old_unit(old_unitSEXP);
36 |     Rcpp::traits::input_parameter< std::vector<std::string> >::type new_unit(new_unitSEXP);
37 |     rcpp_result_gen = Rcpp::wrap(get_crosswalk_cpp(old_unit, new_unit));
38 |     return rcpp_result_gen;
39 | END_RCPP
40 | }
41 |
42 | static const R_CallMethodDef CallEntries[] = {
43 |     {"_segregation_compress_compute_cpp", (DL_FUNC) &_segregation_compress_compute_cpp, 6},
44 |     {"_segregation_get_crosswalk_cpp", (DL_FUNC) &_segregation_get_crosswalk_cpp, 2},
45 |     {NULL, NULL, 0}
46 | };
47 |
48 | RcppExport void R_init_segregation(DllInfo *dll) {
49 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
50 |     R_useDynamicSymbols(dll, FALSE);
51 | }
52 |
--------------------------------------------------------------------------------
/man/dissimilarity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dissimilarity.R
3 | \name{dissimilarity}
4 | \alias{dissimilarity}
5 | \title{Calculates Index of Dissimilarity}
6 | \usage{
7 | dissimilarity(
8 |   data,
9 |   group,
10 |   unit,
11 |   weight = NULL,
12 |   se = FALSE,
13 |   CI = 0.95,
14 |   n_bootstrap = 100
15 | )
16 | }
17 | \arguments{
18 | \item{data}{A data frame.}
19 |
20 | \item{group}{A categorical variable or a vector of variables
21 | contained in \code{data}. Defines the first dimension
22 | over which segregation is computed. The D index only
23 | allows two distinct groups.}
24 |
25 | \item{unit}{A categorical variable or a vector of variables
26 | contained in \code{data}. Defines the second dimension
27 | over which segregation is computed.}
28 |
29 | \item{weight}{Numeric. (Default \code{NULL})}
30 |
31 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide
32 | standard errors and to apply bias correction. The bias that is reported
33 | has already been applied to the estimates (i.e. the reported estimates are "debiased")
34 | (Default \code{FALSE})}
35 |
36 | \item{CI}{If \code{se = TRUE}, compute the confidence (CI*100)\% confidence interval
37 | in addition to the bootstrap standard error.
38 | This is based on percentiles of the bootstrap distribution, and a valid interpretation
39 | relies on a larger number of bootstrap iterations. (Default \code{0.95})}
40 |
41 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})}
42 | }
43 | \value{
44 | Returns a data.table with one row. The column \code{est} contains
45 | the Index of Dissimilarity.
46 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
47 | the associated bootstrapped standard errors, an additional column \code{CI} contains
48 | the estimate confidence interval as a list column, an additional column \code{bias} contains
49 | the estimated bias, and the column \code{est} contains the bias-corrected estimates.
50 | }
51 | \description{
52 | Returns the total segregation between \code{group} and \code{unit} using
53 | the Index of Dissimilarity.
54 | } 55 | \examples{ 56 | # Example where D and H deviate 57 | m1 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 58 | m2 <- matrix_to_long(matrix(c(80, 80, 20, 20, 20, 20, 80, 80), ncol = 2)) 59 | dissimilarity(m1, "group", "unit", weight = "n") 60 | dissimilarity(m2, "group", "unit", weight = "n") 61 | } 62 | \references{ 63 | Otis Dudley Duncan and Beverly Duncan. 1955. "A Methodological Analysis of Segregation Indexes," 64 | American Sociological Review 20(2): 210-217. 65 | } 66 | -------------------------------------------------------------------------------- /man/mutual_total_expected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_expected.R 3 | \name{mutual_total_expected} 4 | \alias{mutual_total_expected} 5 | \title{Calculates expected values when true segregation is zero} 6 | \usage{ 7 | mutual_total_expected( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | within = NULL, 13 | fixed_margins = TRUE, 14 | n_bootstrap = 100, 15 | base = exp(1) 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{A data frame.} 20 | 21 | \item{group}{A categorical variable or a vector of variables 22 | contained in \code{data}. Defines the first dimension 23 | over which segregation is computed.} 24 | 25 | \item{unit}{A categorical variable or a vector of variables 26 | contained in \code{data}. Defines the second dimension 27 | over which segregation is computed.} 28 | 29 | \item{weight}{Numeric. (Default \code{NULL})} 30 | 31 | \item{within}{Apply algorithm within each group defined by this variable, 32 | and report the weighted average. (Default \code{NULL})} 33 | 34 | \item{fixed_margins}{Should the margins be fixed or simulated? (Default \code{TRUE})} 35 | 36 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 37 | 38 | \item{base}{Base of the logarithm that is used in the calculation. 39 | Defaults to the natural logarithm.} 40 | } 41 | \value{ 42 | A data.table with two rows, corresponding to the expected values of 43 | segregation when true segregation is zero. 44 | } 45 | \description{ 46 | When sample sizes are small, one group has a small proportion, or 47 | when there are many units, segregation indices are typically upwardly 48 | biased, even when true segregation is zero. This function simulates 49 | tables with zero segregation, given the marginals of the dataset, 50 | and calculates segregation. If the expected values are large, 51 | the interpretation of index scores might have to be adjusted. 
52 | } 53 | \examples{ 54 | \dontrun{ 55 | # the schools00 dataset has a large sample size, so expected segregation is close to zero 56 | mutual_total_expected(schools00, "race", "school", weight = "n") 57 | 58 | # but we can build a smaller table, with 100 students distributed across 59 | # 10 schools, where one racial group has 10\% of the students 60 | small <- data.frame( 61 | school = c(1:10, 1:10), 62 | race = c(rep("r1", 10), rep("r2", 10)), 63 | n = c(rep(1, 10), rep(9, 10)) 64 | ) 65 | mutual_total_expected(small, "race", "school", weight = "n") 66 | # with an increase in sample size (n=1000), the values improve 67 | small$n <- small$n * 10 68 | mutual_total_expected(small, "race", "school", weight = "n") 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /R/exposure.R: -------------------------------------------------------------------------------- 1 | #' Calculates pairwise exposure indices 2 | #' 3 | #' Returns the pairwise exposure indices between groups 4 | #' 5 | #' @param data A data frame. 6 | #' @param group A categorical variable 7 | #' contained in \code{data}. Defines the first dimension 8 | #' over which segregation is computed. 9 | #' @param unit A vector of variables 10 | #' contained in \code{data}. Defines the second dimension 11 | #' over which segregation is computed. 12 | #' @param weight Numeric. (Default \code{NULL}) 13 | #' @return Returns a data.table with columns "of", "to", and 14 | #' "exposure". Read results as "exposure of group x to group y". 15 | #' @import data.table 16 | #' @export 17 | exposure <- function(data, group, unit, weight = NULL) { 18 | checkmate::assert_data_frame(data) 19 | checkmate::assert_character(group, len = 1) 20 | checkmate::assert_character(unit, min.len = 1) 21 | checkmate::assert_character(weight, null.ok = TRUE) 22 | checkmate::assert_names(names(data), must.include = c(group, unit, weight)) 23 | 24 | of <- prepare_data(data, group, unit, weight) 25 | to <- copy(of) 26 | of[, n_unit := sum(freq), by = unit] 27 | of[, n_group := sum(freq), by = group] 28 | setnames(of, "freq", "freq_of") 29 | setnames(of, group, "of") 30 | setnames(to, "freq", "freq_to") 31 | setnames(to, group, "to") 32 | d <- merge(of, to, by = unit, allow.cartesian = TRUE) 33 | exp <- d[, .(exposure = sum(freq_of * freq_to / (n_unit * n_group))), by = .(of, to)] 34 | exp[] 35 | } 36 | 37 | #' Calculates isolation indices 38 | #' 39 | #' Returns isolation index of each group 40 | #' 41 | #' @param data A data frame. 42 | #' @param group A categorical variable 43 | #' contained in \code{data}. Defines the first dimension 44 | #' over which segregation is computed. 45 | #' @param unit A vector of variables 46 | #' contained in \code{data}. Defines the second dimension 47 | #' over which segregation is computed. 48 | #' @param weight Numeric. (Default \code{NULL}) 49 | #' @return Returns a data.table with group column and isolation index. 
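#' @examples
#' # illustrative example: isolation of each racial group across schools,
#' # using the bundled schools00 dataset
#' isolation(schools00, "race", "school", weight = "n")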
50 | #' @import data.table 51 | #' @export 52 | isolation <- function(data, group, unit, weight = NULL) { 53 | checkmate::assert_data_frame(data) 54 | checkmate::assert_character(group, len = 1) 55 | checkmate::assert_character(unit, min.len = 1) 56 | checkmate::assert_character(weight, null.ok = TRUE) 57 | checkmate::assert_names(names(data), must.include = c(group, unit, weight)) 58 | 59 | d <- prepare_data(data, group, unit, weight) 60 | 61 | d[, n_unit := sum(freq), by = unit] 62 | d[, n_group := sum(freq), by = group] 63 | iso <- d[, .(isolation = sum(freq^2 / (n_unit * n_group))), by = group] 64 | iso[] 65 | } 66 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # segregation (development version) 2 | 3 | - breaking: rename `mutual_expected` to `mutual_total_expected` 4 | - add `mutual_local_expected` 5 | - ipf: decrease the default `precision` parameter, i.e. a stricter convergence criterion (longer runtime, but more precise results) 6 | - entropy: add 'within' argument 7 | 8 | # segregation 1.1.0 9 | 10 | - various improvements to compression algorithm 11 | - add dendrogram visualization 12 | - allow multiple curves in `segcurve` function 13 | - segplot: add optional 'secondary_plot' argument 14 | - segplot: remove 'title' argument 15 | - segplot: add optional 'hline' argument 16 | 17 | # segregation 1.0.0 18 | 19 | - add mutual_total_nested 20 | - add within argument to mutual_expected 21 | - add dissimilarity_expected 22 | - add suite of compression-related functions (some in C++) 23 | - add segplot function 24 | - add functions exposure and isolation 25 | - fix that roxygen2 problem 26 | 27 | # segregation 0.6.0 28 | 29 | - faster mutual_total(..., within) 30 | - updated docs 31 | - some minor bug fixes 32 | - some improved error messages 33 | 34 | # segregation 0.5.0 35 | 36 | - dissimilarity: support for the index of dissimilarity 37 | - add CI argument for confidence intervals 38 | - mutual_within: report ent_ratio instead of h_weight 39 | - matrix_to_long: convert contingency tables into long form 40 | - add introductory vignette 41 | 42 | # segregation 0.4.0 43 | 44 | - faster bootstrap 45 | - return bootstrap estimates as attr 46 | - add mutual_expected 47 | - apply bias-correction via bootstrap by default when se=TRUE 48 | 49 | # segregation 0.3.0 50 | 51 | - always return data.table 52 | - for ipf function, warn when groups/units are dropped 53 | - return sample size of source dataset for IPF 54 | - don't allow bootstrap when sample size is not an integer, but allow non-integer sample weights (which are unproblematic) 55 | - simplify precision parameter for ipf procedure 56 | - increase default bootstrap to 100 57 | - fix data.table issue (#3) 58 | 59 | # segregation 0.2.0 60 | 61 | - add "shapley" decomposition method, revisit other difference decomposition methods 62 | - better logging of bootstrap/IPF 63 | - several small fixes 64 | - add lintr package 65 | - add warning when attempting bootstrap with non-integer weights 66 | 67 | # segregation 0.1.0 68 | 69 | - switch group and unit definitions, to be consistent with the literature 70 | - add Theil's Information Index (H) 71 | - add entropy function 72 | - add mutual_within function to decompose weighted within indices 73 | - add "wide" option to mutual_local and mutual_within 74 | - add "ipf" (iterative proportional fitting) function and a difference decomposition based on IPF 75 | - "mrc_adjusted"
difference decomposition is defined only on overlap sample of units and groups 76 | - internal refactoring 77 | 78 | # segregation 0.0.1 79 | 80 | Initial release. 81 | -------------------------------------------------------------------------------- /data-raw/clean_data.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(haven) 3 | 4 | set.seed(179463) 5 | 6 | prepare <- function(d) { 7 | d %>% 8 | # only primary schools 9 | filter(level == 1) %>% 10 | na.omit() %>% 11 | filter(state %in% c("AL", "CO", "CT")) %>% 12 | mutate(state = recode(state, AL = "A", "CO" = "B", "CT" = "C")) %>% 13 | mutate(check = native + asian + hisp + black + white) %>% 14 | # only schools with >100 students 15 | filter(total == check, total > 100) %>% 16 | select(-total, -check, -level) %>% 17 | gather(key = race, value = n, -state, -district, -school) %>% 18 | filter(n > 0) %>% 19 | # sort 20 | arrange(state, district, school, race) 21 | } 22 | 23 | # NCES Common Core data for 2000-01 24 | rawd <- read_sas("sc001aai.zip") 25 | schools00 <- select(rawd, 26 | state = LSTATE00, district = LEAID, 27 | school = NCESSCH, level = LEVEL00, total = MEMBER00, 28 | native = AM00, asian = ASIAN00, hisp = HISP00, 29 | black = BLACK00, white = WHITE00 30 | ) %>% prepare 31 | schools00$n <- round(rnorm(nrow(schools00), schools00$n, sd=sqrt(schools00$n))) 32 | schools00 <- filter(schools00, n > 0) 33 | 34 | 35 | # NCES Common Core data for 2005-06 36 | rawd <- read_sas("sc051aai_sas.zip") 37 | schools05 <- select(rawd, 38 | state = LSTATE05, district = LEAID, 39 | school = NCESSCH, level = LEVEL05, total = MEMBER05, 40 | native = AM05, asian = ASIAN05, hisp = HISP05, 41 | black = BLACK05, white = WHITE05 42 | ) %>% prepare 43 | schools05$n <- round(rnorm(nrow(schools05), schools05$n, sd=sqrt(schools05$n))) 44 | schools05 <- filter(schools05, n > 0) 45 | 46 | # create new school and district identifiers 47 | district <- bind_rows( 48 | select(schools00, state, district), 49 | select(schools05, state, district)) %>% 50 | distinct() %>% 51 | group_by(state) %>% 52 | mutate(district_new=paste0(state, 1:n())) %>% 53 | select(district, district_new) 54 | schools00 <- left_join(schools00, district) 55 | schools05 <- left_join(schools05, district) 56 | 57 | school <- bind_rows(select(schools00, district_new, school), 58 | select(schools05, district_new, school)) %>% 59 | distinct() %>% 60 | group_by(district_new) %>% 61 | mutate(school_new=paste0(district_new, '_', 1:n())) %>% 62 | ungroup() %>% 63 | select(school, school_new) 64 | schools00 <- left_join(schools00, school) 65 | schools05 <- left_join(schools05, school) 66 | 67 | schools00 <- schools00 %>% 68 | select(state, district = district_new, school = school_new, 69 | race, n) %>% 70 | mutate_at(vars(state, district, school, race), as_factor) 71 | 72 | schools05 <- schools05 %>% 73 | select(state, district = district_new, school = school_new, 74 | race, n) %>% 75 | mutate_at(vars(state, district, school, race), as_factor) 76 | 77 | print(schools00) 78 | save(schools00, file = "../data/schools00.rda") 79 | print(schools05) 80 | save(schools05, file = "../data/schools05.rda") 81 | -------------------------------------------------------------------------------- /man/mutual_local.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_local} 4 | \alias{mutual_local} 5 | 
\title{Calculates local segregation scores based on M} 6 | \usage{ 7 | mutual_local( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | se = FALSE, 13 | CI = 0.95, 14 | n_bootstrap = 100, 15 | base = exp(1), 16 | wide = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{A data frame.} 21 | 22 | \item{group}{A categorical variable or a vector of variables 23 | contained in \code{data}. Defines the dimension 24 | over which segregation is computed.} 25 | 26 | \item{unit}{A categorical variable or a vector of variables 27 | contained in \code{data}. Defines the categories for which local 28 | segregation indices are calculated.} 29 | 30 | \item{weight}{Numeric. (Default \code{NULL})} 31 | 32 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 33 | standard errors and to apply bias correction. The bias that is reported 34 | has already been applied to the estimates (i.e. the reported estimates are "debiased"). 35 | (Default \code{FALSE})} 36 | 37 | \item{CI}{If \code{se = TRUE}, compute the (CI*100)% confidence interval 38 | in addition to the bootstrap standard error. 39 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 40 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 41 | 42 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 43 | 44 | \item{base}{Base of the logarithm that is used in the calculation. 45 | Defaults to the natural logarithm.} 46 | 47 | \item{wide}{Returns a wide dataframe instead of a long dataframe. 48 | (Default \code{FALSE})} 49 | } 50 | \value{ 51 | Returns a data.table with two rows for each category defined by \code{unit}, 52 | for a total of \code{2*(number of units)} rows. 53 | The column \code{est} contains two statistics that 54 | are provided for each unit: \code{ls}, the local segregation score, and 55 | \code{p}, the unit's proportion of the total number of cases. 56 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 57 | the associated bootstrapped standard errors, an additional column \code{CI} contains 58 | the confidence interval of the estimate as a list column, an additional column \code{bias} contains 59 | the estimated bias, and the column \code{est} contains the bias-corrected estimates. 60 | If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one 61 | row for each \code{unit}, and the associated statistics in separate columns. 62 | } 63 | \description{ 64 | Returns local segregation indices for each category defined 65 | by \code{unit}. 66 | } 67 | \examples{ 68 | # which schools are most segregated? 69 | (localseg <- mutual_local(schools00, "race", "school", 70 | weight = "n", wide = TRUE 71 | )) 72 | 73 | sum(localseg$p) # => 1 74 | 75 | # the sum of the weighted local segregation scores equals 76 | # total segregation 77 | sum(localseg$ls * localseg$p) # => .425 78 | mutual_total(schools00, "school", "race", weight = "n") # M => .425 79 | } 80 | \references{ 81 | Henri Theil. 1971. Principles of Econometrics. New York: Wiley. 82 | 83 | Ricardo Mora and Javier Ruiz-Castillo. 2011. 84 | "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
85 | } 86 | -------------------------------------------------------------------------------- /tests/testthat/test_plots.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("plots") 7 | 8 | skip_if(!requireNamespace("ggplot2", quietly = TRUE)) 9 | skip_if(!requireNamespace("patchwork", quietly = TRUE)) 10 | 11 | plot_majority <- segplot(schools00, "race", "school", weight = "n", order = "majority", hline = "white") 12 | plot_majority_fixed <- segplot(schools00, "race", "school", weight = "n", order = "majority_fixed") 13 | plot_seg <- segplot(schools00, "race", "school", weight = "n", order = "segregation") 14 | plot_entropy <- segplot(schools00, "race", "school", 15 | weight = "n", order = "entropy", 16 | bar_space = 0.1 17 | ) 18 | 19 | test_that("dimensions", { 20 | expect_equal(nrow(plot_majority$data), nrow(plot_seg$data)) 21 | expect_equal(nrow(plot_majority$data), nrow(plot_entropy$data)) 22 | expect_equal(nrow(plot_majority$data), nrow(plot_majority_fixed$data)) 23 | }) 24 | 25 | test_that("reference", { 26 | reference <- data.table::as.data.table(schools00) 27 | reference <- reference[, .(N = sum(n)), by = .(race)] 28 | reference[, p := N / sum(N)] 29 | expect_error(segplot(schools00, "race", "school", 30 | weight = "n", order = "segregation", 31 | reference_distribution = reference 32 | )) 33 | expect_error(segplot(schools00, "race", "school", 34 | weight = "n", order = "segregation", 35 | reference_distribution = reference[1:2, ] 36 | )) 37 | expect_error(segplot(schools00, "race", "school", 38 | weight = "n", order = "segregation", 39 | reference_distribution = 1 40 | )) 41 | 42 | reference[, N := NULL] 43 | sp <- segplot(schools00, "race", "school", 44 | weight = "n", order = "segregation" 45 | ) 46 | sp_ref <- segplot(schools00, "race", "school", 47 | weight = "n", order = "segregation", 48 | reference_distribution = reference 49 | ) 50 | # identical order 51 | expect_equal(sp$data[["unit"]], sp_ref$data[["unit"]]) 52 | }) 53 | 54 | test_that("secondary plot", { 55 | plot_2a <- segplot(schools00, "race", "school", 56 | weight = "n", 57 | order = "segregation", secondary_plot = "segregation" 58 | ) 59 | plot_2b <- segplot(schools00, "race", "school", 60 | weight = "n", 61 | order = "segregation", secondary_plot = "cumulative" 62 | ) 63 | 64 | expect_equal(plot_seg$data, plot_2a[[1]]$data) 65 | expect_equal(plot_seg$data, plot_2b[[1]]$data) 66 | expect_true("patchwork" %in% class(plot_2a)) 67 | expect_true("patchwork" %in% class(plot_2b)) 68 | }) 69 | 70 | test_that("segcurve", { 71 | expect_error(segcurve(schools00, "race", "school", weight = "n")) 72 | expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("a", "b"))) 73 | expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("state", "school"))) 74 | 75 | p1 <- segcurve(subset(schools00, race %in% c("white", "black")), 76 | "race", "school", 77 | weight = "n" 78 | ) 79 | p2 <- segcurve(subset(schools00, race %in% c("white", "asian")), 80 | "race", "school", 81 | weight = "n" 82 | ) 83 | p3 <- segcurve(subset(schools00, race %in% c("white", "asian")), 84 | "race", "school", 85 | weight = "n", 86 | segment = "state" 87 | ) 88 | expect_equal(p3$labels$colour, "state") 89 | }) 90 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_local.R: 
-------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_local") 7 | 8 | test_data <- data.frame( 9 | u = c(rep("a", 4), rep("b", 4)), 10 | g = rep(c(1, 2, 3, 4), 2), 11 | n = c(40, 20, 5, 1, 20, 40, 60, 80) 12 | ) 13 | 14 | local <- mutual_local(test_data, "u", "g", weight = "n") 15 | local2 <- mutual_local(test_data, "g", "u", weight = "n") 16 | localse <- mutual_local(test_data, "u", "g", weight = "n", se = TRUE, n_bootstrap = 10) 17 | localbase2 <- mutual_local(test_data, "g", "u", weight = "n", base = 2) 18 | 19 | test_that("local calculation works", { 20 | expect_equal(sum(local[stat == "p", est]), 1) 21 | expect_equal(sum(local2[local2$stat == "p", est]), 1) 22 | expect_equal(sum(localse[localse$stat == "p", est]), 1) 23 | 24 | expect_equal( 25 | sum(local[stat == "p", est] * local[stat == "ls", est]), 26 | sum(local2[local2$stat == "p", est] * local2[local2$stat == "ls", est]) 27 | ) 28 | expect_equal( 29 | sum(local[stat == "p", est] * local[stat == "ls", est]), 30 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est] 31 | ) 32 | expect_equal( 33 | sum(localbase2[stat == "p", est] * localbase2[stat == "ls", est]), 34 | mutual_total(test_data, "u", "g", weight = "n", base = 2)[stat == "M", est] 35 | ) 36 | }) 37 | 38 | test_that("return works", { 39 | expect_equal(nrow(local), 8) 40 | expect_equal(ncol(local), 3) 41 | }) 42 | 43 | test_that("bootstrapping works", { 44 | expect_equal(nrow(localse), 8) 45 | expect_equal(ncol(localse), 6) 46 | }) 47 | 48 | test_that("bootstrap attributes exists", { 49 | expect_equal(dim(attr(localse, "bootstrap")), c(10 * length(unique(test_data$g)) * 2, 3)) 50 | }) 51 | 52 | test_that("bootstrapping fails when sample size is non-integer", { 53 | test_data <- data.frame( 54 | u = c(rep("a", 4), rep("b", 4)), 55 | g = rep(c(1, 2, 3, 4), 2), 56 | n = c(40, 20, 5, 1, 20, 40, 60, 80.3) 57 | ) 58 | 59 | expect_error(mutual_local(test_data, "u", "g", weight = "n", se = TRUE)) 60 | # rescale 61 | test_data$n2 <- test_data$n / sum(test_data$n) * round(sum(test_data$n)) 62 | ret <- mutual_local(test_data, "u", "g", weight = "n2", se = TRUE) 63 | expect_equal(all(ret$se > 0), TRUE) 64 | }) 65 | 66 | test_that("option wide works", { 67 | nowide <- mutual_local(test_data, "u", "g", weight = "n") 68 | nowide_se <- mutual_local(test_data, "u", "g", 69 | weight = "n", 70 | se = TRUE, n_bootstrap = 10 71 | ) 72 | wide <- mutual_local(test_data, "u", "g", weight = "n", wide = TRUE) 73 | wide_se <- mutual_local(test_data, "u", "g", 74 | weight = "n", wide = TRUE, 75 | se = TRUE, n_bootstrap = 10 76 | ) 77 | 78 | expect_equal(ncol(nowide) + 3, ncol(nowide_se)) 79 | expect_equal(nrow(nowide), 8) 80 | expect_equal(nrow(nowide), nrow(nowide_se)) 81 | expect_equal(nrow(nowide), nrow(wide) * 2) 82 | expect_equal(ncol(wide) + 6, ncol(wide_se)) 83 | expect_equal(nrow(wide), nrow(wide_se)) 84 | 85 | expect_equal(nowide[stat == "ls", est], wide$ls) 86 | expect_equal(nowide[stat == "p", est], wide$p) 87 | 88 | total <- mutual_total(test_data, "u", "g", weight = "n") 89 | expect_equal( 90 | total[stat == "M", est], 91 | sum(nowide[stat == "ls", est] * nowide[stat == "p", "est"]) 92 | ) 93 | expect_equal(total[stat == "M", est], sum(wide$ls * wide$p)) 94 | expect_equal(total[stat == "H", est], sum(wide$ls * wide$p) / entropy(test_data, "u", weight = "n")) 95 | 96 | expect_equal(all(nowide_se[["se"]] > 0), TRUE) 97 | 
}) 98 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_local_expected.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_local_expected") 7 | 8 | data1 <- data.frame( 9 | u = rep(c(1, 2, 3, 4), 2), 10 | g = c(rep("a", 4), rep("b", 4)), 11 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 12 | stringsAsFactors = FALSE 13 | ) 14 | 15 | data2 <- data.frame( 16 | u = c(1:10, 1:10), 17 | g = c(rep("a", 10), rep("b", 10)), 18 | n = c(rep(1, 10), rep(9, 10)), 19 | stringsAsFactors = FALSE 20 | ) 21 | 22 | test_that("works both ways around", { 23 | data1a <- mutual_local_expected(data1, "u", "g", weight = "n") 24 | data1b <- mutual_local_expected(data1, "g", "u", weight = "n") 25 | expect_equal(nrow(data1a), 2) 26 | expect_equal(nrow(data1b), 4) 27 | expect_equal(data1a[, sum(est * p_mean)], data1b[, sum(est * p_mean)], tolerance = 0.01) 28 | 29 | data2a <- mutual_local_expected(data2, "u", "g", weight = "n") 30 | data2b <- mutual_local_expected(data2, "g", "u", weight = "n") 31 | expect_equal(nrow(data2a), 2) 32 | expect_equal(nrow(data2b), 10) 33 | expect_equal(data2a[, sum(est * p_mean)], data2b[, sum(est * p_mean)], tolerance = 0.01) 34 | }) 35 | 36 | test_that("works nested", { 37 | data1$superunit <- data1$u <= 3 38 | a <- mutual_local_expected(data1, "g", c("superunit", "u"), weight = "n") 39 | b <- mutual_local_expected(data1, "g", "u", weight = "n") 40 | 41 | expect_equal(a$est, b$est, tolerance = 0.01) 42 | expect_equal(a$se, b$se, tolerance = 0.01) 43 | expect_equal(length(a$est), 4) 44 | }) 45 | 46 | test_that("fixed margins = FALSE", { 47 | data1a <- mutual_local_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE) 48 | data1b <- mutual_local_expected(data1, "g", "u", weight = "n", fixed_margins = FALSE) 49 | expect_equal(nrow(data1a), 2) 50 | expect_equal(nrow(data1b), 4) 51 | expect_equal(data1a[, sum(est * p_mean)], data1b[, sum(est * p_mean)], tolerance = 0.01) 52 | 53 | data2a <- mutual_local_expected(data2, "u", "g", weight = "n", fixed_margins = FALSE) 54 | data2b <- mutual_local_expected(data2, "g", "u", weight = "n", fixed_margins = FALSE) 55 | expect_equal(nrow(data2a), 2) 56 | expect_equal(nrow(data2b), 10) 57 | expect_equal(data2a[, sum(est * p_mean)], data2b[, sum(est * p_mean)], tolerance = 0.01) 58 | }) 59 | 60 | test_that("aligns with mutual_total_expected", { 61 | expect_equal( 62 | mutual_local_expected(data1, "u", "g", weight = "n")[, sum(est * p_mean)], 63 | mutual_total_expected(data1, "u", "g", weight = "n")[stat == "M under 0", est], 64 | tolerance = 0.01 65 | ) 66 | 67 | expect_equal( 68 | mutual_local_expected(data2, "u", "g", weight = "n")[, sum(est * p_mean)], 69 | mutual_total_expected(data2, "u", "g", weight = "n")[stat == "M under 0", est], 70 | tolerance = 0.01 71 | ) 72 | 73 | expect_equal( 74 | mutual_local_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE)[, sum(est * p_mean)], 75 | mutual_total_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 76 | tolerance = 0.01 77 | ) 78 | 79 | expect_equal( 80 | mutual_local_expected(data2, "u", "g", weight = "n", fixed_margins = FALSE)[, sum(est * p_mean)], 81 | mutual_total_expected(data2, "u", "g", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 82 | tolerance = 0.01 83 | ) 84 | }) 85 | 86 | 87 | test_that("errors", { 88 | 
dat <- data.frame( 89 | u = rep(c(1, 2, 3, 4), 2), 90 | g = c(rep("a", 4), rep("b", 4)), 91 | n = c(40.2, 20, 5, 1, 20, 40, 60, 80), 92 | stringsAsFactors = FALSE 93 | ) 94 | 95 | expect_error( 96 | mutual_local_expected(dat, "g", "u", weight = "n"), 97 | "bootstrap with a total sample size" 98 | ) 99 | }) 100 | -------------------------------------------------------------------------------- /man/mutual_within.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_within} 4 | \alias{mutual_within} 5 | \title{Calculates detailed within-category segregation scores for M and H} 6 | \usage{ 7 | mutual_within( 8 | data, 9 | group, 10 | unit, 11 | within, 12 | weight = NULL, 13 | se = FALSE, 14 | CI = 0.95, 15 | n_bootstrap = 100, 16 | base = exp(1), 17 | wide = FALSE 18 | ) 19 | } 20 | \arguments{ 21 | \item{data}{A data frame.} 22 | 23 | \item{group}{A categorical variable or a vector of variables 24 | contained in \code{data}. Defines the first dimension 25 | over which segregation is computed.} 26 | 27 | \item{unit}{A categorical variable or a vector of variables 28 | contained in \code{data}. Defines the second dimension 29 | over which segregation is computed.} 30 | 31 | \item{within}{A categorical variable or a vector of variables 32 | contained in \code{data} that defines the within-segregation categories.} 33 | 34 | \item{weight}{Numeric. (Default \code{NULL})} 35 | 36 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 37 | standard errors and to apply bias correction. The bias that is reported 38 | has already been applied to the estimates (i.e. the reported estimates are "debiased"). 39 | (Default \code{FALSE})} 40 | 41 | \item{CI}{If \code{se = TRUE}, compute the (CI*100)% confidence interval 42 | in addition to the bootstrap standard error. 43 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 44 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 45 | 46 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 47 | 48 | \item{base}{Base of the logarithm that is used in the calculation. 49 | Defaults to the natural logarithm.} 50 | 51 | \item{wide}{Returns a wide dataframe instead of a long dataframe. 52 | (Default \code{FALSE})} 53 | } 54 | \value{ 55 | Returns a data.table with four rows for each category defined by \code{within}. 56 | The column \code{est} contains four statistics that 57 | are provided for each category: 58 | \code{M} is the within-category M, and \code{p} is the proportion of the category. 59 | Multiplying \code{M} and \code{p} gives the contribution of each within-category 60 | towards the total M. 61 | \code{H} is the within-category H, and \code{ent_ratio} provides the entropy ratio, 62 | defined as \code{EW/E}, where \code{EW} is the within-category entropy, 63 | and \code{E} is the overall entropy. 64 | Multiplying \code{H}, \code{p}, and \code{ent_ratio} gives the contribution of each within-category 65 | towards the total H. 66 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 67 | the associated bootstrapped standard errors, an additional column \code{CI} contains 68 | the confidence interval of the estimate as a list column, an additional column \code{bias} contains 69 | the estimated bias, and the column \code{est} contains the bias-corrected estimates.
70 | If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one 71 | row for each \code{within} category, and the associated statistics in separate columns. 72 | } 73 | \description{ 74 | Calculates the segregation between \code{group} and \code{unit} 75 | within each category defined by \code{within}. 76 | } 77 | \examples{ 78 | \dontrun{ 79 | (within <- mutual_within(schools00, "race", "school", 80 | within = "state", 81 | weight = "n", wide = TRUE 82 | )) 83 | # the M for state "A" is .409 84 | # manual calculation 85 | schools_A <- schools00[schools00$state == "A", ] 86 | mutual_total(schools_A, "race", "school", weight = "n") # M => .409 87 | 88 | # to recover the within M and H from the output, multiply 89 | # p * M and p * ent_ratio * H, respectively 90 | sum(within$p * within$M) # => .326 91 | sum(within$p * within$ent_ratio * within$H) # => .321 92 | # compare with: 93 | mutual_total(schools00, "race", "school", within = "state", weight = "n") 94 | } 95 | } 96 | \references{ 97 | Henri Theil. 1971. Principles of Econometrics. New York: Wiley. 98 | 99 | Ricardo Mora and Javier Ruiz-Castillo. 2011. 100 | "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194. 101 | } 102 | -------------------------------------------------------------------------------- /R/dissimilarity.R: -------------------------------------------------------------------------------- 1 | abs_diff <- function(x) { 2 | if (length(x) == 1) { 3 | abs(x) 4 | } else { 5 | abs(diff(x)) 6 | } 7 | } 8 | 9 | #' @import data.table 10 | dissimilarity_compute <- function(data, group, unit) { 11 | data[, n_group := sum(freq), by = group] 12 | est <- 1 / 2 * data[, abs_diff(freq / n_group), by = unit][, sum(V1)] 13 | data.table(stat = "D", est = est, stringsAsFactors = FALSE) 14 | } 15 | 16 | #' Calculates Index of Dissimilarity 17 | #' 18 | #' Returns the total segregation between \code{group} and \code{unit} using 19 | #' the Index of Dissimilarity. 20 | #' 21 | #' @param data A data frame. 22 | #' @param group A categorical variable or a vector of variables 23 | #' contained in \code{data}. Defines the first dimension 24 | #' over which segregation is computed. The D index only 25 | #' allows two distinct groups. 26 | #' @param unit A categorical variable or a vector of variables 27 | #' contained in \code{data}. Defines the second dimension 28 | #' over which segregation is computed. 29 | #' @param weight Numeric. (Default \code{NULL}) 30 | #' @param se If \code{TRUE}, the segregation estimates are bootstrapped to provide 31 | #' standard errors and to apply bias correction. The bias that is reported 32 | #' has already been applied to the estimates (i.e. the reported estimates are "debiased"). 33 | #' (Default \code{FALSE}) 34 | #' @param CI If \code{se = TRUE}, compute the (CI*100)% confidence interval 35 | #' in addition to the bootstrap standard error. 36 | #' This is based on percentiles of the bootstrap distribution, and a valid interpretation 37 | #' relies on a larger number of bootstrap iterations. (Default \code{0.95}) 38 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 39 | #' @return Returns a data.table with one row. The column \code{est} contains 40 | #' the Index of Dissimilarity.
41 | #' If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 42 | #' the associated bootstrapped standard errors, an additional column \code{CI} contains 43 | #' the confidence interval of the estimate as a list column, an additional column \code{bias} contains 44 | #' the estimated bias, and the column \code{est} contains the bias-corrected estimates. 45 | #' @references 46 | #' Otis Dudley Duncan and Beverly Duncan. 1955. "A Methodological Analysis of Segregation Indexes," 47 | #' American Sociological Review 20(2): 210-217. 48 | #' @examples 49 | #' # Example where D and H deviate 50 | #' m1 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 51 | #' m2 <- matrix_to_long(matrix(c(80, 80, 20, 20, 20, 20, 80, 80), ncol = 2)) 52 | #' dissimilarity(m1, "group", "unit", weight = "n") 53 | #' dissimilarity(m2, "group", "unit", weight = "n") 54 | #' @import data.table 55 | #' @export 56 | dissimilarity <- function(data, group, unit, weight = NULL, 57 | se = FALSE, CI = 0.95, n_bootstrap = 100) { 58 | stopifnot(CI > 0 & CI < 1) 59 | if (length(unique(data[[group]])) != 2) { 60 | stop("The D index only allows two distinct groups") 61 | } 62 | 63 | d <- prepare_data(data, group, unit, weight) 64 | ret <- dissimilarity_compute(d, group, unit) 65 | 66 | if (se == TRUE) { 67 | vars <- attr(d, "vars") 68 | n_total <- sum(d[["freq"]]) 69 | 70 | if (isTRUE(all.equal(n_total, round(n_total)))) { 71 | message(paste0(n_bootstrap, " bootstrap iterations on ", n_total, " observations")) 72 | } else { 73 | stop(paste0( 74 | "bootstrap with a total sample size that is not an integer is not allowed, ", 75 | "maybe scale your weights?" 76 | )) 77 | } 78 | # draw from a multinomial with weights specified by the cell counts 79 | draws <- stats::rmultinom(n_bootstrap, n_total, d[["freq"]] / n_total) 80 | 81 | boot_ret <- lapply(seq_len(n_bootstrap), function(i) { 82 | if (i %% 5 == 0) update_log(bs_n = i, bs_max = n_bootstrap) 83 | d[, freq := as.double(draws[, i])] 84 | dissimilarity_compute(d[freq > 0], group, unit) 85 | }) 86 | close_log() 87 | boot_ret <- rbindlist(boot_ret) 88 | ret <- bootstrap_summary(ret, boot_ret, "stat", CI) 89 | setattr(ret, "bootstrap", boot_ret) 90 | } 91 | ret 92 | } 93 | -------------------------------------------------------------------------------- /man/mutual_total.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_total} 4 | \alias{mutual_total} 5 | \title{Calculates the Mutual Information Index M and Theil's Entropy Index H} 6 | \usage{ 7 | mutual_total( 8 | data, 9 | group, 10 | unit, 11 | within = NULL, 12 | weight = NULL, 13 | se = FALSE, 14 | CI = 0.95, 15 | n_bootstrap = 100, 16 | base = exp(1) 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{A data frame.} 21 | 22 | \item{group}{A categorical variable or a vector of variables 23 | contained in \code{data}. Defines the first dimension 24 | over which segregation is computed.} 25 | 26 | \item{unit}{A categorical variable or a vector of variables 27 | contained in \code{data}. Defines the second dimension 28 | over which segregation is computed.} 29 | 30 | \item{within}{A categorical variable or a vector of variables 31 | contained in \code{data}. The variable(s) should be a superset of either 32 | the \code{unit} or the \code{group} for the calculation to be meaningful.
33 | If provided, segregation is 34 | computed within the groups defined by the variable, and then averaged. 35 | (Default \code{NULL})} 36 | 37 | \item{weight}{Numeric. (Default \code{NULL})} 38 | 39 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 40 | standard errors and to apply bias correction. The bias that is reported 41 | has already been applied to the estimates (i.e. the reported estimates are "debiased"). 42 | (Default \code{FALSE})} 43 | 44 | \item{CI}{If \code{se = TRUE}, compute the (CI*100)% confidence interval 45 | in addition to the bootstrap standard error. 46 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 47 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 48 | 49 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 50 | 51 | \item{base}{Base of the logarithm that is used in the calculation. 52 | Defaults to the natural logarithm.} 53 | } 54 | \value{ 55 | Returns a data.table with two rows. The column \code{est} contains 56 | the Mutual Information Index, M, and Theil's Entropy Index, H. The H is 57 | the M divided by the \code{group} entropy. If \code{within} was given, 58 | M and H are weighted averages of the within-category segregation scores. 59 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 60 | the associated bootstrapped standard errors, an additional column \code{CI} contains 61 | the confidence interval of the estimate as a list column, an additional column \code{bias} contains 62 | the estimated bias, and the column \code{est} contains the bias-corrected estimates. 63 | } 64 | \description{ 65 | Returns the total segregation between \code{group} and \code{unit}. 66 | If \code{within} is given, calculates segregation within each 67 | \code{within} category separately, and takes the weighted average. 68 | Also see \code{\link{mutual_within}} for detailed within calculations. 69 | } 70 | \examples{ 71 | # calculate school racial segregation 72 | mutual_total(schools00, "school", "race", weight = "n") # M => .425 73 | 74 | # note that the definition of groups and units is arbitrary 75 | mutual_total(schools00, "race", "school", weight = "n") # M => .425 76 | 77 | # if groups or units are defined by a combination of variables, 78 | # vectors of variable names can be provided - 79 | # here there is no difference, because schools 80 | # are nested within districts 81 | mutual_total(schools00, "race", c("district", "school"), 82 | weight = "n" 83 | ) # M => .424 84 | 85 | # estimate standard errors and 95\% CI for M and H 86 | \dontrun{ 87 | mutual_total(schools00, "race", "school", 88 | weight = "n", 89 | se = TRUE, n_bootstrap = 1000 90 | ) 91 | 92 | # estimate segregation within school districts 93 | mutual_total(schools00, "race", "school", 94 | within = "district", weight = "n" 95 | ) # M => .087 96 | 97 | # estimate between-district racial segregation 98 | mutual_total(schools00, "race", "district", weight = "n") # M => .338 99 | # note that the sum of within-district and between-district 100 | # segregation equals total school-race segregation; 101 | # here, most segregation is between school districts 102 | } 103 | } 104 | \references{ 105 | Henri Theil. 1971. Principles of Econometrics. New York: Wiley. 106 | 107 | Ricardo Mora and Javier Ruiz-Castillo. 2011. 108 | "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
109 | } 110 | -------------------------------------------------------------------------------- /man/ipf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ipf.R 3 | \name{ipf} 4 | \alias{ipf} 5 | \title{Adjustment of marginal distributions using iterative proportional fitting} 6 | \usage{ 7 | ipf( 8 | source, 9 | target, 10 | group, 11 | unit, 12 | weight = NULL, 13 | max_iterations = 100, 14 | precision = 1e-07 15 | ) 16 | } 17 | \arguments{ 18 | \item{source}{A "source" data frame. The marginals of this 19 | dataset are adjusted to the marginals of \code{target}.} 20 | 21 | \item{target}{A "target" data frame. The function returns a dataset 22 | where the marginal distributions of \code{group} and \code{unit} categories 23 | are approximated by those of \code{target}.} 24 | 25 | \item{group}{A categorical variable or a vector of variables 26 | contained in \code{source} and \code{target}. Defines the first distribution 27 | for adjustment.} 28 | 29 | \item{unit}{A categorical variable or a vector of variables 30 | contained in \code{source} and \code{target}. Defines the second distribution 31 | for adjustment.} 32 | 33 | \item{weight}{Numeric. (Default \code{NULL})} 34 | 35 | \item{max_iterations}{Maximum number of iterations used for the IPF algorithm.} 36 | 37 | \item{precision}{Convergence criterion for the IPF algorithm. In every iteration, 38 | the ratios of the source and target marginals are calculated for every category of 39 | \code{group} and \code{unit}. The algorithm converges when all ratios are smaller 40 | than \code{1 + precision}.} 41 | } 42 | \value{ 43 | Returns a data frame that retains 44 | the association structure of \code{source} while approximating 45 | the marginal distributions for \code{group} and \code{unit} of \code{target}. 46 | The dataset identifies each combination of \code{group} and \code{unit}, 47 | and categories that only occur in either \code{source} or \code{target} are dropped. 48 | The adjusted frequency of each combination is given by the column \code{n}, 49 | while \code{n_target} and \code{n_source} contain the zero-adjusted frequencies 50 | in the target and source dataset, respectively. 51 | } 52 | \description{ 53 | Adjusts the marginal distributions for \code{group} and \code{unit} 54 | in \code{source} to the respective marginal distributions in \code{target}, using the iterative 55 | proportional fitting algorithm (IPF). 56 | } 57 | \details{ 58 | The algorithm works by scaling 59 | the marginal distribution of \code{group} in the \code{source} data frame towards the 60 | marginal distribution of \code{target}; then repeating this process for \code{unit}. The 61 | algorithm then keeps alternating between \code{group} and \code{unit} until the marginals 62 | of the adjusted data frame are within the allowed precision. This results in a dataset that 63 | retains the association structure of \code{source} while approximating 64 | the marginal distribution of \code{target}. If the number of \code{unit} and 65 | \code{group} categories is different in \code{source} and \code{target}, the returned data frame 66 | contains only the combinations of \code{unit} and \code{group} categories that occur in both datasets. 67 | Zero values are replaced by a small, non-zero number (1e-4). 68 | Note that the values returned sum to the observations of the source data frame, not the 69 | target data frame.
This is different from other IPF implementations, but ensures that the IPF 70 | does not change the number of observations. 71 | } 72 | \examples{ 73 | \dontrun{ 74 | # adjusts the marginals of group and unit categories so that 75 | # schools00 has similar marginals as schools05 76 | adj <- ipf(schools00, schools05, "race", "school", weight = "n") 77 | 78 | # check that the new "race" marginals are similar to the target marginals 79 | # (the same could be done for schools) 80 | aggregate(adj$n, list(adj$race), sum) 81 | aggregate(adj$n_target, list(adj$race), sum) 82 | 83 | # note that the adjusted dataset contains fewer 84 | # schools than either the source or the target dataset, 85 | # because the marginals are only defined for the overlap 86 | # of schools 87 | length(unique(schools00$school)) 88 | length(unique(schools05$school)) 89 | length(unique(adj$school)) 90 | } 91 | } 92 | \references{ 93 | W. E. Deming and F. F. Stephan. 1940. 94 | "On a Least Squares Adjustment of a Sampled Frequency Table 95 | When the Expected Marginal Totals are Known". 96 | Annals of Mathematical Statistics. 11 (4): 427–444. 97 | 98 | T. Karmel and M. Maclachlan. 1988. 99 | "Occupational Sex Segregation — Increasing or Decreasing?" Economic Record 64: 187-195. 100 | } 101 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_total_expected.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_total_expected") 7 | 8 | data1 <- data.frame( 9 | u = rep(c(1, 2, 3, 4), 2), 10 | g = c(rep("a", 4), rep("b", 4)), 11 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 12 | stringsAsFactors = FALSE 13 | ) 14 | 15 | data2 <- data.frame( 16 | u = c(1:10, 1:10), 17 | g = c(rep("a", 10), rep("b", 10)), 18 | n = c(rep(1, 10), rep(9, 10)), 19 | stringsAsFactors = FALSE 20 | ) 21 | 22 | test_that("works both ways around", { 23 | expect_equal( 24 | mutual_total_expected(data1, "u", "g", weight = "n")[stat == "M under 0", est], 25 | mutual_total_expected(data1, "g", "u", weight = "n")[stat == "M under 0", est], 26 | tolerance = 0.01 27 | ) 28 | 29 | expect_equal( 30 | mutual_total_expected(data2, "u", "g", weight = "n")[stat == "M under 0", est], 31 | mutual_total_expected(data2, "g", "u", weight = "n")[stat == "M under 0", est], 32 | tolerance = 0.05 33 | ) 34 | }) 35 | 36 | test_that("works nested", { 37 | data1$superunit <- data1$u <= 3 38 | a <- mutual_total_expected(data1, "g", c("superunit", "u"), weight = "n") 39 | b <- mutual_total_expected(data1, "g", "u", weight = "n") 40 | 41 | expect_equal(a$est, b$est, tolerance = 0.01) 42 | expect_equal(a$se, b$se, tolerance = 0.01) 43 | expect_equal(length(a$est), 2) 44 | }) 45 | 46 | test_that("fixed margins = FALSE", { 47 | expect_equal( 48 | mutual_total_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 49 | mutual_total_expected(data1, "g", "u", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 50 | tolerance = 0.01 51 | ) 52 | }) 53 | 54 | test_that("within argument", { 55 | within <- mutual_total_expected(school_ses, "ethnic_group", "school_id", within = "ses_quintile") 56 | # manually 57 | d <- data.table::as.data.table(school_ses) 58 | manually <- d[, mutual_total_expected(.SD, "ethnic_group", "school_id"), by = .(ses_quintile)] 59 | p <- d[, .(p = .N), by = .(ses_quintile)][, .(ses_quintile, p = p / sum(p))] 60 | 
manually <- merge(manually, p) 61 | manually <- manually[, .(est_manual = sum(est * p)), by = .(stat)] 62 | compare <- merge(within, manually) 63 | expect_equal(compare$est, compare$est_manual, tolerance = 0.01) 64 | }) 65 | 66 | 67 | test_that("dissimilarity", { 68 | expect_error(dissimilarity_expected(data1, "u", "g", weight = "n")) 69 | expect_equal(dissimilarity_expected(data1, "g", "u", n_bootstrap = 500, weight = "n")$est, 70 | 0.098, 71 | tolerance = 0.1 72 | ) 73 | expect_equal( 74 | dissimilarity_expected(data1, "g", "u", 75 | n_bootstrap = 500, weight = "n", 76 | fixed_margins = FALSE 77 | )$est, 78 | 0.1003, 79 | tolerance = 0.1 80 | ) 81 | }) 82 | 83 | test_that("dissimilarity - Winship 1977", { 84 | # see table 2 85 | mat <- matrix(c(rep(1, 1000), rep(9, 1000)), ncol = 2) 86 | d <- matrix_to_long(mat) 87 | expect_equal( 88 | dissimilarity_expected(d, "group", "unit", weight = "n")$est, 89 | 0.387, 90 | tolerance = 0.1 91 | ) 92 | 93 | mat <- matrix(c(rep(5, 1000), rep(5, 1000)), ncol = 2) 94 | d <- matrix_to_long(mat) 95 | expect_equal( 96 | dissimilarity_expected(d, "group", "unit", weight = "n")$est, 97 | 0.246, 98 | tolerance = 0.1 99 | ) 100 | 101 | mat <- matrix(c(rep(10, 1000), rep(90, 1000)), ncol = 2) 102 | d <- matrix_to_long(mat) 103 | expect_equal( 104 | dissimilarity_expected(d, "group", "unit", weight = "n")$est, 105 | 0.131, 106 | tolerance = 0.1 107 | ) 108 | }) 109 | 110 | test_that("errors", { 111 | dat <- data.frame( 112 | u = rep(c(1, 2, 3, 4), 2), 113 | g = c(rep("a", 4), rep("b", 4)), 114 | n = c(40.2, 20, 5, 1, 20, 40, 60, 80), 115 | stringsAsFactors = FALSE 116 | ) 117 | 118 | expect_error( 119 | mutual_total_expected(dat, "g", "u", weight = "n"), 120 | "bootstrap with a total sample size" 121 | ) 122 | expect_error( 123 | dissimilarity_expected(dat, "g", "u", weight = "n"), 124 | "bootstrap with a total sample size" 125 | ) 126 | 127 | dat_within <- data.frame( 128 | u = c(rep("a", 4), rep("b", 4)), 129 | g = rep(c(1, 2), 4), 130 | supergroup = rep(c(12, 12, 34, 34), 2), 131 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 132 | stringsAsFactors = FALSE 133 | ) 134 | 135 | expect_message(mutual_total_expected(schools00, "race", "school", 136 | within = "district", weight = "n", 137 | n_bootstrap = 10 138 | ), "singleton items") 139 | }) 140 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_within.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_within") 7 | 8 | test_data <- data.frame( 9 | u = c(rep("a", 4), rep("b", 4)), 10 | g = rep(c(1, 2, 3, 4), 2), 11 | supergroup = rep(c(12, 12, 34, 34), 2), 12 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 13 | stringsAsFactors = FALSE 14 | ) 15 | 16 | test_that("dimensions and bootstrapping", { 17 | within <- mutual_within(test_data, "u", "g", 18 | within = "supergroup", weight = "n" 19 | ) 20 | expect_equal(dim(within), c(2 * 4, 3)) 21 | 22 | within_se <- mutual_within(test_data, "u", "g", 23 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 24 | ) 25 | expect_equal(dim(within_se), c(2 * 4, 6)) 26 | }) 27 | 28 | test_that("bootstrap attributes exists", { 29 | within_se <- mutual_within(test_data, "u", "g", 30 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 31 | ) 32 | 33 | expect_equal(dim(attr(within_se, "bootstrap")), c(10 * length(unique(test_data$supergroup)) * 4, 3)) 
34 | }) 35 | 36 | test_that("bootstrapping fails when sample size is non-integer", { 37 | test_data <- data.frame( 38 | u = c(rep("a", 4), rep("b", 4)), 39 | g = rep(c(1, 2, 3, 4), 2), 40 | supergroup = rep(c(12, 12, 34, 34), 2), 41 | n = c(40.7, 20, 5, 1, 20.5, 40, 60, 80), 42 | stringsAsFactors = FALSE 43 | ) 44 | 45 | expect_error(mutual_within(test_data, "u", "g", 46 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 47 | )) 48 | # rescale 49 | test_data$n2 <- test_data$n / sum(test_data$n) * round(sum(test_data$n)) 50 | ret <- mutual_within(test_data, "u", "g", 51 | within = "supergroup", weight = "n2", se = TRUE, n_bootstrap = 10 52 | ) 53 | expect_equal(dim(ret), c(2 * 4, 6)) 54 | }) 55 | 56 | test_that("between + within = total", { 57 | total <- mutual_total(test_data, "u", "g", within = "supergroup", weight = "n") 58 | m <- total[stat == "M", est] 59 | h <- total[stat == "H", est] 60 | 61 | within <- mutual_within(test_data, "u", "g", within = "supergroup", weight = "n") 62 | within <- unstack(within, form = est ~ stat) 63 | 64 | expect_equal(m, sum(within$p * within$M)) 65 | expect_equal(h, sum(within$p * within$ent_ratio * within$H)) 66 | 67 | within_wide <- mutual_within(test_data, "u", "g", 68 | within = "supergroup", 69 | weight = "n", wide = T 70 | ) 71 | expect_equal(sum(within$p * within$M), sum(within_wide$p * within_wide$M)) 72 | expect_equal( 73 | sum(within$p * within$ent_ratio * within$H), 74 | sum(within_wide$p * within_wide$ent_ratio * within_wide$H) 75 | ) 76 | 77 | # H is between 0 and 1 78 | expect_equal(all(within$H >= 0 & within$H <= 1), TRUE) 79 | expect_equal(all(h >= 0 & h <= 1), TRUE) 80 | }) 81 | 82 | test_that("option wide works", { 83 | nowide <- mutual_within(test_data, "u", "g", 84 | within = "supergroup", weight = "n" 85 | ) 86 | nowide_se <- mutual_within(test_data, "u", "g", 87 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 88 | ) 89 | wide <- mutual_within(test_data, "u", "g", 90 | within = "supergroup", weight = "n", wide = TRUE 91 | ) 92 | wide_se <- mutual_within(test_data, "u", "g", 93 | within = "supergroup", weight = "n", wide = TRUE, se = TRUE, n_bootstrap = 10 94 | ) 95 | 96 | expect_equal(ncol(nowide) + 3, ncol(nowide_se)) 97 | expect_equal(nrow(nowide), 2 * 4) 98 | expect_equal(nrow(nowide), nrow(nowide_se)) 99 | 100 | expect_equal(ncol(wide) + 3 * 4, ncol(wide_se)) 101 | expect_equal(nrow(wide), 2) 102 | expect_equal(nrow(wide), nrow(wide_se)) 103 | 104 | expect_equal(nowide[stat == "M", est], wide$M) 105 | expect_equal(nowide[stat == "p", est], wide$p) 106 | expect_equal(nowide[stat == "H", est], wide$H) 107 | expect_equal(nowide[stat == "ent_ratio", est], wide$ent_ratio) 108 | 109 | total <- mutual_total(test_data, "u", "g", within = "supergroup", weight = "n") 110 | expect_equal( 111 | total[stat == "M", est], 112 | sum(nowide[stat == "M", est] * nowide[stat == "p", est]) 113 | ) 114 | expect_equal(total[stat == "M", est], sum(wide$M * wide$p)) 115 | expect_equal( 116 | total[stat == "M", est], 117 | sum(wide$H * wide$p * wide$ent_ratio * entropy(test_data, "u", weight = "n")) 118 | ) 119 | 120 | expect_equal( 121 | total[stat == "H", est], 122 | sum(nowide[stat == "H", est] * 123 | nowide[stat == "p", est] * 124 | nowide[stat == "ent_ratio", est]) 125 | ) 126 | expect_equal(total[stat == "H", est], sum(wide$H * wide$p * wide$ent_ratio)) 127 | 128 | expect_equal(all(nowide_se[["se"]] > 0), TRUE) 129 | }) 130 | -------------------------------------------------------------------------------- 
/tests/testthat/test_ipf.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_ipf") 7 | 8 | test_that("different precisions", { 9 | # reduce to overlap sample 10 | schools00_r <- schools00[schools00$school %in% schools05$school, ] 11 | schools05_r <- schools05[schools05$school %in% schools00$school, ] 12 | 13 | for (precision in c(.1, .01, .001)) { 14 | adj <- ipf(schools00_r, schools05_r, "race", "school", weight = "n", precision = precision) 15 | 16 | # check that the new "race" marginals are similar to the target marginals 17 | new <- aggregate(adj$n, list(adj$race), sum) 18 | old <- aggregate(schools05_r$n, list(schools05_r$race), sum) 19 | old <- old[match(new$Group.1, old$Group.1), ] 20 | new <- new$x / sum(new$x) 21 | old <- old$x / sum(old$x) 22 | expect_true(all(abs(new - old) < precision)) 23 | 24 | # check that the new "school" marginals are similar to the target marginals 25 | new <- aggregate(adj$n, list(adj$school), sum) 26 | old <- aggregate(schools05_r$n, list(schools05_r$school), sum) 27 | old <- old[match(new$Group.1, old$Group.1), ] 28 | new <- new$x / sum(new$x) 29 | old <- old$x / sum(old$x) 30 | expect_true(all(abs(new - old) < precision)) 31 | } 32 | }) 33 | 34 | test_that("warn if iterations are too low", { 35 | expect_error( 36 | suppressWarnings( 37 | ipf(schools00, schools05, "race", "school", 38 | weight = "n", 39 | precision = .00001, max_iterations = 1 40 | ) 41 | ) 42 | ) 43 | }) 44 | 45 | test_that("gives sames results as mutual_difference", { 46 | diff <- mutual_difference(schools00, schools05, 47 | group = "race", unit = "school", 48 | weight = "n", method = "km", precision = 0.000001 49 | ) 50 | # what changed from 2000 to 2005? 
51 | # first reduce to overlap sample 52 | schools00_r <- schools00[schools00$school %in% schools05$school, ] 53 | schools05_r <- schools05[schools05$school %in% schools00$school, ] 54 | M_00 <- mutual_total(schools00_r, "race", "school", weight = "n")[stat == "M", est] 55 | M_05 <- mutual_total(schools05_r, "race", "school", weight = "n")[stat == "M", est] 56 | # adjust the 2000 margins to the 2005 margins 57 | # if only the margins changed, then this would explain all the difference 58 | adj_00 <- ipf(schools00_r, schools05_r, "race", "school", weight = "n", precision = 0.00001) 59 | M_margins <- mutual_total(adj_00, "race", "school", weight = "n")[stat == "M", est] 60 | structural_change <- M_05 - M_margins 61 | # test 62 | expect_equal(diff[stat == "M1", est] + diff[stat == "removals", est], M_00) 63 | expect_equal(diff[stat == "M2", est] - diff[stat == "additions", est], M_05) 64 | expect_equal(structural_change, diff[stat == "structural", est], tolerance = .001) 65 | expect_equal( 66 | M_05 - M_00 - structural_change, 67 | diff[stat %in% c("unit_marginal", "group_marginal", "interaction"), sum(est)], 68 | tolerance = .001 69 | ) 70 | }) 71 | 72 | test_that("example from Karmel & Maclachlan 1988", { 73 | source <- data.frame( 74 | occ = rep(c(1, 2, 3), 2), 75 | gender = c(rep("male", 3), rep("female", 3)), 76 | n = c(100, 50, 100, 50, 50, 50) 77 | ) 78 | target <- data.frame( 79 | occ = rep(c(1, 2, 3), 2), 80 | gender = c(rep("male", 3), rep("female", 3)), 81 | n = c(125, 100, 100, 100, 100, 75) 82 | ) 83 | adj <- ipf(source, target, "occ", "gender", "n", precision = 0.0000000001) 84 | # K-M report on the scale of the target distribution 85 | adj$n <- round(adj$n / sum(adj$n) * sum(adj$n_target), 1) 86 | expect_equal(adj[adj$gender == "male" & adj$occ == 1, "n"][[1]], 134.7) 87 | expect_equal(adj[adj$gender == "male" & adj$occ == 2, "n"][[1]], 85.5) 88 | expect_equal(adj[adj$gender == "male" & adj$occ == 3, "n"][[1]], 104.8) 89 | expect_equal(adj[adj$gender == "female" & adj$occ == 1, "n"][[1]], 90.3) 90 | expect_equal(adj[adj$gender == "female" & adj$occ == 2, "n"][[1]], 114.5) 91 | expect_equal(adj[adj$gender == "female" & adj$occ == 3, "n"][[1]], 70.2) 92 | 93 | expect_equal( 94 | sum(adj[adj$gender == "male", "n"]), 95 | sum(adj[adj$gender == "male", "n_target"]) 96 | ) 97 | expect_equal( 98 | sum(adj[adj$gender == "female", "n"]), 99 | sum(adj[adj$gender == "female", "n_target"]) 100 | ) 101 | expect_equal(sum(adj[adj$occ == 1, "n"]), sum(adj[adj$occ == 1, "n_target"])) 102 | expect_equal(sum(adj[adj$occ == 2, "n"]), sum(adj[adj$occ == 2, "n_target"])) 103 | expect_equal(sum(adj[adj$occ == 3, "n"]), sum(adj[adj$occ == 3, "n_target"])) 104 | }) 105 | 106 | test_that("warning about units and groups being dropped", { 107 | expect_warning(ipfd <- ipf(schools00, schools05, "race", "school", weight = "n")) 108 | expect_equal(sum(ipfd$n), sum(ipfd$n_source)) 109 | }) 110 | 111 | test_that("returns same number of observations as before", { 112 | # schools are dropped here 113 | suppressWarnings(ipfd <- ipf(schools00, schools05, "race", "school", weight = "n")) 114 | expect_equal(sum(ipfd$n), sum(ipfd$n_source)) 115 | 116 | # reduce to overlap sample, because then by definition the counts are identical 117 | schools00_r <- schools00[schools00$school %in% schools05$school, ] 118 | schools05_r <- schools05[schools05$school %in% schools00$school, ] 119 | 120 | ipfd <- ipf(schools00_r, schools05_r, "race", "school", weight = "n") 121 | expect_equal(sum(ipfd$n), sum(ipfd$n_source)) 122 | 
expect_equal(sum(schools00_r$n), sum(ipfd$n)) 123 | expect_equal(sum(schools00_r$n), sum(ipfd$n_source)) 124 | expect_equal(sum(schools05_r$n), sum(ipfd$n_target)) 125 | }) 126 | -------------------------------------------------------------------------------- /vignettes/faq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "FAQ" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{FAQ} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | 16 | data.table::setDTthreads(1) 17 | 18 | # skip this vignette on CRAN etc. 19 | BUILD_VIGNETTE <- identical(Sys.getenv("BUILD_VIGNETTE"), "true") 20 | knitr::opts_chunk$set(eval = BUILD_VIGNETTE) 21 | 22 | library("dplyr") 23 | library("data.table") 24 | library("tigris") 25 | library("segregation") 26 | options(tigris_use_cache = TRUE) 27 | schools00 <- schools00 28 | ``` 29 | 30 | ## Can index X be added to the package? 31 | 32 | Adding new segregation indices is usually not much trouble. Please 33 | [open an issue](https://github.com/elbersb/segregation/issues) on GitHub 34 | to request that an index be added. 35 | 36 | ## How can I compute indices for different areas at once? 37 | 38 | If you use the `dplyr` package, one pattern that works well is 39 | to use `group_modify`. Here, we compute the pairwise Black-White 40 | dissimilarity index for each state separately: 41 | 42 | ```{r} 43 | library("segregation") 44 | library("dplyr") 45 | 46 | schools00 %>% 47 | filter(race %in% c("black", "white")) %>% 48 | group_by(state) %>% 49 | group_modify(~ dissimilarity( 50 | data = .x, 51 | group = "race", 52 | unit = "school", 53 | weight = "n" 54 | )) 55 | ``` 56 | 57 | A similar pattern also works well with `data.table`: 58 | 59 | ```{r} 60 | library("data.table") 61 | 62 | schools00 <- as.data.table(schools00) 63 | schools00[ 64 | race %in% c("black", "white"), 65 | dissimilarity(data = .SD, group = "race", unit = "school", weight = "n"), 66 | by = .(state) 67 | ] 68 | ``` 69 | 70 | To compute many decompositions at once, it's easiest 71 | to combine the data for the two time points. For instance, 72 | here's a `dplyr` solution to decompose the state-specific 73 | M indices between 2000 and 2005: 74 | 75 | ```{r} 76 | # helper function for decomposition 77 | diff <- function(df, group) { 78 | data1 <- filter(df, year == 2000) 79 | data2 <- filter(df, year == 2005) 80 | mutual_difference(data1, data2, group = "race", unit = "school", weight = "n") 81 | } 82 | 83 | # add year indicators 84 | schools00$year <- 2000 85 | schools05$year <- 2005 86 | combine <- bind_rows(schools00, schools05) 87 | 88 | combine %>% 89 | group_by(state) %>% 90 | group_modify(diff) %>% 91 | head(5) 92 | ``` 93 | 94 | Again, here's a `data.table` solution: 95 | 96 | ```{r} 97 | setDT(combine) 98 | combine[, diff(.SD), by = .(state)] %>% head(5) 99 | ``` 100 | 101 | ## How can I use Census data from `tidycensus` to compute segregation indices? 102 | 103 | Here are a few examples thanks to [Kyle Walker](https://twitter.com/kyle_e_walker/status/1392188844724809728), the author of the [tidycensus](https://walker-data.com/tidycensus/articles/basic-usage.html) package. 
104 | 105 | First, download the data: 106 | 107 | ```{r} 108 | library("tidycensus") 109 | 110 | cook_data <- get_acs( 111 | geography = "tract", 112 | variables = c( 113 | white = "B03002_003", 114 | black = "B03002_004", 115 | asian = "B03002_006", 116 | hispanic = "B03002_012" 117 | ), 118 | state = "IL", 119 | county = "Cook" 120 | ) 121 | ``` 122 | 123 | Because this data is in "long" format, it's easy to compute segregation indices: 124 | 125 | ```{r} 126 | # compute index of dissimilarity 127 | cook_data %>% 128 | filter(variable %in% c("black", "white")) %>% 129 | dissimilarity( 130 | group = "variable", 131 | unit = "GEOID", 132 | weight = "estimate" 133 | ) 134 | 135 | # compute multigroup M/H indices 136 | cook_data %>% 137 | mutual_total( 138 | group = "variable", 139 | unit = "GEOID", 140 | weight = "estimate" 141 | ) 142 | ``` 143 | 144 | Producing a map of local segregation scores is also straightforward: 145 | 146 | ```{r fig.width=7, fig.height=7} 147 | library("tigris") 148 | library("ggplot2") 149 | 150 | local_seg <- mutual_local(cook_data, 151 | group = "variable", 152 | unit = "GEOID", 153 | weight = "estimate", 154 | wide = TRUE 155 | ) 156 | 157 | # download shapefile 158 | seg_geom <- tracts("IL", "Cook", cb = TRUE, progress_bar = FALSE) %>% 159 | left_join(local_seg, by = "GEOID") 160 | 161 | ggplot(seg_geom, aes(fill = ls)) + 162 | geom_sf(color = NA) + 163 | coord_sf(crs = 3435) + 164 | scale_fill_viridis_c() + 165 | theme_void() + 166 | labs( 167 | title = "Local segregation scores for Cook County, IL", 168 | fill = NULL 169 | ) 170 | ``` 171 | 172 | ## Can I compute local segregation scores for the H index? 173 | 174 | See [this paper](https://osf.io/preprints/socarxiv/3juyc) for more information. The short 175 | answer is that you can divide the local segregation scores of the M index by the entropy 176 | of the group distribution. The proportion-weighted average of these scores then equals the H index, as 177 | the H index is just the M index divided by the entropy of the group distribution. 178 | 179 | Here's an example: 180 | 181 | ```{r} 182 | (mutual_total(schools00, "race", "school", weight = "n")) 183 | 184 | local <- mutual_local(schools00, "race", "school", weight = "n", wide = TRUE) 185 | (local[, sum(p * ls)]) # same as M index above 186 | local[, ls_H := ls / entropy(schools00, "race", weight = "n")] 187 | (local[, sum(p * ls_H)]) # same as H index above 188 | ``` 189 | 190 | 191 | ## How can I compute margins-adjusted local segregation scores? 192 | 193 | When using `mutual_difference`, supply `method = "shapley_detailed"` 194 | to get two different local segregation scores that are margins-adjusted 195 | (one comes from adjusting the margins forward, the other from adjusting 196 | them backward). 
By averaging them we can create a single margins-adjusted 197 | local segregation score: 198 | 199 | ```{r} 200 | diff <- mutual_difference(schools00, schools05, "race", "school", 201 | weight = "n", method = "shapley_detailed" 202 | ) 203 | 204 | diff[stat %in% c("ls_diff1", "ls_diff2"), 205 | .(ls_diff_adjusted = mean(est)), 206 | by = .(school) 207 | ] 208 | ``` 209 | -------------------------------------------------------------------------------- /tests/testthat/test_compression.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_compression") 7 | 8 | subset <- schools00[1:50, ] 9 | data.table::setDT(subset) 10 | n_schools <- length(unique(subset$school)) 11 | 12 | all_neighbors <- unique(subset$school) 13 | all_neighbors <- expand.grid(a = all_neighbors, b = all_neighbors) 14 | res_all <- compress(subset, "race", "school", weight = "n", neighbors = all_neighbors) 15 | 16 | test_that("result is the same with no neighbors given", { 17 | res2 <- compress(subset, "race", "school", neighbors = "all", weight = "n") 18 | expect_equal(res_all$iterations, res2$iterations) 19 | }) 20 | 21 | test_that("compress works", { 22 | # 16 merges 23 | expect_equal(nrow(res_all$iterations), 16) 24 | # the M value declines continuously 25 | expect_equal(all(res_all$iterations$M[2:16] < res_all$iterations$M[1:15]), TRUE) 26 | # the number of units is correct 27 | expect_equal(res_all$iterations$N_units[[1]], n_schools - 1) 28 | }) 29 | 30 | test_that("print", { 31 | expect_output(print(res_all), "17 units") 32 | expect_output(print(res_all), "Threshold 99%") 33 | }) 34 | 35 | test_that("get_crosswalk works", { 36 | expect_error( 37 | get_crosswalk(schools00), 38 | "either n_units or percent has to be given" 39 | ) 40 | expect_error( 41 | get_crosswalk(res_all, n_units = -1), 42 | "n_units is out of bounds" 43 | ) 44 | expect_error( 45 | get_crosswalk(res_all, n_units = 20), 46 | "n_units is out of bounds" 47 | ) 48 | expect_error( 49 | get_crosswalk(res_all, n_units = 3, percent = 0.2), 50 | "only n_units or percent has to be given" 51 | ) 52 | 53 | expect_equal(nrow(get_crosswalk(res_all, n_units = 1)), n_schools) 54 | expect_equal(nrow(get_crosswalk(res_all, n_units = 5)), n_schools) 55 | expect_equal(nrow(get_crosswalk(res_all, n_units = 15)), n_schools) 56 | expect_equal(nrow(get_crosswalk(res_all, percent = 0.1)), n_schools) 57 | expect_equal(nrow(get_crosswalk(res_all, percent = 0.6)), n_schools) 58 | expect_equal(nrow(get_crosswalk(res_all, percent = 0.9)), n_schools) 59 | expect_equal(as.character(get_crosswalk(res_all, n_units = 1)$new), rep("M1", 17)) 60 | }) 61 | 62 | test_that("parts", { 63 | # get_crosswalk 64 | res_no_parts <- get_crosswalk(res_all, percent = 0.6) 65 | res_parts <- get_crosswalk(res_all, percent = 0.6, parts = TRUE) 66 | expect_equal(names(res_no_parts), c("school", "new")) 67 | expect_equal(names(res_parts), c("school", "new", "parts")) 68 | expect_equal(res_no_parts, res_parts[, -"parts"]) 69 | 70 | res_no_parts <- get_crosswalk(res_all, percent = 0.99) 71 | res_parts <- get_crosswalk(res_all, percent = 0.99, parts = TRUE) 72 | expect_equal(res_no_parts, res_parts[, -"parts"]) 73 | 74 | expect_true(all(!is.na(res_parts[grepl("^M", new)][["parts"]]))) 75 | 76 | # merge_units 77 | merged_no_parts <- merge_units(res_all, percent = .8) 78 | merged_parts <- merge_units(res_all, percent = .8, parts = TRUE) 79 | 
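# apart from the extra parts column, the merged data should be identical: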
expect_equal(merged_no_parts, merged_parts[, -"parts"]) 80 | expect_equal(names(merged_no_parts), c("school", "race", "n")) 81 | expect_equal(names(merged_parts), c("school", "race", "n", "parts")) 82 | }) 83 | 84 | test_that("compress edge case", { 85 | res_edge <- compress(subset, "race", "school", neighbors = "all", weight = "n", max_iter = 1) 86 | expect_equal(nrow(get_crosswalk(res_edge, n_units = 16)), n_schools) 87 | }) 88 | 89 | test_that("merge_units", { 90 | merged <- merge_units(res_all, percent = 0.8) 91 | new_units_cw <- sort(unique(get_crosswalk(res_all, percent = 0.8)$new)) 92 | new_units_merged <- sort(unique(merged$school)) 93 | expect_equal(new_units_cw, new_units_merged) 94 | }) 95 | 96 | test_that("percent works", { 97 | M_full <- mutual_total(subset, "race", "school", weight = "n")[stat == "M"][["est"]] 98 | 99 | for (pct in seq(0.1, 0.9, by = 0.05)) { 100 | merged_pct <- merge_units(res_all, percent = pct) 101 | M_pct <- mutual_total(merged_pct, "race", "school", weight = "n")[stat == "M"][["est"]] 102 | pct_M <- M_pct / M_full 103 | expect_true(pct_M > pct) 104 | expect_equal(res_all$iterations[N_units == merged_pct[, uniqueN(school)]][["pct_M"]], pct_M) 105 | } 106 | }) 107 | 108 | test_that("merge_units edge case", { 109 | res_edge <- compress(subset, "race", "school", neighbors = "all", weight = "n", max_iter = 1) 110 | merged <- merge_units(res_edge, n_units = 16) 111 | # replicate manual merge 112 | units <- c(res_edge$iterations$old_unit, res_edge$iterations$new_unit) 113 | merged_manually <- subset[school %in% units, .(n = sum(n)), by = .(race)] 114 | merged_algo <- merged[school == "M1" & n != 0][, -"school"] 115 | expect_equal(merged_manually, merged_algo) 116 | }) 117 | 118 | test_that("scree plot", { 119 | if (requireNamespace("ggplot2", quietly = TRUE)) { 120 | plot <- scree_plot(res_all) 121 | expect_equal(nrow(plot$data), n_schools) 122 | 123 | plot <- scree_plot(res_all, tail = 3) 124 | expect_equal(nrow(plot$data), 3) 125 | } 126 | }) 127 | 128 | test_that("data set names", { 129 | subset <- schools00[1:50, ] 130 | data.table::setDT(subset) 131 | names(subset) <- c("state", "district", "unit", "group", "n") 132 | res <- compress(subset, "group", "unit", neighbors = "all", weight = "n") 133 | 134 | expect_equal(nrow(res$iterations), 16) 135 | }) 136 | 137 | test_that("local neighbors", { 138 | subset <- schools00[1:500, ] 139 | data.table::setDT(subset) 140 | res_local <- compress(subset, "race", "school", neighbors = "local", n_neighbors = 100, weight = "n") 141 | res_local_small <- compress(subset, "race", "school", neighbors = "local", n_neighbors = 5, weight = "n") 142 | res_all <- compress(subset, "race", "school", neighbors = "all", weight = "n") 143 | 144 | expect_equal(res_local$iterations$old_unit, res_all$iterations$old_unit) 145 | expect_true( 146 | res_local_small$iterations[pct_M > 0.99][.N][["N_units"]] > 147 | res_local$iterations[pct_M > 0.99][.N][["N_units"]] 148 | ) 149 | }) 150 | 151 | test_that("dendrogram", { 152 | dend <- as.dendrogram(res_all) 153 | expect_equal(attr(dend, "height"), res_all$iterations$M[[1]]) 154 | expect_equal(attr(dend, "members"), length(unique(res_all$data$school))) 155 | 156 | res_limited <- compress(subset, "race", "school", weight = "n", max_iter = 5) 157 | expect_error(as.dendrogram(res_limited)) 158 | }) 159 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_total.R: -------------------------------------------------------------------------------- 1 | 
if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_total") 7 | 8 | test_data <- data.frame( 9 | u = c(rep("a", 4), rep("b", 4)), 10 | g = rep(c(1, 2, 3, 4), 2), 11 | supergroup = rep(c(12, 12, 34, 34), 2), 12 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 13 | stringsAsFactors = FALSE 14 | ) 15 | 16 | test_that("mutual M works both ways around", { 17 | expect_equal( 18 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], 19 | mutual_total(test_data, "g", "u", weight = "n")[stat == "M", est] 20 | ) 21 | 22 | expanded <- test_data[rep(seq_len(nrow(test_data)), test_data$n), 1:3] 23 | expect_equal( 24 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], 25 | mutual_total(expanded, "u", "g")[stat == "M", est] 26 | ) 27 | 28 | expect_equal( 29 | mutual_total(expanded, "u", "g")[stat == "M", est], 30 | mutual_total(expanded, "g", "u")[stat == "M", est] 31 | ) 32 | 33 | expect_equal( 34 | mutual_total(test_data, "u", "g", weight = "n", base = 2)[stat == "M", est], 35 | mutual_total(test_data, "g", "u", weight = "n", base = 2)[stat == "M", est] 36 | ) 37 | 38 | expect_equal( 39 | mutual_total(test_data, "u", c("supergroup", "g"), weight = "n")[stat == "M", est], 40 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est] 41 | ) 42 | 43 | expect_equal( 44 | mutual_total(test_data, "u", c("supergroup", "g"), weight = "n")[stat == "M", est], 45 | mutual_total(test_data, c("supergroup", "g"), "u", weight = "n")[stat == "M", est] 46 | ) 47 | }) 48 | 49 | test_that("between + within = total", { 50 | expect_equal( 51 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], 52 | mutual_total(test_data, "u", "supergroup", weight = "n")[stat == "M", est] + 53 | mutual_total(test_data, "u", "g", within = "supergroup", weight = "n")[stat == "M", est] 54 | ) 55 | expect_equal( 56 | mutual_total(test_data, "u", "g", weight = "n")[stat == "H", est], 57 | mutual_total(test_data, "u", "supergroup", weight = "n")[stat == "H", est] + 58 | mutual_total(test_data, "u", "g", within = "supergroup", weight = "n")[stat == "H", est] 59 | ) 60 | }) 61 | 62 | p_12 <- sum(test_data[test_data$supergroup == 12, "n"]) / sum(test_data$n) 63 | p_34 <- sum(test_data[test_data$supergroup == 34, "n"]) / sum(test_data$n) 64 | test_that("within estimates are correct", { 65 | d_12 <- test_data[test_data$supergroup == 12, ] 66 | d_34 <- test_data[test_data$supergroup == 34, ] 67 | expect_equal( 68 | p_12 * mutual_total(d_12, "u", "g", weight = "n")[stat == "M", est] + 69 | p_34 * mutual_total(d_34, "u", "g", weight = "n")[stat == "M", est], 70 | mutual_total(test_data, "u", "g", within = "supergroup", weight = "n")[stat == "M", est] 71 | ) 72 | # this decomposition does not exist in the same way for H 73 | }) 74 | 75 | test_that("H is correct", { 76 | ret <- mutual_total(test_data, "u", "g", weight = "n") 77 | expect_equal(ret[stat == "H", est] >= 0 & ret[stat == "H", est] <= 1, TRUE) 78 | }) 79 | 80 | test_that("bootstrapping works", { 81 | ret <- mutual_total(test_data, "u", "g", weight = "n", se = TRUE, n_bootstrap = 10) 82 | expect_equal(dim(ret), c(2, 5)) 83 | expect_equal(all(ret$se > 0), TRUE) 84 | 85 | ret <- mutual_total(test_data, "u", "g", 86 | weight = "n", se = TRUE, n_bootstrap = 10, 87 | within = "supergroup" 88 | ) 89 | expect_equal(dim(ret), c(2, 5)) 90 | expect_equal(all(ret$se > 0), TRUE) 91 | }) 92 | 93 | test_that("bootstrap attribute exists", { 94 | ret <- mutual_total(test_data, "u", "g", 
weight = "n", se = TRUE, n_bootstrap = 10) 95 | expect_equal(dim(attr(ret, "bootstrap")), c(2 * 10, 2)) 96 | }) 97 | 98 | test_that("bootstrapping fails when sample size is non-integer", { 99 | test_data <- data.frame( 100 | u = c(rep("a", 4), rep("b", 4)), 101 | g = rep(c(1, 2, 3, 4), 2), 102 | n = c(40, 20, 5, 1.8, 20, 40, 60, 80), 103 | stringsAsFactors = FALSE 104 | ) 105 | 106 | expect_error(mutual_total(test_data, "u", "g", weight = "n", se = TRUE, n_bootstrap = 10)) 107 | # rescale 108 | test_data$n2 <- test_data$n / sum(test_data$n) * round(sum(test_data$n)) 109 | ret <- mutual_total(test_data, "u", "g", weight = "n2", se = TRUE, n_bootstrap = 10) 110 | expect_equal(dim(ret), c(2, 5)) 111 | expect_equal(all(ret$se > 0), TRUE) 112 | }) 113 | 114 | 115 | test_data <- data.frame( 116 | u = c(rep("a", 4), rep("b", 4)), 117 | g = rep(c(1, 2, 3, 4), 2), 118 | n = c(40, 0, 0, 0, 0, 0, 0, 40) 119 | ) 120 | 121 | test_that("zero weights no problem", { 122 | expect_equal(dim(mutual_total(test_data, "u", "g", 123 | weight = "n", 124 | se = TRUE, n_bootstrap = 10 125 | )), c(2, 5)) 126 | expect_equal(dim(mutual_total(test_data, "u", "g", weight = "n")), c(2, 2)) 127 | expect_equal(mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], log(2)) 128 | expect_equal(mutual_total(test_data, "u", "g", weight = "n")[stat == "H", est], 1) 129 | 130 | test_data2 <- copy(test_data) 131 | test_data2$g <- as.factor(test_data2$g) 132 | expect_equal( 133 | mutual_total(test_data, "u", "g", weight = "n")[["est"]], 134 | mutual_total(test_data2, "u", "g", weight = "n")[["est"]] 135 | ) 136 | }) 137 | 138 | test_that("gives errors", { 139 | expect_error(mutual_total("test_data", "u", "g", weight = "n"), "not a data.frame") 140 | expect_error( 141 | mutual_total(test_data[test_data$u == "c", ], "u", "g", weight = "n"), 142 | "data.frame is empty" 143 | ) 144 | 145 | expect_error(mutual_total(test_data, "u2", "g", weight = "n"), "u2 not in data.frame") 146 | expect_error(mutual_total(test_data, "u2", "g2", weight = "n"), "u2, g2 not in data.frame") 147 | expect_error(mutual_total(test_data, "u2", "g2", weight = "n2"), "u2, g2, n2 not in data.frame") 148 | 149 | test_data_constant1 <- data.frame(u = c(rep("a", 4), rep("b", 4)), g = 1) 150 | expect_error(mutual_total(test_data_constant1, "g", "u"), "group variable is constant") 151 | test_data_constant2 <- data.frame(g = c(rep("a", 4), rep("b", 4)), u = 1) 152 | expect_error(mutual_total(test_data_constant2, "g", "u"), "unit variable is constant") 153 | }) 154 | 155 | test_that("debiasing works correctly", { 156 | nose <- mutual_total(test_data, "u", "g", weight = "n") 157 | withse <- mutual_total(test_data, "u", "g", weight = "n", se = TRUE) 158 | expect_equal(nose$est, withse$est + withse$bias) 159 | }) 160 | -------------------------------------------------------------------------------- /vignettes/plotting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Visualizing and compressing segregation" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Visualizing and compressing segregation} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | 16 | data.table::setDTthreads(1) 17 | 18 | # skip this vignette on CRAN etc. 
19 | BUILD_VIGNETTE <- identical(Sys.getenv("BUILD_VIGNETTE"), "true") 20 | knitr::opts_chunk$set(eval = BUILD_VIGNETTE) 21 | 22 | library("segregation") 23 | ``` 24 | 25 | The package provides the functions `segcurve()` and `segplot()` to visualize segregation. 26 | These functions return simple ggplots, which can then be further styled and themed. 27 | For the `segplot()` function, it is often interesting to also compress the segregation 28 | information that is contained in large datasets. How to do this using 29 | the functions `compress()` and `merge_units()` is also described below, and in more 30 | detail [in this working paper](https://osf.io/preprints/socarxiv/ruw4g/). 31 | 32 | ## Segregation curve 33 | 34 | The segregation curve was first introduced by [Duncan and Duncan (1955)](https://www.jstor.org/stable/2088328). 35 | The function `segcurve()` provides a simple way of plotting one or several segregation curves: 36 | 37 | ```{r} 38 | segcurve(subset(schools00, race %in% c("white", "asian")), 39 | "race", "school", 40 | weight = "n", 41 | segment = "state" # leave this out to produce a single curve 42 | ) 43 | ``` 44 | 45 | In this case, state `A` is the most segregated, while states `B` and `C` are similarly segregated, 46 | but at a lower level. Segregation curves are closely related to the index of dissimilarity, and 47 | here this corresponds to the following index values: 48 | 49 | ```{r} 50 | # converting to data.table makes this easier 51 | data.table::as.data.table(schools00)[ 52 | race %in% c("white", "asian"), 53 | dissimilarity(.SD, "race", "school", weight = "n"), 54 | by = .(state) 55 | ] 56 | ``` 57 | 58 | 59 | ## Segplot 60 | 61 | ::: {.alert .alert-primary} 62 | Please consider citing the following paper if you use segplot: Benjamin Elbers and Rob Gruijters. 2023. "[Segplot: A New Method for Visualizing Patterns of Multi-Group Segregation](https://doi.org/10.1016/j.rssm.2023.100860)." Research in Social Stratification and Mobility. 63 | ::: 64 | 65 | The function `segplot()` is provided to generate segplots. Segplots are described in more 66 | detail [in this working paper](https://osf.io/preprints/socarxiv/ruw4g/). 67 | The function requires the dataset, the group and unit variables, and, where needed, 68 | a weight variable (`n` in this case). 69 | 70 | Several arguments customize the look of the segplot. The argument `order` determines how the units are sorted: 71 | by default, the units of the segplot are ordered by their local segregation score, 72 | but it is also possible to order them by entropy (i.e., diversity) or by share 73 | of the majority population. This last option can be useful for the two-group case. 74 | The argument `bar_space` can be used to increase the space between the units 75 | from the default of zero space between bars. When plotting a subset of the dataset, 76 | the reference distribution shown on the right of the segplot can be changed by 77 | supplying a two-column data frame to the `reference_distribution` argument. 78 | One column of this frame should contain the group identifiers, and 79 | the other should include the reference proportion of each group. 
80 | 81 | Examples of how to use these arguments are given below: 82 | 83 | ```{r} 84 | sch <- subset(schools00, state == "A") 85 | 86 | # basic segplot 87 | segplot(sch, "race", "school", weight = "n") 88 | 89 | # order by majority group (white in this case) 90 | segplot(sch, "race", "school", weight = "n", order = "majority") 91 | 92 | # increase the space between bars 93 | # (has to be very low here because there are many schools in this dataset) 94 | segplot(sch, "race", "school", weight = "n", bar_space = 0.0005) 95 | 96 | # change the reference distribution 97 | # (here, we just use an equalized distribution across the five groups) 98 | (ref <- data.frame(race = unique(schools00$race), p = rep(0.2, 5))) 99 | segplot(sch, "race", "school", 100 | weight = "n", 101 | reference_distribution = ref 102 | ) 103 | ``` 104 | 105 | It is also possible to add a secondary plot that shows the adjusted local segregation scores: 106 | 107 | ```{r} 108 | segplot(sch, "race", "school", weight = "n", secondary_plot = "segregation") 109 | ``` 110 | 111 | ## Compressing segregation information 112 | 113 | The compression workflow involves three steps. 114 | First, it is important to decide which units should be permitted to merge: 115 | for residential segregation, we may only want to allow neighboring units 116 | (such as tracts) to be mergeable. In this case, the first step consists 117 | of compiling a data frame with exactly two columns, where each row identifies 118 | a pair of neighboring units. In other cases, we may want to allow all units 119 | to be mergeable, in principle. However, this can be very time-consuming as 120 | it requires each unit to be compared to all others at every step of the merging 121 | operation. To speed up compression, we therefore implement an option that 122 | allows units to be merged only within a window of "neighboring" units, 123 | where the definition of each window is based on similarities in local segregation. 124 | Hence, for a given unit, only the `n_neighbors` most similar units are considered 125 | for merging at every step. 126 | Smaller `n_neighbors` values will result in faster run times, but increase 127 | the probability of non-optimal merges. The method of merging can be 128 | specified in the `compress()` function by supplying the argument `neighbors`. 129 | 130 | The second step is then to run the actual compression algorithm using `compress()`. 131 | For this example, we choose to compress based on a relatively small window: 132 | 133 | ```{r, results='hide'} 134 | # compression based on window of 20 'neighboring' units 135 | # in terms of local segregation (alternatively, neighbors can be a data frame) 136 | comp <- compress(sch, "race", "school", 137 | weight = "n", neighbors = "local", n_neighbors = 20 138 | ) 139 | ``` 140 | 141 | After running `compress()`—which can take some time depending on how 142 | many neighbors need to be considered—the output summarizes the compression 143 | that can be achieved: 144 | 145 | ```{r} 146 | comp 147 | ``` 148 | 149 | The results indicate that 99% of the segregation information can be retained with 150 | only 98 units (out of 560 in the original dataset), 95% with only 24 units, 151 | and 90% with 10 units. The percentage of information retained at each iteration 152 | is recorded in the data frame `comp$iterations`. 
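For a quick look, the first few iterations can be printed (a minimal illustration; besides the merged unit pairs, `comp$iterations` also records, among other things, the M value and the number of remaining units after each merge):

```{r}
head(comp$iterations)
```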
153 | This data frame can also be used to generate a plot that shows the relationship 154 | between the number of merges and the loss in segregation information: 155 | 156 | ```{r} 157 | scree_plot(comp) 158 | ``` 159 | 160 | Another way to learn more about the compression is to visualize the information as a dendrogram: 161 | 162 | ```{r} 163 | dend <- as.dendrogram(comp) 164 | plot(dend, leaflab = "none") 165 | ``` 166 | 167 | The third step is to create a new dataset based on the desired level of compression. 168 | This can be achieved using the function `merge_units()`, and either `n_units` or `percent` 169 | can be specified to indicate the desired level of compression. 170 | 171 | ```{r} 172 | sch_compressed <- merge_units(comp, n_units = 15) 173 | # or, for instance: merge_units(comp, percent = 0.80) 174 | head(sch_compressed) 175 | ``` 176 | 177 | The compressed dataset has the same format as the original dataset 178 | and can now be used to produce another segplot, e.g. 179 | 180 | ```{r} 181 | segplot(sch_compressed, "race", "school", weight = "n", secondary_plot = "segregation") 182 | ``` 183 | -------------------------------------------------------------------------------- /man/mutual_difference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_difference.R 3 | \name{mutual_difference} 4 | \alias{mutual_difference} 5 | \title{Decomposes the difference between two M indices} 6 | \usage{ 7 | mutual_difference( 8 | data1, 9 | data2, 10 | group, 11 | unit, 12 | weight = NULL, 13 | method = "shapley", 14 | se = FALSE, 15 | CI = 0.95, 16 | n_bootstrap = 100, 17 | base = exp(1), 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{data1}{A data frame with same structure as \code{data2}.} 23 | 24 | \item{data2}{A data frame with same structure as \code{data1}.} 25 | 26 | \item{group}{A categorical variable or a vector of variables 27 | contained in \code{data}. Defines the first dimension 28 | over which segregation is computed.} 29 | 30 | \item{unit}{A categorical variable or a vector of variables 31 | contained in \code{data}. Defines the second dimension 32 | over which segregation is computed.} 33 | 34 | \item{weight}{Numeric. (Default \code{NULL})} 35 | 36 | \item{method}{Either "shapley" (the default), "km" (Karmel and Maclachlan method), or 37 | "mrc" (Mora and Ruiz-Castillo method).} 38 | 39 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 40 | standard errors and to apply bias correction. The bias that is reported 41 | has already been applied to the estimates (i.e. the reported estimates are "debiased") 42 | (Default \code{FALSE})} 43 | 44 | \item{CI}{If \code{se = TRUE}, compute the confidence (CI*100)% confidence interval 45 | in addition to the bootstrap standard error. 46 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 47 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 48 | 49 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 50 | 51 | \item{base}{Base of the logarithm that is used in the calculation. 52 | Defaults to the natural logarithm.} 53 | 54 | \item{...}{Only used for additional arguments when 55 | when \code{method} is set to \code{shapley} or \code{km}. See \link{ipf} for details.} 56 | } 57 | \value{ 58 | Returns a data.table with columns \code{stat} and \code{est}. 
The data frame contains 59 | the following rows defined by \code{stat}: 60 | \code{M1} contains the M for \code{data1}. 61 | \code{M2} contains the M for \code{data2}. 62 | \code{diff} is the difference between \code{M2} and \code{M1}. 63 | The sum of the five rows following \code{diff} equals \code{diff}. 64 | 65 | \code{additions} contains the change in M induced by \code{unit} and \code{group} categories 66 | present in \code{data2} but not \code{data1}, and \code{removals} the reverse. 67 | 68 | All methods return the following three terms: 69 | \code{unit_marginal} is the contribution of unit composition differences. 70 | \code{group_marginal} is the contribution of group composition differences. 71 | \code{structural} is the contribution unexplained by the marginal changes, i.e. the structural 72 | difference. Note that the interpretation of these terms depends on the exact method used. 73 | 74 | When using "km", one additional row is returned: 75 | \code{interaction} is the contribution of differences in the joint marginal distribution 76 | of \code{unit} and \code{group}. 77 | 78 | When "shapley_detailed" is used, an additional column "unit" is returned, along with 79 | six additional rows for each unit that is present in both \code{data1} and \code{data2}. 80 | The six rows have the following meaning: 81 | \code{p1} (\code{p2}) is the proportion of the unit in \code{data1} (\code{data2}) 82 | once non-intersecting units/groups have been removed. The changes in local linkage are 83 | given by \code{ls_diff1} and \code{ls_diff2}, and their average is given by 84 | \code{ls_diff_mean}. The row named \code{total} 85 | summarizes the contribution of 86 | the unit towards structural change 87 | using the formula \code{.5 * p1 * ls_diff1 + .5 * p2 * ls_diff2}. 88 | The sum of all "total" components equals structural change. 89 | 90 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 91 | the associated bootstrapped standard errors, an additional column \code{CI} contains 92 | the estimated confidence interval as a list column, an additional column \code{bias} contains 93 | the estimated bias, and the column \code{est} contains the bias-corrected estimates. 94 | } 95 | \description{ 96 | Uses one of three methods to decompose the difference between two M indices: 97 | (1) "shapley" / "shapley_detailed": a method based on the Shapley decomposition 98 | with a few advantages over the Karmel-Maclachlan method 99 | (recommended and the default, Deutsch et al. 2009), 100 | (2) "km": the method based on Karmel-Maclachlan (1988), 101 | (3) "mrc": the method developed by Mora and Ruiz-Castillo (2009). 102 | All methods have been extended to account for missing units/groups in either data input. 103 | } 104 | \details{ 105 | The Shapley method is an improvement over the Karmel-Maclachlan method (Deutsch et al. 2009). 106 | It is based on several margins-adjusted data inputs 107 | and yields symmetrical results (i.e. \code{data1} and \code{data2} can be switched). 108 | When "shapley_detailed" is used, the structural component is further decomposed into 109 | the contributions of individual units. 110 | 111 | The Karmel-Maclachlan method (Karmel and Maclachlan 1988) adjusts 112 | the margins of \code{data1} to be similar to the margins of \code{data2}. This process 113 | is not symmetrical. 114 | 115 | The Shapley and Karmel-Maclachlan methods are based on iterative proportional fitting (IPF), 116 | first introduced by Deming and Stephan (1940). 
117 | Depending on the size of the dataset, this may take a few seconds (see \link{ipf} for details). 118 | 119 | The method developed by Mora and Ruiz-Castillo (2009) uses an algebraic approach to estimate the 120 | size of the components. This will often yield substantively different results from the Shapley 121 | and Karmel-Maclachlan methods. Note that this method is not symmetric in terms of what is 122 | defined as \code{group} and \code{unit} categories, which may yield contradictory results. 123 | 124 | A problem arises when there are \code{group} and/or \code{unit} categories in \code{data1} 125 | that are not present in \code{data2} (or vice versa). 126 | All methods estimate the difference only 127 | for categories that are present in both datasets, and additionally report 128 | the change in M that is induced by these cases as 129 | \code{additions} (present in \code{data2}, but not in \code{data1}) and 130 | \code{removals} (present in \code{data1}, but not in \code{data2}). 131 | } 132 | \examples{ 133 | \dontrun{ 134 | # decompose the difference in school segregation between 2000 and 2005, 135 | # using the Shapley method 136 | mutual_difference(schools00, schools05, 137 | group = "race", unit = "school", 138 | weight = "n", method = "shapley", precision = .1 139 | ) 140 | # => the structural component is close to zero, thus most change is in the marginals. 141 | # This method gives identical results when we switch the unit and group definitions, 142 | # and when we switch the data inputs. 143 | 144 | # the Karmel-Maclachlan method is similar, but only adjusts the data in the forward direction... 145 | mutual_difference(schools00, schools05, 146 | group = "school", unit = "race", 147 | weight = "n", method = "km", precision = .1 148 | ) 149 | 150 | # ...this means that the results won't be identical when we switch the data inputs 151 | mutual_difference(schools05, schools00, 152 | group = "school", unit = "race", 153 | weight = "n", method = "km", precision = .1 154 | ) 155 | 156 | # the MRC method indicates a much higher structural change... 157 | mutual_difference(schools00, schools05, 158 | group = "race", unit = "school", 159 | weight = "n", method = "mrc" 160 | ) 161 | 162 | # ...and is not symmetric 163 | mutual_difference(schools00, schools05, 164 | group = "school", unit = "race", 165 | weight = "n", method = "mrc" 166 | ) 167 | } 168 | } 169 | \references{ 170 | W. E. Deming, F. F. Stephan. 1940. "On a Least Squares Adjustment of a Sampled Frequency Table 171 | When the Expected Marginal Totals are Known." 172 | The Annals of Mathematical Statistics 11(4): 427-444. 173 | 174 | T. Karmel and M. Maclachlan. 1988. 175 | "Occupational Sex Segregation — Increasing or Decreasing?" Economic Record 64: 187-195. 176 | 177 | R. Mora and J. Ruiz-Castillo. 2009. "The Invariance Properties of the 178 | Mutual Information Index of Multigroup Segregation." Research on Economic Inequality 17: 33-53. 179 | 180 | J. Deutsch, Y. Flückiger, and J. Silber. 2009. 181 | "Analyzing Changes in Occupational Segregation: The Case of Switzerland (1970–2000)." 182 | Research on Economic Inequality 17: 171–202. 
183 | } 184 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: gfm 5 | editor_options: 6 | markdown: 7 | wrap: 72 8 | --- 9 | 10 | 11 | 12 | ```{r, echo = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>", 16 | fig.path = "man/figures/README-" 17 | ) 18 | options(scipen = 999) 19 | options(digits = 3) 20 | set.seed(69839) 21 | ``` 22 | 23 | # segregation 24 | 25 | [![CRAN 26 | Version](https://www.r-pkg.org/badges/version/segregation)](https://CRAN.R-project.org/package=segregation) 27 | [![R build 28 | status](https://github.com/elbersb/segregation/workflows/R-CMD-check/badge.svg)](https://github.com/elbersb/segregation/actions) 29 | [![Coverage 30 | status](https://codecov.io/gh/elbersb/segregation/branch/master/graph/badge.svg)](https://app.codecov.io/github/elbersb/segregation?branch=master) 31 | 32 | An R package to calculate, visualize, and decompose various segregation indices. 33 | The package currently supports 34 | 35 | - the Mutual Information Index (M), 36 | - Theil's Information Index (H), 37 | - the index of Dissimilarity (D), 38 | - the isolation and exposure indices. 39 | 40 | Find more information in `vignette("segregation")` 41 | and the [documentation](https://elbersb.de/segregation). 42 | 43 | The package also supports 44 | 45 | - [standard error and confidence interval estimation via bootstrapping](https://elbersb.com/public/posts/2021-11-24-segregation-bias/), 46 | which also corrects for small sample bias 47 | - decomposition of the M and H indices (within/between, local segregation) 48 | - decomposing differences in total segregation over time (Elbers 2020) 49 | - [segregation visualizations](https://elbersb.github.io/segregation/articles/plotting.html) (segregation curves and 'segplots') 50 | 51 | Most methods return [tidy](https://vita.had.co.nz/papers/tidy-data.html) 52 | [data.tables](https://rdatatable.gitlab.io/data.table/) for easy 53 | post-processing and plotting. For speed, the package uses the [`data.table`](https://rdatatable.gitlab.io/data.table/) 54 | package internally, and implements some functions in C++. 55 | 56 | Most of the procedures implemented in this package are described in more 57 | detail [in this SMR 58 | paper](https://journals.sagepub.com/doi/full/10.1177/0049124121986204) 59 | ([Preprint](https://osf.io/preprints/socarxiv/ya7zs/)) and [in this 60 | working paper](https://osf.io/preprints/socarxiv/ruw4g/). 61 | 62 | ## Usage 63 | 64 | The package provides an easy way to calculate segregation measures, 65 | based on the Mutual Information Index (M) and Theil's Entropy Index (H). 66 | 67 | ```{r} 68 | library(segregation) 69 | 70 | # example dataset with fake data provided by the package 71 | mutual_total(schools00, "race", "school", weight = "n") 72 | ``` 73 | 74 | Standard errors in all functions can be estimated via bootstrapping. 
This 75 | will also apply bias-correction to the estimates: 76 | 77 | ```{r} 78 | mutual_total(schools00, "race", "school", 79 | weight = "n", 80 | se = TRUE, CI = 0.90, n_bootstrap = 500 81 | ) 82 | ``` 83 | 84 | Decompose segregation into a between-state and a within-state term (the 85 | sum of these equals total segregation): 86 | 87 | ```{r} 88 | # between states 89 | mutual_total(schools00, "race", "state", weight = "n") 90 | 91 | # within states 92 | mutual_total(schools00, "race", "school", within = "state", weight = "n") 93 | ``` 94 | 95 | Local segregation (`ls`) is a decomposition by units or groups (here 96 | racial groups). This function also supports standard error and CI 97 | estimation. The sum of the proportion-weighted local segregation scores 98 | equals M: 99 | 100 | ```{r} 101 | local <- mutual_local(schools00, 102 | group = "school", unit = "race", weight = "n", 103 | se = TRUE, CI = 0.90, n_bootstrap = 500, wide = TRUE 104 | ) 105 | local[, c("race", "ls", "p", "ls_CI")] 106 | sum(local$p * local$ls) 107 | ``` 108 | 109 | Decompose the difference in M between 2000 and 2005, using iterative 110 | proportional fitting (IPF) and the Shapley decomposition (see Elbers 111 | 2021 for details): 112 | 113 | ```{r} 114 | mutual_difference(schools00, schools05, 115 | group = "race", unit = "school", 116 | weight = "n", method = "shapley" 117 | ) 118 | ``` 119 | 120 | Show a segplot: 121 | 122 | ```{r segplot} 123 | segplot(schools00, group = "race", unit = "school", weight = "n") 124 | ``` 125 | 126 | Find more information in the 127 | [documentation](https://elbersb.github.io/segregation/). 128 | 129 | ## How to install 130 | 131 | To install the package from CRAN, use 132 | 133 | ```{r eval=FALSE} 134 | install.packages("segregation") 135 | ``` 136 | 137 | To install the development version, use 138 | 139 | ```{r eval=FALSE} 140 | devtools::install_github("elbersb/segregation") 141 | ``` 142 | 143 | ## Citation 144 | 145 | If you use this package for your research, please cite one of the following papers: 146 | 147 | - Elbers, Benjamin (2021). A Method for Studying Differences in Segregation 148 | Across Time and Space. Sociological Methods & Research. 149 | 150 | 151 | - Elbers, Benjamin and Rob Gruijters (2023). Segplot: A New Method for Visualizing Patterns of Multi-Group Segregation. 152 | 153 | 154 | ## Some additional resources 155 | 156 | - The book *Analyzing US Census Data: Methods, Maps, and Models in R* 157 | by Kyle E. Walker contains [a discussion of this 158 | package](https://walker-data.com/census-r/modeling-us-census-data.html#indices-of-segregation-and-diversity), 159 | and is a great resource for anyone working with spatial data, 160 | especially U.S. Census data. 161 | - A paper that makes use of this package: [Did Residential Racial 162 | Segregation in the U.S. Really Increase? An Analysis Accounting for 163 | Changes in Racial 164 | Diversity](https://elbersb.com/public/posts/2021-07-23-segregation-increase/) 165 | ([Code and Data](https://osf.io/mg9q4/)) 166 | - Some of the analyses [in this 167 | article](https://multimedia.tijd.be/diversiteit/) by the Belgian 168 | newspaper *De Tijd* used the package. 169 | - The analyses of [this article in the Wall Street 170 | Journal](https://www.wsj.com/articles/chicago-vs-dallas-why-the-north-lags-behind-the-south-and-west-in-racial-integration-11657936680) 171 | were produced using this package. 172 | 173 | ## References on entropy-based segregation indices 174 | 175 | Deutsch, J., Flückiger, Y. & Silber, J. 
(2009). Analyzing Changes in 176 | Occupational Segregation: The Case of Switzerland (1970--2000), in: Yves 177 | Flückiger, Sean F. Reardon, Jacques Silber (eds.) Occupational and 178 | Residential Segregation (Research on Economic Inequality, Volume 17), 179 | 171--202. 180 | 181 | DiPrete, T. A., Eller, C. C., Bol, T., & van de Werfhorst, H. G. (2017). 182 | School-to-Work Linkages in the United States, Germany, and France. 183 | American Journal of Sociology, 122(6), 1869-1938. 184 | 185 | 186 | Elbers, B. (2021). A Method for Studying Differences in Segregation 187 | Across Time and Space. Sociological Methods & Research. 188 | 189 | 190 | Forster, A. G., & Bol, T. (2017). Vocational education and employment 191 | over the life course using a new measure of occupational specificity. 192 | Social Science Research, 70, 176-197. 193 | 194 | 195 | Theil, H. (1971). Principles of Econometrics. New York: Wiley. 196 | 197 | Frankel, D. M., & Volij, O. (2011). Measuring school segregation. 198 | Journal of Economic Theory, 146(1), 1-38. 199 | 200 | 201 | Mora, R., & Ruiz-Castillo, J. (2003). Additively decomposable 202 | segregation indexes. The case of gender segregation by occupations and 203 | human capital levels in Spain. The Journal of Economic Inequality, 1(2), 204 | 147-179. 205 | 206 | Mora, R., & Ruiz-Castillo, J. (2009). The Invariance Properties of the 207 | Mutual Information Index of Multigroup Segregation, in: Yves Flückiger, 208 | Sean F. Reardon, Jacques Silber (eds.) Occupational and Residential 209 | Segregation (Research on Economic Inequality, Volume 17), 33-53. 210 | 211 | Mora, R., & Ruiz-Castillo, J. (2011). Entropy-based Segregation Indices. 212 | Sociological Methodology, 41(1), 159--194. 213 | 214 | 215 | Van Puyenbroeck, T., De Bruyne, K., & Sels, L. (2012). More than 'Mutual 216 | Information': Educational and sectoral gender segregation and their 217 | interaction on the Flemish labor market. Labour Economics, 19(1), 1-8. 218 | 219 | 220 | Watts, M. The Use and Abuse of Entropy Based Segregation Indices. 221 | Working Paper. URL: 222 | 223 | -------------------------------------------------------------------------------- /R/segregation.R: -------------------------------------------------------------------------------- 1 | #' segregation: Entropy-based segregation indices 2 | #' 3 | #' Calculate and decompose entropy-based, multigroup segregation indices, with a focus 4 | #' on the Mutual Information Index (M) and Theil's Information Index (H). 5 | #' Provides tools to decompose the measures by groups and units, and by within 6 | #' and between terms. Includes standard error estimation by bootstrapping. 
7 | #' 8 | #' @seealso \url{https://elbersb.com/segregation} 9 | #' 10 | #' @docType package 11 | #' @name segregation 12 | #' @keywords internal 13 | "_PACKAGE" 14 | 15 | #' @importFrom Rcpp sourceCpp 16 | #' @import RcppProgress 17 | #' @useDynLib segregation, .registration = TRUE 18 | NULL 19 | 20 | globalVariables(c( 21 | "V1", "V2", "cond1", "cond2", "entropy_cond", "entropy_cond1", "entropy_cond2", "entropyw", 22 | "est", "freq", "freq1", "freq2", "freq_orig1", "freq_orig2", 23 | "ls_diff_mean", "ls_diff1", "ls_diff2", "ls_unit", "bias", "boot_est", "est_debiased", 24 | "n", "n_group", "n_group_target", "n_source", "n_target", "n_unit", "n_unit_target", 25 | "n_within_group", "p", "p_exp", "p1", "p2", "p_group", "p_group_g_unit", "p_group_g_unit1", 26 | "p_group_g_unit2", "p_group_s", "p_group_t", "p_unit", "p_unit1", "p_unit2", "p_unit_s", 27 | "p_unit_t", "p_within", "sumcond1", "sumcond2", "total", "unit1", "unit2", 28 | ".", "..base", "..fixed_margins", "..group", "..n_bootstrap", "..unit", "se", "stat", 29 | "M", "N_units", "i.freq1", "i.freq2", "iter", "ls1", "ls2", "p_unit_g_group1", 30 | "p_unit_g_group2", "pair", "pct_M", 31 | "x", "xend", "y", "yend", "xmax", "xmin", "ymax", "ymin", "..cols", "p_overall", 32 | "freq_of", "freq_to", 33 | "cumul_prob_1", "cumul_prob_2", "group1", "group2", "pct_group_1", 34 | ".data", "new_unit", "old_unit" 35 | )) 36 | 37 | # log 38 | 39 | log_env <- new.env() 40 | assign("n_printed", 0, envir = log_env) 41 | 42 | update_log <- function(bs_n = NULL, bs_max = NULL, ipf_n = NULL, ipf_max = NULL) { 43 | if (!is.null(bs_n)) assign("bs_n", bs_n, envir = log_env) 44 | if (!is.null(bs_max)) assign("bs_max", bs_max, envir = log_env) 45 | if (!is.null(ipf_n)) assign("ipf_n", ipf_n, envir = log_env) 46 | if (!is.null(ipf_max)) assign("ipf_max", ipf_max, envir = log_env) 47 | 48 | if (!is.null(get("bs_n", envir = log_env)) && !is.null(get("ipf_n", envir = log_env))) { 49 | text <- paste0("[", "Bootstrap ", get("bs_n", envir = log_env), "/", 50 | get("bs_max", envir = log_env), 51 | " IPF ", get("ipf_n", envir = log_env), "/", 52 | get("ipf_max", envir = log_env), "] ", 53 | collapse = "" 54 | ) 55 | } else if (!is.null(get("bs_n", envir = log_env))) { 56 | text <- paste0("[", "Bootstrap ", get("bs_n", envir = log_env), 57 | "/", get("bs_max", envir = log_env), "] ", 58 | collapse = "" 59 | ) 60 | } else if (!is.null(get("ipf_n", envir = log_env))) { 61 | text <- paste0("[", "IPF ", get("ipf_n", envir = log_env), 62 | "/", get("ipf_max", envir = log_env), "] ", 63 | collapse = "" 64 | ) 65 | } 66 | 67 | clear_log() 68 | assign("n_printed", nchar(text), envir = log_env) 69 | cat(text, file = stderr()) 70 | } 71 | 72 | update_log_progress <- function(text) { 73 | assign("n_printed", get("n_printed", envir = log_env) + nchar(text), envir = log_env) 74 | cat(text, file = stderr()) 75 | } 76 | 77 | clear_log <- function() { 78 | utils::flush.console() 79 | 80 | if (get("n_printed", envir = log_env) > 0) { 81 | cat("\r", paste0(rep(" ", times = get("n_printed", envir = log_env)), collapse = ""), 82 | "\r", 83 | file = stderr() 84 | ) 85 | 86 | assign("n_printed", 0, envir = log_env) 87 | } 88 | } 89 | 90 | close_log <- function() { 91 | clear_log() 92 | assign("bs_n", NULL, envir = log_env) 93 | assign("bs_max", NULL, envir = log_env) 94 | assign("ipf_n", NULL, envir = log_env) 95 | assign("ipf_max", NULL, envir = log_env) 96 | } 97 | close_log() 98 | 99 | # helpers 100 | 101 | logf <- function(v, base = exp(1)) { 102 | logged <- log(v, base = base) 103 | 
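    # by convention, 0 * log(0) counts as 0 in the entropy formulas, so the
    # non-finite values that log() produces for zero inputs are zeroed out next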
logged[!is.finite(logged)] <- 0 104 | logged 105 | } 106 | 107 | #' @import data.table 108 | prepare_data <- function(data, group, unit, weight, within = NULL) { 109 | if ("data.frame" %in% class(data)) { 110 | if (nrow(data) == 0) { 111 | stop("data.frame is empty") 112 | } 113 | test_vars <- c(group, unit, weight, within) 114 | test_vars <- test_vars[!test_vars %in% names(data)] 115 | if (length(test_vars) > 0) { 116 | test_vars <- paste0(test_vars, collapse = ", ") 117 | stop(paste0("variable(s) ", test_vars, " not in data.frame")) 118 | } 119 | } else { 120 | stop("not a data.frame") 121 | } 122 | vars <- c(group, unit) 123 | 124 | # create a copy 125 | data <- as.data.table(data) 126 | 127 | # check whether there is variation 128 | n_groups <- nrow(data[, .N, by = group]) 129 | n_units <- nrow(data[, .N, by = unit]) 130 | if (n_groups == 1) stop("Cannot compute segregation: the group variable is constant") 131 | if (n_units == 1) stop("Cannot compute segregation: the unit variable is constant") 132 | 133 | # use provided weight or weight of 1 134 | weight_no_conflict <- weight 135 | if (!is.null(weight_no_conflict)) { 136 | if (weight_no_conflict == "weight") { 137 | data[, freq := as.double(weight)] 138 | } else { 139 | data[, freq := as.double(get(weight_no_conflict))] 140 | } 141 | } else { 142 | data[, freq := 1] 143 | } 144 | 145 | if (!is.null(within)) { 146 | vars <- c(vars, within) 147 | } 148 | 149 | # drop unused factor levels - these can lead to problems downstream 150 | for (var in vars) { 151 | if (is.factor(data[[var]])) { 152 | data[[var]] <- droplevels(data[[var]]) 153 | } 154 | } 155 | 156 | # collapse on vars, and select only positive weights 157 | data <- data[freq > 0, list(freq = sum(freq)), by = vars] 158 | setattr(data, "vars", vars) 159 | setkey(data, NULL) 160 | data 161 | } 162 | 163 | 164 | #' @import data.table 165 | add_local <- function(data, group_var, unit_var, base, weight = "freq") { 166 | n_total <- sum(data[, get(weight)]) 167 | # generate unit and group totals 168 | data[, n_unit := sum(get(weight)), by = unit_var] 169 | data[, n_group := sum(get(weight)), by = group_var] 170 | # generate unit and group proportions and the 171 | # conditional probability of being in any group given the unit 172 | data[, `:=`( 173 | p_unit = n_unit / n_total, 174 | p_group = n_group / n_total, 175 | p_group_g_unit = get(weight) / n_unit 176 | )] 177 | # calculate local linkage, i.e. log(cond.) * log(cond./marginal) 178 | data[, ls_unit := sum(p_group_g_unit * logf(p_group_g_unit / p_group, base)), 179 | by = unit_var 180 | ] 181 | } 182 | 183 | #' @import data.table 184 | bootstrap_summary <- function(ret, boot_ret, cols, CI) { 185 | setnames(boot_ret, "est", "boot_est") 186 | ret <- merge(ret, boot_ret, by = cols, sort = FALSE) 187 | # create a "debiased" version of bootstrap estimates 188 | ret[, est_debiased := 2 * est - boot_est, by = cols] 189 | pct <- c((1 - CI) / 2, 1 - (1 - CI) / 2) 190 | # estimate the "debiased" mean, standard error, CI, and quantify bias 191 | ret <- ret[, list( 192 | est = mean(est_debiased), 193 | se = stats::sd(est_debiased), 194 | CI = list(stats::quantile(est_debiased, pct)), 195 | bias = first(est) - mean(est_debiased) 196 | ), by = cols] 197 | ret[] 198 | } 199 | 200 | #' Turns a contingency table into long format 201 | #' 202 | #' Returns a data.table in long form, such that it is suitable 203 | #' for use in \link{mutual_total}, etc. Colnames and rownames of 204 | #' the matrix will be respected. 
205 | #' 206 | #' @param matrix A matrix, where the rows represent the units, and the 207 | #' columns represent the groups. 208 | #' @param group Variable name for group. (Default \code{group}) 209 | #' @param unit Variable name for unit. (Default \code{unit}) 210 | #' @param weight Variable name for frequency weight. (Default \code{weight}) 211 | #' @param drop_zero Drop unit-group combinations with zero weight. (Default \code{TRUE}) 212 | #' @return A data.table. 213 | #' @examples 214 | #' m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3) 215 | #' colnames(m) <- c("Black", "White") 216 | #' long <- matrix_to_long(m, group = "race", unit = "school") 217 | #' mutual_total(long, "race", "school", weight = "n") 218 | #' @import data.table 219 | #' @export 220 | matrix_to_long <- function(matrix, group = "group", unit = "unit", 221 | weight = "n", drop_zero = TRUE) { 222 | if (!is.matrix(matrix)) stop("matrix needs to be a matrix object") 223 | if (is.null(rownames(matrix))) rownames(matrix) <- seq_len(nrow(matrix)) 224 | if (is.null(colnames(matrix))) colnames(matrix) <- seq_len(ncol(matrix)) 225 | d <- as.data.table(matrix, keep.rownames = unit) 226 | long <- melt(d, 227 | id.vars = unit, 228 | variable.name = group, 229 | variable.factor = FALSE, 230 | value.name = weight 231 | ) 232 | if (drop_zero == TRUE) { 233 | ind <- long[[weight]] > 0 234 | long[ind] 235 | } else { 236 | long 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # segregation 4 | 5 | [![CRAN 6 | Version](https://www.r-pkg.org/badges/version/segregation)](https://CRAN.R-project.org/package=segregation) 7 | [![R build 8 | status](https://github.com/elbersb/segregation/workflows/R-CMD-check/badge.svg)](https://github.com/elbersb/segregation/actions) 9 | [![Coverage 10 | status](https://codecov.io/gh/elbersb/segregation/branch/master/graph/badge.svg)](https://app.codecov.io/github/elbersb/segregation?branch=master) 11 | 12 | An R package to calculate, visualize, and decompose various segregation 13 | indices. The package currently supports 14 | 15 | - the Mutual Information Index (M), 16 | - Theil’s Information Index (H), 17 | - the index of Dissimilarity (D), 18 | - the isolation and exposure indices. 19 | 20 | Find more information in `vignette("segregation")` and the 21 | [documentation](https://elbersb.de/segregation). 22 | 23 | The package also supports 24 | 25 | - [standard error and confidence interval estimation via 26 | bootstrapping](https://elbersb.com/public/posts/2021-11-24-segregation-bias/), 27 | which also corrects for small sample bias 28 | - decomposition of the M and H indices (within/between, local 29 | segregation) 30 | - decomposing differences in total segregation over time (Elbers 2020) 31 | - [segregation 32 | visualizations](https://elbersb.github.io/segregation/articles/plotting.html) 33 | (segregation curves and ‘segplots’) 34 | 35 | Most methods return [tidy](https://vita.had.co.nz/papers/tidy-data.html) 36 | [data.tables](https://rdatatable.gitlab.io/data.table/) for easy 37 | post-processing and plotting. For speed, the package uses the 38 | [`data.table`](https://rdatatable.gitlab.io/data.table/) package 39 | internally, and implements some functions in C++. 
40 | 41 | Most of the procedures implemented in this package are described in more 42 | detail [in this SMR 43 | paper](https://journals.sagepub.com/doi/full/10.1177/0049124121986204) 44 | ([Preprint](https://osf.io/preprints/socarxiv/ya7zs/)) and [in this 45 | working paper](https://osf.io/preprints/socarxiv/ruw4g/). 46 | 47 | ## Usage 48 | 49 | The package provides an easy way to calculate segregation measures, 50 | based on the Mutual Information Index (M) and Theil’s Entropy Index (H). 51 | 52 | ``` r 53 | library(segregation) 54 | 55 | # example dataset with fake data provided by the package 56 | mutual_total(schools00, "race", "school", weight = "n") 57 | #> stat est 58 | #> 59 | #> 1: M 0.426 60 | #> 2: H 0.419 61 | ``` 62 | 63 | Standard errors in all functions can be estimated via bootstrapping. This 64 | will also apply bias-correction to the estimates: 65 | 66 | ``` r 67 | mutual_total(schools00, "race", "school", 68 | weight = "n", 69 | se = TRUE, CI = 0.90, n_bootstrap = 500 70 | ) 71 | #> 500 bootstrap iterations on 877739 observations 72 | #> stat est se CI bias 73 | #> 74 | #> 1: M 0.422 0.000775 0.421,0.423 0.00361 75 | #> 2: H 0.415 0.000712 0.414,0.416 0.00356 76 | ``` 77 | 78 | Decompose segregation into a between-state and a within-state term (the 79 | sum of these equals total segregation): 80 | 81 | ``` r 82 | # between states 83 | mutual_total(schools00, "race", "state", weight = "n") 84 | #> stat est 85 | #> 86 | #> 1: M 0.0992 87 | #> 2: H 0.0977 88 | 89 | # within states 90 | mutual_total(schools00, "race", "school", within = "state", weight = "n") 91 | #> stat est 92 | #> 93 | #> 1: M 0.326 94 | #> 2: H 0.321 95 | ``` 96 | 97 | Local segregation (`ls`) is a decomposition by units or groups (here 98 | racial groups). This function also supports standard error and CI 99 | estimation. The sum of the proportion-weighted local segregation scores 100 | equals M: 101 | 102 | ``` r 103 | local <- mutual_local(schools00, 104 | group = "school", unit = "race", weight = "n", 105 | se = TRUE, CI = 0.90, n_bootstrap = 500, wide = TRUE 106 | ) 107 | #> 500 bootstrap iterations on 877739 observations 108 | local[, c("race", "ls", "p", "ls_CI")] 109 | #> race ls p ls_CI 110 | #> 111 | #> 1: asian 0.591 0.02255 0.582,0.601 112 | #> 2: black 0.876 0.19017 0.873,0.879 113 | #> 3: hisp 0.771 0.15167 0.767,0.775 114 | #> 4: white 0.183 0.62810 0.182,0.184 115 | #> 5: native 1.352 0.00751 1.32,1.38 116 | sum(local$p * local$ls) 117 | #> [1] 0.422 118 | ``` 119 | 120 | Decompose the difference in M between 2000 and 2005, using iterative 121 | proportional fitting (IPF) and the Shapley decomposition (see Elbers 122 | 2021 for details): 123 | 124 | ``` r 125 | mutual_difference(schools00, schools05, 126 | group = "race", unit = "school", 127 | weight = "n", method = "shapley" 128 | ) 129 | #> stat est 130 | #> 131 | #> 1: M1 0.42554 132 | #> 2: M2 0.41339 133 | #> 3: diff -0.01215 134 | #> 4: additions -0.00341 135 | #> 5: removals -0.01141 136 | #> 6: group_marginal 0.01787 137 | #> 7: unit_marginal -0.01171 138 | #> 8: structural -0.00349 139 | ``` 140 | 141 | Show a segplot: 142 | 143 | ``` r 144 | segplot(schools00, group = "race", unit = "school", weight = "n") 145 | ``` 146 | 147 | ![](man/figures/README-segplot-1.png) 148 | 149 | Find more information in the 150 | [documentation](https://elbersb.github.io/segregation/). 
151 | 152 | ## How to install 153 | 154 | To install the package from CRAN, use 155 | 156 | ``` r 157 | install.packages("segregation") 158 | ``` 159 | 160 | To install the development version, use 161 | 162 | ``` r 163 | devtools::install_github("elbersb/segregation") 164 | ``` 165 | 166 | ## Citation 167 | 168 | If you use this package for your research, please cite one of the 169 | following papers: 170 | 171 | - Elbers, Benjamin (2021). A Method for Studying Differences in 172 | Segregation Across Time and Space. Sociological Methods & Research. 173 | 174 | 175 | - Elbers, Benjamin and Rob Gruijters (2023). Segplot: A New Method for 176 | Visualizing Patterns of Multi-Group Segregation. 177 | 178 | 179 | ## Some additional resources 180 | 181 | - The book *Analyzing US Census Data: Methods, Maps, and Models in R* 182 | by Kyle E. Walker contains [a discussion of this 183 | package](https://walker-data.com/census-r/modeling-us-census-data.html#indices-of-segregation-and-diversity), 184 | and is a great resource for anyone working with spatial data, 185 | especially U.S. Census data. 186 | - A paper that makes use of this package: [Did Residential Racial 187 | Segregation in the U.S. Really Increase? An Analysis Accounting for 188 | Changes in Racial 189 | Diversity](https://elbersb.com/public/posts/2021-07-23-segregation-increase/) 190 | ([Code and Data](https://osf.io/mg9q4/)) 191 | - Some of the analyses [in this 192 | article](https://multimedia.tijd.be/diversiteit/) by the Belgian 193 | newspaper *De Tijd* used the package. 194 | - The analyses of [this article in the Wall Street 195 | Journal](https://www.wsj.com/articles/chicago-vs-dallas-why-the-north-lags-behind-the-south-and-west-in-racial-integration-11657936680) 196 | were produced using this package. 197 | 198 | ## References on entropy-based segregation indices 199 | 200 | Deutsch, J., Flückiger, Y. & Silber, J. (2009). Analyzing Changes in 201 | Occupational Segregation: The Case of Switzerland (1970–2000), in: Yves 202 | Flückiger, Sean F. Reardon, Jacques Silber (eds.) Occupational and 203 | Residential Segregation (Research on Economic Inequality, Volume 17), 204 | 171–202. 205 | 206 | DiPrete, T. A., Eller, C. C., Bol, T., & van de Werfhorst, H. G. (2017). 207 | School-to-Work Linkages in the United States, Germany, and France. 208 | American Journal of Sociology, 122(6), 1869-1938. 209 | 210 | 211 | Elbers, B. (2021). A Method for Studying Differences in Segregation 212 | Across Time and Space. Sociological Methods & Research. 213 | 214 | 215 | Forster, A. G., & Bol, T. (2017). Vocational education and employment 216 | over the life course using a new measure of occupational specificity. 217 | Social Science Research, 70, 176-197. 218 | 219 | 220 | Theil, H. (1971). Principles of Econometrics. New York: Wiley. 221 | 222 | Frankel, D. M., & Volij, O. (2011). Measuring school segregation. 223 | Journal of Economic Theory, 146(1), 1-38. 224 | 225 | 226 | Mora, R., & Ruiz-Castillo, J. (2003). Additively decomposable 227 | segregation indexes. The case of gender segregation by occupations and 228 | human capital levels in Spain. The Journal of Economic Inequality, 1(2), 229 | 147-179. 230 | 231 | Mora, R., & Ruiz-Castillo, J. (2009). The Invariance Properties of the 232 | Mutual Information Index of Multigroup Segregation, in: Yves Flückiger, 233 | Sean F. Reardon, Jacques Silber (eds.) Occupational and Residential 234 | Segregation (Research on Economic Inequality, Volume 17), 33-53. 
235 | 236 | Mora, R., & Ruiz-Castillo, J. (2011). Entropy-based Segregation Indices. 237 | Sociological Methodology, 41(1), 159–194. 238 | 239 | 240 | Van Puyenbroeck, T., De Bruyne, K., & Sels, L. (2012). More than ‘Mutual 241 | Information’: Educational and sectoral gender segregation and their 242 | interaction on the Flemish labor market. Labour Economics, 19(1), 1-8. 243 | 244 | 245 | Watts, M. The Use and Abuse of Entropy Based Segregation Indices. 246 | Working Paper. URL: 247 | 248 | -------------------------------------------------------------------------------- /R/ipf.R: -------------------------------------------------------------------------------- 1 | #' Adjustment of marginal distributions using iterative proportional fitting 2 | #' 3 | #' Adjusts the marginal distributions for \code{group} and \code{unit} 4 | #' in \code{source} to the respective marginal distributions in \code{target}, using the iterative 5 | #' proportional fitting algorithm (IPF). 6 | #' 7 | #' The algorithm works by scaling 8 | #' the marginal distribution of \code{group} in the \code{source} data frame towards the 9 | #' marginal distribution of \code{target}; then repeating this process for \code{unit}. The 10 | #' algorithm then keeps alternating between \code{group} and \code{unit} until the marginals 11 | #' of the adjusted data frame are within the allowed precision. This results in a dataset that 12 | #' retains the association structure of \code{source} while approximating 13 | #' the marginal distribution of \code{target}. If the number of \code{unit} and 14 | #' \code{group} categories is different in \code{source} and \code{target}, the returned data frame 15 | #' contains only the combinations of \code{unit} and \code{group} categories that occur in both datasets. 16 | #' Zero values are replaced by a small, non-zero number (1e-10, or smaller if the data contain frequencies below that). 17 | #' Note that the values returned sum to the number of observations in the source data frame, not the 18 | #' target data frame. This is different from other IPF implementations, but ensures that the IPF 19 | #' does not change the number of observations. 20 | #' 21 | #' @param source A "source" data frame. The marginals of this 22 | #' dataset are adjusted to the marginals of \code{target}. 23 | #' @param target A "target" data frame. The function returns a dataset 24 | #' where the marginal distributions of \code{group} and \code{unit} categories 25 | #' are approximated by those of \code{target}. 26 | #' @param group A categorical variable or a vector of variables 27 | #' contained in \code{source} and \code{target}. Defines the first distribution 28 | #' for adjustment. 29 | #' @param unit A categorical variable or a vector of variables 30 | #' contained in \code{source} and \code{target}. Defines the second distribution 31 | #' for adjustment. 32 | #' @param weight Numeric. (Default \code{NULL}) 33 | #' @param max_iterations Maximum number of iterations used for the IPF algorithm. 34 | #' @param precision Convergence criterion for the IPF algorithm. In every iteration, 35 | #' the absolute difference between the adjusted marginals and the target marginals is calculated 36 | #' for every category of \code{group} and \code{unit}. The algorithm converges when all 37 | #' differences are smaller than \code{precision}. 38 | #' @return Returns a data frame that retains 39 | #' the association structure of \code{source} while approximating 40 | #' the marginal distributions for \code{group} and \code{unit} of \code{target}.
41 | #' The dataset identifies each combination of \code{group} and \code{unit}, 42 | #' and categories that only occur in either \code{source} or \code{target} are dropped. 43 | #' The adjusted frequency of each combination is given by the column \code{n}, 44 | #' while \code{n_target} and \code{n_source} contain the zero-adjusted frequencies 45 | #' in the target and source dataset, respectively. 46 | #' @references 47 | #' W. E. Deming and F. F. Stephan. 1940. 48 | #' "On a Least Squares Adjustment of a Sampled Frequency Table 49 | #' When the Expected Marginal Totals are Known". 50 | #' Annals of Mathematical Statistics. 11 (4): 427–444. 51 | #' 52 | #' T. Karmel and M. Maclachlan. 1988. 53 | #' "Occupational Sex Segregation — Increasing or Decreasing?" Economic Record 64: 187-195. 54 | #' @examples 55 | #' \dontrun{ 56 | #' # adjusts the marginals of group and unit categories so that 57 | #' # schools00 has similar marginals as schools05 58 | #' adj <- ipf(schools00, schools05, "race", "school", weight = "n") 59 | #' 60 | #' # check that the new "race" marginals are similar to the target marginals 61 | #' # (the same could be done for schools) 62 | #' aggregate(adj$n, list(adj$race), sum) 63 | #' aggregate(adj$n_target, list(adj$race), sum) 64 | #' 65 | #' # note that the adjusted dataset contains fewer 66 | #' # schools than either the source or the target dataset, 67 | #' # because the marginals are only defined for the overlap 68 | #' # of schools 69 | #' length(unique(schools00$school)) 70 | #' length(unique(schools05$school)) 71 | #' length(unique(adj$school)) 72 | #' } 73 | #' @import data.table 74 | #' @export 75 | ipf <- function(source, target, group, unit, weight = NULL, 76 | max_iterations = 100, precision = 1e-07) { 77 | d1 <- prepare_data(source, group, unit, weight) 78 | d2 <- prepare_data(target, group, unit, weight) 79 | 80 | common_data <- create_common_data(d1, d2, group, unit) 81 | update_log(ipf_n = 1, ipf_max = 1) 82 | ret <- ipf_compute(common_data, group, unit, max_iterations, precision) 83 | close_log() 84 | 85 | as.data.frame(ret) 86 | } 87 | 88 | #' @import data.table 89 | create_common_data <- function(d1, d2, group, unit, suppress_warnings = FALSE, fill_na = 1e-10) { 90 | # generate the crossproduct of common groups and units to 91 | # preserve all possible combinations 92 | # we reselect to make sure that by removing some unit/group, we don't create any new 93 | # combinations that don't exist. 
There might be rare cases where the loop needs to be repeated 94 | # even further, but then the table is probably too sparse anyway 95 | common_group <- fintersect(d1[, group, with = FALSE], d2[, group, with = FALSE]) 96 | common_unit <- fintersect( 97 | merge(d1, common_group)[, unit, with = FALSE], 98 | merge(d2, common_group)[, unit, with = FALSE] 99 | ) 100 | common_group <- fintersect( 101 | merge(d1, common_unit)[, group, with = FALSE], 102 | merge(d2, common_unit)[, group, with = FALSE] 103 | ) 104 | common_group$key <- 1 105 | common_unit$key <- 1 106 | common <- merge(common_unit, common_group, allow.cartesian = TRUE) 107 | common[, "key" := NULL] 108 | 109 | # this is optional because mutual_difference handles this internally -- no need 110 | # to print a warning here 111 | if (suppress_warnings == FALSE) { 112 | group_removed_d1 <- fsetdiff(d1[, group, with = FALSE], common_group[, group, with = FALSE]) 113 | group_removed_d2 <- fsetdiff(d2[, group, with = FALSE], common_group[, group, with = FALSE]) 114 | unit_removed_d1 <- fsetdiff(d1[, unit, with = FALSE], common_unit[, unit, with = FALSE]) 115 | unit_removed_d2 <- fsetdiff(d2[, unit, with = FALSE], common_unit[, unit, with = FALSE]) 116 | 117 | if (nrow(group_removed_d1) > 0) { 118 | warning(paste0( 119 | "IPF procedure removed ", nrow(group_removed_d1), " group categories from source, ", 120 | "this likely reduced the sample size (check columns n_source and n_target)" 121 | )) 122 | } 123 | if (nrow(group_removed_d2) > 0) { 124 | warning(paste0( 125 | "IPF procedure removed ", nrow(group_removed_d2), " group categories from target, ", 126 | "this likely reduced the sample size (check columns n_source and n_target)" 127 | )) 128 | } 129 | if (nrow(unit_removed_d1) > 0) { 130 | warning(paste0( 131 | "IPF procedure removed ", nrow(unit_removed_d1), " units from source, ", 132 | "this likely reduced the sample size (check columns n_source and n_target)" 133 | )) 134 | } 135 | if (nrow(unit_removed_d2) > 0) { 136 | warning(paste0( 137 | "IPF procedure removed ", nrow(unit_removed_d2), " units from target, ", 138 | "this likely reduced the sample size (check columns n_source and n_target)" 139 | )) 140 | } 141 | } 142 | 143 | # make sure that fill_na is much smaller than the smallest frequency 144 | min_freq <- min(c(d1[["freq"]], d2[["freq"]])) 145 | if (fill_na > min_freq) { 146 | fill_na <- min_freq / 1000 147 | } 148 | 149 | # join original frequencies 150 | common <- merge(common, d1, by = c(group, unit), all.x = TRUE) 151 | setnames(common, "freq", "freq_orig1") 152 | common <- merge(common, d2, by = c(group, unit), all.x = TRUE) 153 | setnames(common, "freq", "freq_orig2") 154 | # NA in freq1 and freq2 are replaced by a small non-zero number 155 | common[, freq1 := ifelse(is.na(freq_orig1), fill_na, freq_orig1)] 156 | common[, freq2 := ifelse(is.na(freq_orig2), fill_na, freq_orig2)] 157 | common[, freq_orig1 := ifelse(is.na(freq_orig1), 0, freq_orig1)] 158 | common[, freq_orig2 := ifelse(is.na(freq_orig2), 0, freq_orig2)] 159 | common 160 | } 161 | 162 | 163 | #' @import data.table 164 | ipf_compute <- function(data, group, unit, 165 | max_iterations = 100, precision = .001, 166 | only_group = FALSE, only_unit = FALSE) { 167 | # work with relative weights 168 | data[, p1 := freq1 / sum(freq1)] 169 | data[, p2 := freq2 / sum(freq2)] 170 | data[, p_group_s := sum(p1), by = group] 171 | data[, p_unit_s := sum(p1), by = unit] 172 | 173 | if (only_group == TRUE) { 174 | data[, p_group_t := sum(p2), by = group] 175 | data[,
p_unit_t := sum(p1), by = unit] 176 | } else if (only_unit == TRUE) { 177 | data[, p_group_t := sum(p1), by = group] 178 | data[, p_unit_t := sum(p2), by = unit] 179 | } else { 180 | data[, p_group_t := sum(p2), by = group] 181 | data[, p_unit_t := sum(p2), by = unit] 182 | } 183 | 184 | # IPF algorithm 185 | converged <- FALSE 186 | start_i <- sample(2, 1) # start randomly with either row or column adjustment 187 | max_iterations <- max_iterations - 1 + start_i # offset so max_iterations is still the same 188 | 189 | for (i in start_i:(max_iterations * 2)) { 190 | if (i %% 2 == 0) { 191 | data[, `:=`(p1 = p1 * p_unit_t / p_unit_s)] 192 | } else { 193 | data[, `:=`(p1 = p1 * p_group_t / p_group_s)] 194 | } 195 | data[, p1 := p1 / sum(p1)] 196 | 197 | if (i %% 5 == 0) { 198 | update_log_progress("#") 199 | } 200 | 201 | data[, p_group_s := sum(p1), by = group] 202 | data[, p_unit_s := sum(p1), by = unit] 203 | 204 | group_diff <- data[, list(first(p_group_s), first(p_group_t)), by = group][, abs(V1 - V2)] 205 | unit_diff <- data[, list(first(p_unit_s), first(p_unit_t)), by = unit][, abs(V1 - V2)] 206 | 207 | if (all(group_diff <= precision) && all(unit_diff <= precision)) { 208 | converged <- TRUE 209 | break 210 | } 211 | } 212 | 213 | if (!converged) { 214 | close_log() 215 | stop("IPF did not converge; lower precision or increase max_iterations.") 216 | } 217 | 218 | # adjust ipf-adjusted count to sample size of source 219 | data[, n := p1 / sum(p1) * sum(freq1)] 220 | setnames(data, "freq1", "n_source") 221 | setnames(data, "freq2", "n_target") 222 | 223 | data[, c( 224 | "p_unit_t", "p_unit_s", "p_group_t", "p_group_s", 225 | "freq_orig1", "freq_orig2", "p1", "p2" 226 | ) := NULL] 227 | setkey(data, NULL) 228 | data 229 | } 230 | -------------------------------------------------------------------------------- /R/plots.R: -------------------------------------------------------------------------------- 1 | #' A visual representation of segregation 2 | #' 3 | #' Produces a segregation plot. 4 | #' 5 | #' @param data A data frame. 6 | #' @param group A categorical variable 7 | #' contained in \code{data}. Defines the first dimension 8 | #' over which segregation is computed. 9 | #' @param unit A categorical variable 10 | #' contained in \code{data}. Defines the second dimension 11 | #' over which segregation is computed. 12 | #' @param weight Numeric. (Default \code{NULL}) 13 | #' @param order A character, either 14 | #' "segregation", "entropy", "majority", or "majority_fixed". 15 | #' Affects the ordering of the units. 16 | #' The horizontal ordering of the groups can be changed 17 | #' by using a factor variable for \code{group}. 18 | #' The difference between "majority" and "majority_fixed" is that the former 19 | #' will reorder the groups in such a way that the majority group actually comes first. 20 | #' If you want to control the ordering yourself, use "majority_fixed" and specify 21 | #' the \code{group} variable as a factor variable. 22 | #' @param secondary_plot If \code{NULL} (default), no secondary plot is drawn. 23 | #' If "segregation", a secondary plot is drawn that shows adjusted local segregation 24 | #' scores for each unit. If "cumulative", a secondary plot is drawn that shows 25 | #' the cumulative contribution of each unit toward the total H (calculated as the 26 | #' proportion of each unit times the adjusted local segregation of each unit).
27 | #' @param reference_distribution Specifies the reference distribution, given as 28 | #' a two-column data frame, to be plotted on the right. 29 | #' If order is \code{segregation}, then this reference distribution is 30 | #' also used to compute the local segregation scores. 31 | #' @param bar_space Specifies space between single units. 32 | #' @param hline Default \code{NULL}. If a color is specified, 33 | #' horizontal lines will be drawn where groups are separated. 34 | #' @return Returns a ggplot2 or patchwork object. 35 | #' @import data.table 36 | #' @export 37 | segplot <- function(data, group, unit, weight, 38 | order = "segregation", secondary_plot = NULL, 39 | reference_distribution = NULL, 40 | bar_space = 0, hline = NULL) { 41 | if (!requireNamespace("patchwork", quietly = TRUE)) { 42 | stop("Please install patchwork to use this function") 43 | } 44 | if (!requireNamespace("ggplot2", quietly = TRUE)) { 45 | stop("Please install ggplot2 to use this function") 46 | } 47 | 48 | checkmate::assert_character(group, len = 1) 49 | checkmate::assert_character(unit, len = 1) 50 | d <- prepare_data(data, group, unit, weight) 51 | # easier if renamed 52 | setnames(d, group, "group") 53 | setnames(d, unit, "unit") 54 | 55 | # check other arguments 56 | checkmate::assert_choice(order, c("segregation", "entropy", "majority", "majority_fixed")) 57 | checkmate::assert_choice(secondary_plot, c("segregation", "cumulative"), null.ok = TRUE) 58 | checkmate::assert_data_frame(reference_distribution, ncols = 2, nrows = d[, uniqueN(group)], null.ok = TRUE) 59 | checkmate::assert_numeric(bar_space, len = 1, lower = 0) 60 | checkmate::assert_character(hline, len = 1, null.ok = TRUE) 61 | 62 | d[, unit := as.character(unit)] 63 | d[, p := freq / sum(freq), by = .(unit)] 64 | d[, p_unit := sum(freq), by = .(unit)] 65 | N <- d[, first(p_unit), by = .(unit)][, sum(V1)] 66 | d[, p_unit := p_unit / N] 67 | 68 | # overall 69 | if (is.null(reference_distribution)) { 70 | overall <- d[, .(freq = sum(freq)), by = .(group)] 71 | overall[, p := freq / sum(freq)] 72 | } else { 73 | stopifnot(names(reference_distribution) == c(group, "p")) 74 | overall <- as.data.table(reference_distribution) 75 | setnames(overall, group, "group") 76 | } 77 | # order by size 78 | setorder(overall, -p) 79 | group_order <- overall[["group"]] 80 | if (order == "majority") { 81 | # if majority, we always force a reordering by group size 82 | d[, group := as.character(group)] 83 | overall[, group := as.character(group)] 84 | d[, group := factor(group, levels = group_order)] 85 | overall[, group := factor(group, levels = group_order)] 86 | } 87 | 88 | wide <- data.table::dcast(d[, -"freq"], p_unit + unit ~ group, value.var = "p", fill = 0) 89 | 90 | if (order == "segregation") { 91 | ls <- merge(d, overall[, .(group, p_overall = p)], by = "group", all.x = TRUE) 92 | ls <- ls[, .(ls = sum(p * logf(p / p_overall))), by = .(unit)] 93 | wide <- merge(ls, wide, by = "unit") 94 | setorder(wide, -ls) 95 | } else if (order == "entropy") { 96 | ent <- d[, .(entropy = entropy(.SD, "group", weight = "freq")), by = .(unit)] 97 | wide <- merge(ent, wide, by = "unit") 98 | setorder(wide, entropy) 99 | } else if (order %in% c("majority", "majority_fixed")) { 100 | if (is.factor(d[["group"]])) { 101 | group_order <- d[, levels(group)] 102 | } 103 | setorderv(wide, 104 | c(group_order[[1]], utils::tail(group_order, 1)), 105 | order = c(1, -1) 106 | ) 107 | } 108 | 109 | # format units 110 | wide[, xmin := cumsum(p_unit) - p_unit] 111 | wide[, xmax := 
cumsum(p_unit)] 112 | wide[, xmin := xmin + (.I - 1) * bar_space] 113 | wide[, xmax := xmax + (.I - 1) * bar_space] 114 | d <- merge(d, wide[, .(unit, xmin, xmax)], by = "unit") 115 | setorderv(d, c("xmin", "group")) 116 | d[, ymin := cumsum(p) - p, by = .(unit)] 117 | d[, ymax := cumsum(p), by = .(unit)] 118 | 119 | if (bar_space == 0) { 120 | breaks <- c(wide[["xmin"]], wide[, max(xmax)]) 121 | } else { 122 | breaks <- c() 123 | } 124 | 125 | # format overall 126 | overall[, xmin := wide[, max(xmax)] + 0.05] 127 | overall[, xmax := wide[, max(xmax)] + 0.10] 128 | setorderv(overall, "group") 129 | overall[, ymin := cumsum(p) - p] 130 | overall[, ymax := cumsum(p)] 131 | 132 | combine <- rbindlist(list(d, overall), use.names = TRUE, fill = TRUE) 133 | plot <- ggplot2::ggplot( 134 | combine, 135 | ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax) 136 | ) + 137 | ggplot2::geom_vline(xintercept = wide[, max(xmax)]) + 138 | ggplot2::geom_vline(xintercept = wide[, max(xmax) + 0.05]) + 139 | ggplot2::geom_rect(ggplot2::aes(fill = .data[["group"]])) + 140 | ggplot2::scale_x_continuous(breaks = breaks, expand = c(0, 0)) + 141 | ggplot2::scale_y_continuous( 142 | labels = scales::percent_format(), 143 | expand = c(0, 0), 144 | sec.axis = ggplot2::dup_axis() 145 | ) + 146 | ggplot2::guides(fill = ggplot2::guide_legend(reverse = TRUE)) + 147 | ggplot2::theme_bw() + 148 | ggplot2::theme( 149 | panel.grid = ggplot2::element_blank(), 150 | axis.text.x = ggplot2::element_blank(), 151 | legend.position = "right" 152 | ) + 153 | ggplot2::labs(fill = NULL) 154 | 155 | if (!is.null(hline)) { 156 | plot <- plot + ggplot2::geom_hline(yintercept = overall[1:(.N - 1), ymax], color = hline) 157 | } 158 | 159 | if (order == "segregation" && is.null(secondary_plot)) { 160 | plot <- plot + ggplot2::labs(x = "< more segregated | less segregated >") 161 | } else if (order == "segregation") { 162 | entropy <- entropy(overall, "group", weight = "p") 163 | H_index <- wide[, sum(ls * p_unit) / entropy] 164 | 165 | if (secondary_plot == "segregation") { 166 | wide[, stat := ls / entropy] 167 | label <- "adj. 
LS" 168 | } else if (secondary_plot == "cumulative") { 169 | wide[, stat := cumsum(p_unit * ls / entropy)] 170 | wide[, stat := max(stat) - stat + min(stat)] 171 | label <- "Cumulative" 172 | } 173 | 174 | stat_segments_h <- wide[, .(unit, x = ifelse(.I == 1, xmin, xmin - bar_space), xend = xmax, y = stat, yend = stat)] 175 | stat_segments_v <- wide[, .(unit, x = xmax, xend = xmax, y = stat, yend = shift(stat, type = "lead"))] 176 | stat_segments <- rbindlist(list(stat_segments_h, stat_segments_v)) 177 | sub_plot <- ggplot2::ggplot( 178 | mapping = ggplot2::aes(x = x, y = y, xend = xend, yend = yend) 179 | ) + 180 | ggplot2::geom_vline(xintercept = wide[, max(xmax)], linewidth = 0.2) + 181 | ggplot2::geom_hline(yintercept = H_index, color = "orange") + 182 | ggplot2::geom_segment(data = stats::na.omit(stat_segments)) + 183 | ggplot2::theme_bw() + 184 | ggplot2::scale_x_continuous( 185 | limits = c(0, overall[, max(xmax)]), expand = c(0, 0), 186 | labels = function(x) scales::label_percent(1)(1 - x) 187 | ) + 188 | ggplot2::scale_y_continuous(sec.axis = ggplot2::dup_axis( 189 | breaks = H_index, 190 | labels = function(x) paste0("H = ", round(H_index, 3)) 191 | )) + 192 | ggplot2::theme( 193 | panel.grid.major.x = ggplot2::element_blank(), 194 | panel.grid.minor.x = ggplot2::element_blank(), 195 | panel.grid.minor.y = ggplot2::element_blank(), 196 | axis.text.x = ggplot2::element_blank(), 197 | axis.ticks.x = ggplot2::element_blank(), 198 | axis.title.y.right = ggplot2::element_blank() 199 | ) + 200 | ggplot2::labs(y = label, x = "< more segregated | less segregated >") 201 | 202 | plot <- plot / sub_plot + patchwork::plot_layout(heights = c(4, 1)) 203 | } 204 | 205 | plot 206 | } 207 | 208 | 209 | #' A visual representation of two-group segregation 210 | #' 211 | #' Produces one or several segregation curves, as defined in Duncan and Duncan (1955) 212 | #' 213 | #' @param data A data frame. 214 | #' @param group A categorical variable contained in \code{data}. 215 | #' Defines the first dimension over which segregation is computed. 216 | #' @param unit A categorical variable contained in \code{data}. 217 | #' Defines the second dimension over which segregation is computed. 218 | #' @param weight Numeric. (Default \code{NULL}) 219 | #' @param segment A categorical variable contained in \code{data}. (Default \code{NULL}) 220 | #' If given, several segregation curves will be shown, one for each segment. 221 | #' @return Returns a ggplot2 object. 
222 | #' @import data.table 223 | #' @export 224 | segcurve <- function(data, group, unit, weight = NULL, segment = NULL) { 225 | if (!requireNamespace("ggplot2", quietly = TRUE)) { 226 | stop("Please install ggplot2 to use this function") 227 | } 228 | 229 | stopifnot(length(group) == 1) 230 | stopifnot(length(unit) == 1) 231 | d <- prepare_data(data, group, unit, weight, within = segment) 232 | # easier if renamed 233 | setnames(d, group, "group") 234 | setnames(d, unit, "unit") 235 | if (is.null(segment)) { 236 | d[["segment"]] <- 1 237 | } else { 238 | stopifnot(length(segment) == 1) 239 | setnames(d, segment, "segment") 240 | d[["segment"]] <- as.factor(d[["segment"]]) 241 | } 242 | 243 | if (d[, uniqueN(group)] != 2) { 244 | stop("requires exactly two groups") 245 | } 246 | 247 | wide <- dcast(d, segment + unit ~ group, value.var = "freq", fill = 0) 248 | group_names <- names(wide)[3:4] 249 | setnames(wide, group_names, c("group1", "group2")) 250 | wide[, pct_group_1 := group1 / (group1 + group2)] 251 | setorder(wide, segment, pct_group_1) 252 | wide[, cumul_prob_1 := cumsum(group1) / sum(group1), by = .(segment)] 253 | wide[, cumul_prob_2 := cumsum(group2) / sum(group2), by = .(segment)] 254 | # need to add line through origin 255 | wide <- wide[, .(segment, cumul_prob_1, cumul_prob_2)] 256 | zeros <- wide[, .(cumul_prob_1 = 0, cumul_prob_2 = 0), by = .(segment)] 257 | wide <- rbindlist(list(wide, zeros)) 258 | 259 | p <- ggplot2::ggplot(wide, ggplot2::aes(x = cumul_prob_2, y = cumul_prob_1)) + 260 | ggplot2::annotate(geom = "segment", x = 0, y = 0, xend = 1, yend = 1, colour = "darkgray") + 261 | ggplot2::scale_x_continuous(labels = scales::percent_format(accuracy = 1)) + 262 | ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + 263 | ggplot2::labs( 264 | x = paste("Cumulative % ", group_names[2]), 265 | y = paste("Cumulative % ", group_names[1]) 266 | ) + 267 | ggplot2::coord_fixed() 268 | 269 | if (is.null(segment)) { 270 | p <- p + ggplot2::geom_line() 271 | } else { 272 | p <- p + 273 | ggplot2::geom_line(ggplot2::aes(color = segment)) + 274 | ggplot2::labs(color = segment) 275 | } 276 | 277 | return(p) 278 | } 279 | -------------------------------------------------------------------------------- /R/mutual_expected.R: -------------------------------------------------------------------------------- 1 | expected_compute <- function(index, d, group_var, unit_var, 2 | fixed_margins, n_bootstrap, base) { 3 | n_group <- d[, sum(freq), by = group_var][, V1] 4 | by_unit <- d[, list(n = sum(freq)), by = unit_var] # to keep the order 5 | n_unit <- by_unit[, n] 6 | if (length(n_group) == 1 || length(n_unit) == 1) { 7 | if (index == "mh") { 8 | return(data.table( 9 | stat = c("M under 0", "H under 0"), 10 | est = c(NA_real_, NA_real_), se = c(NA_real_, NA_real_) 11 | )) 12 | } else { 13 | return(data.table( 14 | stat = "D under 0", 15 | est = NA_real_, se = NA_real_ 16 | )) 17 | } 18 | } 19 | p_group <- n_group / sum(n_group) 20 | p_unit <- n_unit / sum(n_unit) 21 | 22 | if (fixed_margins == TRUE) { 23 | # take the margins from the table provided 24 | entropy_group <- -sum(p_group * logf(p_group, base)) 25 | sim_fixed <- stats::r2dtable(n_bootstrap, n_group, n_unit) 26 | } else { 27 | # simulate margins using a multinomial model 28 | group_margins <- stats::rmultinom(n_bootstrap, sum(n_group), p_group) 29 | unit_margins <- stats::rmultinom(n_bootstrap, sum(n_unit), p_unit) 30 | } 31 | 32 | boot_ret <- lapply(seq_len(n_bootstrap), function(i) { 33 | if (i %% 5 == 0) 
update_log(bs_n = i, bs_max = n_bootstrap) 34 | 35 | if (index %in% c("ls", "mh")) { 36 | if (fixed_margins == TRUE) { 37 | p <- sim_fixed[[i]] / sum(sim_fixed[[i]]) 38 | # margins are fixed, just copy over 39 | p_group_sim <- p_group 40 | p_unit_sim <- p_unit 41 | entropy_group_sim <- entropy_group 42 | } else { 43 | sim <- stats::r2dtable(1, group_margins[, i], unit_margins[, i])[[1]] 44 | p <- sim / sum(sim) 45 | # margins are simulated, calculate new margins 46 | p_group_sim <- rowSums(p) 47 | p_unit_sim <- colSums(p) 48 | entropy_group_sim <- -sum(p_group_sim * logf(p_group_sim, base)) 49 | } 50 | 51 | # calculate local segregation scores per unit 52 | p_within_unit <- sweep(p, 2, p_unit_sim, "/") 53 | ls <- apply(p_within_unit, 2, function(x) sum(x * logf(x / p_group_sim, base))) 54 | 55 | if (index == "mh") { 56 | m <- sum(p_unit_sim * ls) 57 | h <- m / entropy_group_sim 58 | data.table( 59 | stat = c("M under 0", "H under 0"), 60 | est = c(m, h) 61 | ) 62 | } else { 63 | ls_data <- copy(by_unit) 64 | ls_data[, n := NULL] 65 | ls_data[, ls := ls] 66 | ls_data[, p := p_unit_sim] 67 | ls_data 68 | } 69 | } else { 70 | if (fixed_margins == TRUE) { 71 | tab <- sim_fixed[[i]] 72 | } else { 73 | tab <- stats::r2dtable(1, group_margins[, i], unit_margins[, i])[[1]] 74 | } 75 | 76 | div <- sweep(tab, 1, rowSums(tab), "/") 77 | d <- 1 / 2 * sum(apply(div, 2, abs_diff)) 78 | data.table( 79 | stat = c("D under 0"), 80 | est = d 81 | ) 82 | } 83 | }) 84 | close_log() 85 | 86 | res <- data.table::rbindlist(boot_ret) 87 | if (index == "ls") { 88 | res[, .(stat = "LS under 0", est = mean(ls), se = stats::sd(ls), p_mean = mean(p)), by = unit_var] 89 | } else { 90 | res[, list(est = mean(est), se = stats::sd(est)), by = .(stat)] 91 | } 92 | } 93 | 94 | #' Calculates expected values when true segregation is zero 95 | #' 96 | #' When sample sizes are small, one group has a small proportion, or 97 | #' when there are many units, segregation indices are typically upwardly 98 | #' biased, even when true segregation is zero. This function simulates 99 | #' tables with zero segregation, given the marginals of the dataset, 100 | #' and calculates segregation. If the expected values are large, 101 | #' the interpretation of index scores might have to be adjusted. 102 | #' 103 | #' @param data A data frame. 104 | #' @param group A categorical variable or a vector of variables 105 | #' contained in \code{data}. Defines the first dimension 106 | #' over which segregation is computed. 107 | #' @param unit A categorical variable or a vector of variables 108 | #' contained in \code{data}. Defines the second dimension 109 | #' over which segregation is computed. 110 | #' @param weight Numeric. (Default \code{NULL}) 111 | #' @param within Apply algorithm within each group defined by this variable, 112 | #' and report the weighted average. (Default \code{NULL}) 113 | #' @param fixed_margins Should the margins be fixed or simulated? (Default \code{TRUE}) 114 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 115 | #' @param base Base of the logarithm that is used in the calculation. 116 | #' Defaults to the natural logarithm. 117 | #' @return A data.table with two rows, corresponding to the expected values of 118 | #' segregation when true segregation is zero. 
119 | #' @examples 120 | #' \dontrun{ 121 | #' # the schools00 dataset has a large sample size, so expected segregation is close to zero 122 | #' mutual_total_expected(schools00, "race", "school", weight = "n") 123 | #' 124 | #' # but we can build a smaller table, with 100 students distributed across 125 | #' # 10 schools, where one racial group has 10% of the students 126 | #' small <- data.frame( 127 | #' school = c(1:10, 1:10), 128 | #' race = c(rep("r1", 10), rep("r2", 10)), 129 | #' n = c(rep(1, 10), rep(9, 10)) 130 | #' ) 131 | #' mutual_total_expected(small, "race", "school", weight = "n") 132 | #' # with an increase in sample size (n=1000), the values improve 133 | #' small$n <- small$n * 10 134 | #' mutual_total_expected(small, "race", "school", weight = "n") 135 | #' } 136 | #' @import data.table 137 | #' @export 138 | mutual_total_expected <- function(data, group, unit, weight = NULL, 139 | within = NULL, fixed_margins = TRUE, 140 | n_bootstrap = 100, base = exp(1)) { 141 | d <- prepare_data(data, group, unit, weight, within) 142 | n_total <- d[, sum(freq)] 143 | if (all.equal(n_total, round(n_total)) != TRUE) { 144 | stop(paste0( 145 | "bootstrap with a total sample size that is not an integer is not allowed, ", 146 | "maybe scale your weights?" 147 | )) 148 | } 149 | 150 | if (is.null(within)) { 151 | res <- expected_compute( 152 | "mh", d, group, unit, 153 | fixed_margins, n_bootstrap, base 154 | ) 155 | } else { 156 | res <- d[, expected_compute( 157 | "mh", .SD, ..group, ..unit, 158 | ..fixed_margins, ..n_bootstrap, ..base 159 | ), 160 | by = within 161 | ] 162 | n_na <- sum(is.na(res$se)) / 2 163 | if (n_na > 0) { 164 | message(paste0("Removed ", n_na, " singleton items")) 165 | res <- res[!is.na(est)] 166 | } 167 | p <- d[, .(p = sum(freq)), by = within] 168 | res <- merge(res, p, all.x = TRUE, by = within) 169 | res[, p := p / sum(p), by = .(stat)] 170 | res <- res[, .(est = sum(est * p), se = sqrt(sum(p * se^2))), by = .(stat)] 171 | } 172 | res 173 | } 174 | 175 | #' Calculates expected values when true segregation is zero 176 | #' 177 | #' When sample sizes are small, one group has a small proportion, or 178 | #' when there are many units, segregation indices are typically upwardly 179 | #' biased, even when true segregation is zero. This function simulates 180 | #' tables with zero segregation, given the marginals of the dataset, 181 | #' and calculates segregation. If the expected values are large, 182 | #' the interpretation of index scores might have to be adjusted. 183 | #' 184 | #' @param data A data frame. 185 | #' @param group A categorical variable or a vector of variables 186 | #' contained in \code{data}. Defines the first dimension 187 | #' over which segregation is computed. 188 | #' @param unit A categorical variable or a vector of variables 189 | #' contained in \code{data}. Defines the second dimension 190 | #' over which segregation is computed. 191 | #' @param weight Numeric. (Default \code{NULL}) 192 | #' @param fixed_margins Should the margins be fixed or simulated? (Default \code{TRUE}) 193 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 194 | #' @return A data.table with one row, corresponding to the expected value of 195 | #' the D index when true segregation is zero. 
196 | #' @examples 197 | #' # build a smaller table, with 100 students distributed across 198 | #' # 10 schools, where one racial group has 10% of the students 199 | #' small <- data.frame( 200 | #' school = c(1:10, 1:10), 201 | #' race = c(rep("r1", 10), rep("r2", 10)), 202 | #' n = c(rep(1, 10), rep(9, 10)) 203 | #' ) 204 | #' dissimilarity_expected(small, "race", "school", weight = "n") 205 | #' # with an increase in sample size (n=1000), the values improve 206 | #' small$n <- small$n * 10 207 | #' dissimilarity_expected(small, "race", "school", weight = "n") 208 | #' @import data.table 209 | #' @export 210 | dissimilarity_expected <- function(data, group, unit, weight = NULL, 211 | fixed_margins = TRUE, 212 | n_bootstrap = 100) { 213 | if (length(unique(data[[group]])) != 2) { 214 | stop("The D index only allows two distinct groups") 215 | } 216 | d <- prepare_data(data, group, unit, weight) 217 | 218 | n_total <- d[, sum(freq)] 219 | if (all.equal(n_total, round(n_total)) != TRUE) { 220 | stop(paste0( 221 | "bootstrap with a total sample size that is not an integer is not allowed, ", 222 | "maybe scale your weights?" 223 | )) 224 | } 225 | 226 | expected_compute( 227 | "d", d, group, unit, 228 | fixed_margins, n_bootstrap, exp(1) 229 | ) 230 | } 231 | 232 | #' Calculates expected local segregation scores when true segregation is zero 233 | #' 234 | #' When sample sizes are small, one group has a small proportion, or 235 | #' when there are many units, segregation indices are typically upwardly 236 | #' biased, even when true segregation is zero. This function simulates 237 | #' tables with zero segregation, given the marginals of the dataset, 238 | #' and calculates local segregation scores. If the expected values are large, 239 | #' the interpretation of index scores might have to be adjusted. 240 | #' 241 | #' @param data A data frame. 242 | #' @param group A categorical variable or a vector of variables 243 | #' contained in \code{data}. Defines the first dimension 244 | #' over which segregation is computed. 245 | #' @param unit A categorical variable or a vector of variables 246 | #' contained in \code{data}. Defines the group for which local 247 | #' segregation indices are calculated. 248 | #' @param weight Numeric. (Default \code{NULL}) 249 | #' @param fixed_margins Should the margins be fixed or simulated? (Default \code{TRUE}) 250 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 251 | #' @param base Base of the logarithm that is used in the calculation. 252 | #' Defaults to the natural logarithm. 253 | #' @return A data.table with one row for each category of \code{unit}, giving the expected 254 | #' local segregation score (\code{est}), its standard error (\code{se}), and the average marginal proportion (\code{p_mean}) when true segregation is zero.
255 | #' @examples 256 | #' \dontrun{ 257 | #' # the schools00 dataset has a large sample size, so expected segregation is close to zero 258 | #' mutual_local_expected(schools00, "race", "school", weight = "n") 259 | #' 260 | #' # but we can build a smaller table, with 100 students distributed across 261 | #' # 10 schools, where one racial group has 10% of the students 262 | #' small <- data.frame( 263 | #' school = c(1:10, 1:10), 264 | #' race = c(rep("r1", 10), rep("r2", 10)), 265 | #' n = c(rep(1, 10), rep(9, 10)) 266 | #' ) 267 | #' mutual_local_expected(small, "race", "school", weight = "n") 268 | #' # with an increase in sample size (n=1000), the values improve 269 | #' small$n <- small$n * 10 270 | #' mutual_local_expected(small, "race", "school", weight = "n") 271 | #' } 272 | #' @import data.table 273 | #' @export 274 | mutual_local_expected <- function(data, group, unit, weight = NULL, 275 | fixed_margins = TRUE, n_bootstrap = 100, base = exp(1)) { 276 | d <- prepare_data(data, group, unit, weight) 277 | n_total <- d[, sum(freq)] 278 | if (all.equal(n_total, round(n_total)) != TRUE) { 279 | stop(paste0( 280 | "bootstrap with a total sample size that is not an integer is not allowed, ", 281 | "maybe scale your weights?" 282 | )) 283 | } 284 | 285 | expected_compute( 286 | "ls", d, group, unit, 287 | fixed_margins, n_bootstrap, base 288 | ) 289 | } 290 | -------------------------------------------------------------------------------- /src/compression.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::depends(RcppProgress)]] 2 | #include <Rcpp.h> 3 | #include <progress.hpp> 4 | #include <progress_bar.hpp> 5 | using namespace Rcpp; 6 | 7 | struct CompressionResults 8 | { 9 | std::vector<int> iter; 10 | std::vector<double> M; 11 | std::vector<double> M_wgt; 12 | std::vector<int> N_units; 13 | std::vector<std::string> old_unit; 14 | std::vector<std::string> new_unit; 15 | }; 16 | 17 | std::tuple<double, double> calculate_m(std::map<std::string, std::vector<double>> &data) 18 | { 19 | // create group sums 20 | int n_groups = data.begin()->second.size(); 21 | std::vector<double> group_sums(n_groups, 0.0); 22 | for (auto &[unit, counts] : data) 23 | { 24 | for (int i = 0; i < n_groups; i++) 25 | group_sums[i] += counts[i]; 26 | } 27 | 28 | double n_total = std::accumulate(group_sums.begin(), group_sums.end(), 0.0); 29 | 30 | double m_total = 0.0; 31 | for (auto &[unit, counts] : data) 32 | { 33 | double n_unit = std::accumulate(counts.begin(), counts.end(), 0.0); 34 | for (int i = 0; i < n_groups; i++) 35 | { 36 | double obs = counts[i] / n_total; 37 | if (obs == 0) 38 | continue; 39 | double exp = (n_unit / n_total * group_sums[i] / n_total); 40 | m_total += obs * std::log(obs / exp); 41 | } 42 | } 43 | 44 | return std::make_tuple(n_total, m_total); 45 | } 46 | 47 | std::vector<std::pair<std::string, double>> calculate_ls(std::map<std::string, std::vector<double>> &data) 48 | { 49 | // create group sums 50 | int n_groups = data.begin()->second.size(); 51 | std::vector<double> group_sums(n_groups, 0.0); 52 | for (auto &[unit, counts] : data) 53 | { 54 | for (int i = 0; i < n_groups; i++) 55 | group_sums[i] += counts[i]; 56 | } 57 | 58 | // create group proportions 59 | double n_total = std::accumulate(group_sums.begin(), group_sums.end(), 0.0); 60 | std::vector<double> group_p(n_groups, 0.0); 61 | for (int i = 0; i < n_groups; i++) 62 | { 63 | group_p[i] = group_sums[i] / n_total; 64 | } 65 | 66 | // create local segregation scores for each unit 67 | std::vector<std::pair<std::string, double>> ls; 68 | for (auto &[unit, counts] : data) 69 | { 70 | double n_unit = std::accumulate(counts.begin(), counts.end(), 0.0); 71 | double ls_unit = 0.0; 72 | for (int i = 0; i < n_groups; i++) 73 | { 74 | double p_group_given_unit = counts[i] / n_unit; 75 | if (p_group_given_unit == 0) 76 | continue; 77 | ls_unit += p_group_given_unit * std::log(p_group_given_unit / group_p[i]); 78 | } 79 | ls.push_back({unit, ls_unit}); 80 | } 81 | 82 | std::sort(ls.begin(), ls.end(), [](auto &left, auto &right) 83 | { return left.second < right.second; }); 84 | 85 | return ls; 86 | } 87 | 88 | double calculate_reduction(double n, std::vector<double> &unit1, std::vector<double> &unit2) 89 | { 90 | // create group sums 91 | const int n_groups = unit1.size(); 92 | double n_total = 0.0; 93 | std::vector<double> group_sums(n_groups, 0.0); 94 | for (int i = 0; i < n_groups; i++) 95 | { 96 | group_sums[i] += unit1[i]; 97 | group_sums[i] += unit2[i]; 98 | n_total += unit1[i]; 99 | n_total += unit2[i]; 100 | } 101 | 102 | // create unit sums 103 | const double n_unit1 = std::accumulate(unit1.begin(), unit1.end(), 0.0); 104 | const double n_unit2 = std::accumulate(unit2.begin(), unit2.end(), 0.0); 105 | 106 | // calculate M 107 | double m_total = 0.0; 108 | for (int i = 0; i < n_groups; i++) 109 | { 110 | // unit 1 111 | double obs1 = unit1[i] / n_total; 112 | if (obs1 != 0) 113 | { 114 | double exp1 = (n_unit1 / n_total * group_sums[i] / n_total); 115 | m_total += obs1 * std::log(obs1 / exp1); 116 | } 117 | // unit 2 118 | double obs2 = unit2[i] / n_total; 119 | if (obs2 != 0) 120 | { 121 | double exp2 = (n_unit2 / n_total * group_sums[i] / n_total); 122 | m_total += obs2 * std::log(obs2 / exp2); 123 | } 124 | } 125 | 126 | return n_total / n * m_total; 127 | } 128 | 129 | typedef std::pair<std::string, std::string> t_neighbor; 130 | 131 | t_neighbor neighbor_make_pair(std::string a, std::string b) 132 | { 133 | if (a < b) 134 | return t_neighbor(a, b); 135 | else 136 | return t_neighbor(b, a); 137 | } 138 | 139 | // [[Rcpp::export]] 140 | List compress_compute_cpp( 141 | std::string neighbors_option, 142 | StringMatrix m_neighbors, 143 | int n_neighbors, 144 | NumericMatrix m_data, 145 | std::vector<std::string> unit_names, 146 | int max_iter) 147 | { 148 | // prepare main data structure: map, where the key is the unit name 149 | // and the values are the ordered group counts 150 | std::map<std::string, std::vector<double>> data; 151 | for (int i = 0; i < m_data.nrow(); i++) 152 | { 153 | auto unit = unit_names[i]; 154 | data[unit] = {}; 155 | for (int j = 0; j < m_data.ncol(); j++) 156 | data[unit].push_back(m_data(i, j)); 157 | } 158 | 159 | int n_groups = m_data.ncol(); 160 | // compute total M index 161 | double n_total, m_total; 162 | std::tie(n_total, m_total) = calculate_m(data); 163 | 164 | // prepare neighbors data structure (list of pairs, where order is unimportant) 165 | std::map<t_neighbor, double> neighbors; 166 | if (neighbors_option == "all") 167 | { 168 | for (int row = 0; row < unit_names.size(); row++) 169 | { 170 | for (int col = row + 1; col < unit_names.size(); col++) 171 | { 172 | neighbors[neighbor_make_pair(unit_names[row], unit_names[col])] = 0; 173 | } 174 | } 175 | } 176 | else if (neighbors_option == "local") 177 | { 178 | 179 | auto ls = calculate_ls(data); 180 | for (int i = 0; i < ls.size(); i++) 181 | { 182 | for (int j = std::max(i - n_neighbors, 0); 183 | j < std::min(i + n_neighbors + 1, static_cast<int>(ls.size()) - 1); 184 | j++) 185 | { 186 | if (i != j) 187 | neighbors[neighbor_make_pair(ls[i].first, ls[j].first)] = 0; 188 | } 189 | } 190 | } 191 | else if (neighbors_option == "df") 192 | { 193 | for (int i = 0; i < m_neighbors.nrow(); i++) 194 | { 195 | std::string unit1 = Rcpp::as<std::string>(m_neighbors(i, 0)); 196 | std::string unit2 = Rcpp::as<std::string>(m_neighbors(i, 1)); 197 | if (unit1 != unit2) 198 | neighbors[neighbor_make_pair(unit1, unit2)] = 0; 199 | } 200 | } 201 | 202 | // calculate reduction for each neighbor pair 203 | // (we don't do this in the previous step because otherwise we 204 | // might do a lot of duplicate calculations) 205 | for (const auto &[key, reduction] : neighbors) 206 | { 207 | neighbors[neighbor_make_pair(key.first, key.second)] = calculate_reduction(n_total, data[key.first], data[key.second]); 208 | } 209 | 210 | // determine maximum number of iterations 211 | if (max_iter == -1) 212 | { 213 | max_iter = std::min(static_cast<int>(neighbors.size()), static_cast<int>(unit_names.size()) - 1); 214 | } 215 | 216 | CompressionResults results; 217 | results.iter.reserve(max_iter); 218 | results.M_wgt.reserve(max_iter); 219 | results.M.reserve(max_iter); 220 | results.N_units.reserve(max_iter); 221 | results.old_unit.reserve(max_iter); 222 | results.new_unit.reserve(max_iter); 223 | 224 | int counter = 0; 225 | double m_current = m_total; 226 | int n_units_current = m_data.nrow(); 227 | 228 | Progress p(n_units_current, true); 229 | while (neighbors.size() > 0) 230 | { 231 | if (Progress::check_abort()) 232 | throw Rcpp::exception("user interruption"); 233 | 234 | // find smallest reduction 235 | double min_reduction = 10000; 236 | t_neighbor min_key; 237 | for (const auto &[key, reduction] : neighbors) 238 | { 239 | if (reduction < min_reduction) 240 | { 241 | min_reduction = reduction; 242 | min_key = key; 243 | if (reduction == 0) 244 | break; 245 | } 246 | } 247 | 248 | const std::string unit_keep = min_key.first; 249 | const std::string unit_delete = min_key.second; 250 | // add counts of 'delete' to 'keep', delete unit 251 | for (int i = 0; i < n_groups; i++) 252 | data[unit_keep][i] += data[unit_delete][i]; 253 | data.erase(unit_delete); 254 | 255 | // update neighbors 256 | neighbors.erase(min_key); 257 | 258 | std::vector<t_neighbor> delete_neighbors; 259 | std::map<t_neighbor, double> new_neighbors; 260 | for (const auto &[key, reduction] : neighbors) 261 | { 262 | // update pairs if deleted unit is involved 263 | if (key.first == unit_delete || key.second == unit_delete) 264 | { 265 | // this is a pair some_unit - deleted_unit -- replace deleted_unit with unit_keep 266 | delete_neighbors.push_back(key); 267 | std::string some_unit = (key.first == unit_delete) ? key.second : key.first; 268 | new_neighbors[neighbor_make_pair(unit_keep, some_unit)] = 269 | calculate_reduction(n_total, data[unit_keep], data[some_unit]); 270 | } 271 | // recalculate reduction if kept unit is involved 272 | else if (key.first == unit_keep || key.second == unit_keep) 273 | { 274 | new_neighbors[key] = calculate_reduction(n_total, data[key.first], data[key.second]); 275 | } 276 | } 277 | 278 | // delete neighbors that involve the old unit 279 | for (int i = 0; i < delete_neighbors.size(); i++) 280 | neighbors.erase(delete_neighbors[i]); 281 | 282 | // update and add new neighbors 283 | for (const auto &[key, reduction] : new_neighbors) 284 | neighbors[key] = reduction; 285 | 286 | // update results 287 | m_current -= min_reduction; 288 | n_units_current -= 1; 289 | counter += 1; 290 | results.iter.push_back(counter); 291 | results.M_wgt.push_back(min_reduction); 292 | if (n_units_current == 1) 293 | { 294 | // ensure that this is displayed as a true 0 295 | results.M.push_back(0); 296 | } 297 | else 298 | { 299 | results.M.push_back(m_current); 300 | } 301 | results.N_units.push_back(n_units_current); 302 | results.old_unit.push_back(unit_delete); 303 | results.new_unit.push_back(unit_keep); 304 | 305 | if (counter == max_iter) 306 | break; 307 | 308 | p.increment(); 309 | } 310 | 311 | return List::create( 312 | _["iter"] = results.iter, 313 | _["M_wgt"] = results.M_wgt, 314 | _["M"] = results.M, 315 | _["N_units"] = results.N_units, 316 | _["old_unit"] = results.old_unit, 317 | _["new_unit"] = results.new_unit); 318 | } 319 | 320 | int find_in_sets(std::string needle, std::vector<std::set<std::string>> haystack) 321 | { 322 | for (int i = 0; i < haystack.size(); i++) 323 | { 324 | const bool is_in = haystack[i].find(needle) != haystack[i].end(); 325 | if (is_in == true) 326 | return i; 327 | } 328 | return -1; 329 | } 330 | 331 | // [[Rcpp::export]] 332 | List get_crosswalk_cpp(std::vector<std::string> old_unit, std::vector<std::string> new_unit) 333 | { 334 | std::vector<std::set<std::string>> bags; 335 | 336 | for (int i = 0; i < old_unit.size(); i++) 337 | { 338 | int old_unit_bag = find_in_sets(old_unit[i], bags); 339 | int new_unit_bag = find_in_sets(new_unit[i], bags); 340 | 341 | if (old_unit_bag == -1 && new_unit_bag == -1) 342 | { 343 | // neither unit in bags - add new bag 344 | bags.push_back({old_unit[i], new_unit[i]}); 345 | } 346 | else if (old_unit_bag != -1 && new_unit_bag == -1) 347 | { 348 | // old_unit already exists - add new_unit to same bag 349 | bags[old_unit_bag].insert(new_unit[i]); 350 | } 351 | else if (old_unit_bag == -1 && new_unit_bag != -1) 352 | { 353 | // new_unit already exists - add old_unit to same bag 354 | bags[new_unit_bag].insert(old_unit[i]); 355 | } 356 | else if (old_unit_bag != -1 && new_unit_bag != -1) 357 | { 358 | // both units exist in different bags, merge the two 359 | bags[old_unit_bag].insert(bags[new_unit_bag].begin(), bags[new_unit_bag].end()); 360 | bags.erase(bags.begin() + new_unit_bag); 361 | } 362 | } 363 | 364 | // convert to List 365 | List l(bags.size()); 366 | for (int i = 0; i < bags.size(); i++) 367 | { 368 | std::vector<std::string> bag(bags[i].size()); 369 | int index = 0; 370 | for (auto el : bags[i]) 371 | { 372 | bag[index] = el; 373 | index++; 374 | } 375 | l[i] = bag; 376 | } 377 | return l; 378 | } --------------------------------------------------------------------------------