├── .github
│   ├── .gitignore
│   └── workflows
│       ├── pkgdown.yaml
│       ├── check-standard.yaml
│       └── test-coverage.yaml
├── src
│   ├── Makevars
│   ├── .gitignore
│   ├── RcppExports.cpp
│   └── compression.cpp
├── vignettes
│   ├── .gitignore
│   ├── faq.Rmd
│   └── plotting.Rmd
├── LICENSE
├── data
│   ├── school_ses.rda
│   ├── schools00.rda
│   └── schools05.rda
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── test_mutual_total_nested.R
│       ├── test_exposure_isolation.R
│       ├── test_entropy.R
│       ├── test_matrix_to_long.R
│       ├── test_dissimilarity.R
│       ├── test_plots.R
│       ├── test_mutual_local.R
│       ├── test_mutual_local_expected.R
│       ├── test_mutual_total_expected.R
│       ├── test_mutual_within.R
│       ├── test_ipf.R
│       ├── test_compression.R
│       └── test_mutual_total.R
├── man
│   ├── figures
│   │   └── README-segplot-1.png
│   ├── scree_plot.Rd
│   ├── school_ses.Rd
│   ├── isolation.Rd
│   ├── segregation.Rd
│   ├── exposure.Rd
│   ├── schools00.Rd
│   ├── schools05.Rd
│   ├── segcurve.Rd
│   ├── merge_units.Rd
│   ├── get_crosswalk.Rd
│   ├── entropy.Rd
│   ├── matrix_to_long.Rd
│   ├── mutual_total_nested.Rd
│   ├── compress.Rd
│   ├── dissimilarity_expected.Rd
│   ├── segplot.Rd
│   ├── mutual_local_expected.Rd
│   ├── dissimilarity.Rd
│   ├── mutual_total_expected.Rd
│   ├── mutual_local.Rd
│   ├── mutual_within.Rd
│   ├── mutual_total.Rd
│   ├── ipf.Rd
│   └── mutual_difference.Rd
├── .gitignore
├── codecov.yml
├── cran-comments.md
├── .Rbuildignore
├── CITATION.cff
├── R
│   ├── RcppExports.R
│   ├── entropy.R
│   ├── data.R
│   ├── exposure.R
│   ├── dissimilarity.R
│   ├── segregation.R
│   ├── ipf.R
│   ├── plots.R
│   └── mutual_expected.R
├── inst
│   └── CITATION
├── NAMESPACE
├── _pkgdown.yml
├── DESCRIPTION
├── NEWS.md
├── data-raw
│   └── clean_data.R
├── README.Rmd
└── README.md

/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | CXX_STD=CXX17
2 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: Benjamin Elbers
--------------------------------------------------------------------------------
/data/school_ses.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/data/school_ses.rda
--------------------------------------------------------------------------------
/data/schools00.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/data/schools00.rda
--------------------------------------------------------------------------------
/data/schools05.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/data/schools05.rda
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
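# (Added comment, not in the original file) Standard testthat entry point;
# note that the individual files under tests/testthat/ additionally skip
# themselves unless the NOT_CRAN environment variable is set to "true".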
1 | library(testthat)
2 | library(segregation)
3 |
4 | test_check("segregation")
5 |
--------------------------------------------------------------------------------
/man/figures/README-segplot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elbersb/segregation/HEAD/man/figures/README-segplot-1.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.Rproj
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | data-raw/sc*
7 | inst/doc
8 | docs/
9 | .DS_Store
10 | .vscode/*
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | comment: false
2 |
3 | coverage:
4 |   status:
5 |     project:
6 |       default:
7 |         target: auto
8 |         threshold: 1%
9 |     patch:
10 |       default:
11 |         target: auto
12 |         threshold: 1%
13 |
--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## Test environments
2 |
3 | * local macOS 13.3.1, R 4.2.0
4 | * tested using GitHub Actions (mac, win, ubuntu)
5 | * tested using win-devel
6 |
7 | ## R CMD check results
8 |
9 | No ERRORs, WARNINGs, or NOTEs.
10 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^CRAN-RELEASE$
2 | ^.*\.Rproj$
3 | ^.*\.rhistory$
4 | ^\.Rproj\.user$
5 | ^data-raw$
6 | ^README\.Rmd$
7 | ^README-.*\.png$
8 | ^cran-comments\.md$
9 | ^codecov\.yml$
10 | ^render_README\.R$
11 | ^docs$
12 | ^\.lintr$
13 | ^\.github$
14 | ^_pkgdown.yml$
15 | ^CITATION.cff$
16 | ^\.vscode$
17 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | preferred-citation:
3 |   type: article
4 |   message: "If you use {segregation} in your research, please cite the following paper."
5 |   authors:
6 |     - family-names: "Elbers"
7 |       given-names: "Benjamin"
8 |       orcid: "https://orcid.org/0000-0001-5392-3448"
9 |   title: "A Method for Studying Differences in Segregation Across Time and Space"
10 |   doi: "10.1177/0049124121986204"
11 |   journal: "Sociological Methods & Research"
12 |   year: 2021
13 |
--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | compress_compute_cpp <- function(neighbors_option, m_neighbors, n_neighbors, m_data, unit_names, max_iter) {
5 |     .Call(`_segregation_compress_compute_cpp`, neighbors_option, m_neighbors, n_neighbors, m_data, unit_names, max_iter)
6 | }
7 |
8 | get_crosswalk_cpp <- function(old_unit, new_unit) {
9 |     .Call(`_segregation_get_crosswalk_cpp`, old_unit, new_unit)
10 | }
11 |
--------------------------------------------------------------------------------
/man/scree_plot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compression.R
3 | \name{scree_plot}
4 | \alias{scree_plot}
5 | \title{Scree plot for segregation compression}
6 | \usage{
7 | scree_plot(compression, tail = Inf)
8 | }
9 | \arguments{
10 | \item{compression}{A "segcompression" object returned by \link{compress}.}
11 |
12 | \item{tail}{Return only the last \code{tail} units (default: \code{Inf})}
13 | }
14 | \value{
15 | Returns a ggplot2 plot.
16 | }
17 | \description{
18 | A plot that visualizes the effect of compression
19 | on mutual information.
20 | }
21 |
--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
1 | bibentry(
2 |     header="To cite 'segregation' in publications use:",
3 |     bibtype="Article",
4 |     title="A Method for Studying Differences in Segregation Across Time and Space",
5 |     author=as.person("Benjamin Elbers"),
6 |     journal="Sociological Methods & Research",
7 |     year="2021",
8 |     volume="52",
9 |     number="1",
10 |     pages="5-42",
11 |     doi="10.1177/0049124121986204",
12 |     textVersion=paste(
13 |         "Benjamin Elbers. 2021.",
14 |         "A Method for Studying Differences in Segregation Across Time and Space.",
15 |         "Sociological Methods & Research 52(1): 5-42. doi: 10.1177/0049124121986204"
16 |     )
17 | )
--------------------------------------------------------------------------------
/man/school_ses.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{school_ses}
5 | \alias{school_ses}
6 | \title{Student-level data including SES status}
7 | \format{
8 | A data frame with 5,153 rows and 3 variables:
9 | \describe{
10 | \item{school_id}{school ID}
11 | \item{ethnic_group}{one of A, B, or C}
12 | \item{ses_quintile}{SES of the student (1 = lowest, 5 = highest)}
13 | }
14 | }
15 | \usage{
16 | school_ses
17 | }
18 | \description{
19 | Fake dataset used for examples. This is an individual-level
20 | dataset of students in schools.
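}
\examples{
# Added example (a hedged sketch, not in the original file): school_ses is
# individual-level data, so mutual_total() can be called without a weight.
mutual_total(school_ses, "ethnic_group", "school_id")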
21 | }
22 | \keyword{datasets}
23 |
--------------------------------------------------------------------------------
/tests/testthat/test_mutual_total_nested.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_mutual_total_nested")
7 |
8 | test_that("works both ways around", {
9 |     decomp <- mutual_total_nested(schools00, "race",
10 |         c("state", "district", "school"),
11 |         weight = "n"
12 |     )
13 |     term1 <- mutual_total(schools00, "race", "state", weight = "n")$est
14 |     term2 <- mutual_total(schools00, "race", "district", within = "state", weight = "n")$est
15 |     term3 <- mutual_total(schools00, "race", "school", within = c("state", "district"), weight = "n")$est
16 |     expect_equal(decomp$est, c(term1, term2, term3))
17 | })
18 |
--------------------------------------------------------------------------------
/man/isolation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/exposure.R
3 | \name{isolation}
4 | \alias{isolation}
5 | \title{Calculates isolation indices}
6 | \usage{
7 | isolation(data, group, unit, weight = NULL)
8 | }
9 | \arguments{
10 | \item{data}{A data frame.}
11 |
12 | \item{group}{A categorical variable
13 | contained in \code{data}. Defines the first dimension
14 | over which segregation is computed.}
15 |
16 | \item{unit}{A vector of variables
17 | contained in \code{data}. Defines the second dimension
18 | over which segregation is computed.}
19 |
20 | \item{weight}{Numeric. (Default \code{NULL})}
21 | }
22 | \value{
23 | Returns a data.table with group column and isolation index.
24 | }
25 | \description{
26 | Returns the isolation index of each group
27 | }
28 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(as.dendrogram,segcompression)
4 | S3method(print,segcompression)
5 | export(compress)
6 | export(dissimilarity)
7 | export(dissimilarity_expected)
8 | export(entropy)
9 | export(exposure)
10 | export(get_crosswalk)
11 | export(ipf)
12 | export(isolation)
13 | export(matrix_to_long)
14 | export(merge_units)
15 | export(mutual_difference)
16 | export(mutual_local)
17 | export(mutual_local_expected)
18 | export(mutual_total)
19 | export(mutual_total_expected)
20 | export(mutual_total_nested)
21 | export(mutual_within)
22 | export(scree_plot)
23 | export(segcurve)
24 | export(segplot)
25 | import(RcppProgress)
26 | import(data.table)
27 | importFrom(Rcpp,sourceCpp)
28 | importFrom(stats,as.dendrogram)
29 | useDynLib(segregation, .registration = TRUE)
30 |
--------------------------------------------------------------------------------
/man/segregation.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/segregation.R
3 | \docType{package}
4 | \name{segregation}
5 | \alias{segregation-package}
6 | \alias{segregation}
7 | \title{segregation: Entropy-based segregation indices}
8 | \description{
9 | Calculate and decompose entropy-based, multigroup segregation indices, with a focus
10 | on the Mutual Information Index (M) and Theil's Information Index (H).
11 | Provides tools to decompose the measures by groups and units, and by within 12 | and between terms. Includes standard error estimation by bootstrapping. 13 | } 14 | \seealso{ 15 | \url{https://elbersb.com/segregation} 16 | } 17 | \author{ 18 | \strong{Maintainer}: Benjamin Elbers \email{be2239@columbia.edu} (\href{https://orcid.org/0000-0001-5392-3448}{ORCID}) 19 | 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/exposure.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exposure.R 3 | \name{exposure} 4 | \alias{exposure} 5 | \title{Calculates pairwise exposure indices} 6 | \usage{ 7 | exposure(data, group, unit, weight = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data frame.} 11 | 12 | \item{group}{A categorical variable 13 | contained in \code{data}. Defines the first dimension 14 | over which segregation is computed.} 15 | 16 | \item{unit}{A vector of variables 17 | contained in \code{data}. Defines the second dimension 18 | over which segregation is computed.} 19 | 20 | \item{weight}{Numeric. (Default \code{NULL})} 21 | } 22 | \value{ 23 | Returns a data.table with columns "of", "to", and 24 | "exposure". Read results as "exposure of group x to group y". 25 | } 26 | \description{ 27 | Returns the pairwise exposure indices between groups 28 | } 29 | -------------------------------------------------------------------------------- /man/schools00.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{schools00} 5 | \alias{schools00} 6 | \title{Ethnic/racial composition of schools for 2000/2001} 7 | \format{ 8 | A data frame with 8,142 rows and 5 variables: 9 | \describe{ 10 | \item{state}{either A, B, or C} 11 | \item{district}{school agency/district ID} 12 | \item{school}{school ID} 13 | \item{race}{either native, asian, hispanic, black, or white} 14 | \item{n}{n of students by school and race} 15 | } 16 | } 17 | \usage{ 18 | schools00 19 | } 20 | \description{ 21 | Fake dataset used for examples. Loosely based on data provided by 22 | the National Center for Education Statistics, Common Core of Data, 23 | with information on U.S. primary schools in three U.S. states. 24 | The original data can be downloaded at \url{https://nces.ed.gov/ccd/}. 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/schools05.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{schools05} 5 | \alias{schools05} 6 | \title{Ethnic/racial composition of schools for 2005/2006} 7 | \format{ 8 | A data frame with 8,013 rows and 5 variables: 9 | \describe{ 10 | \item{state}{either A, B, or C} 11 | \item{district}{school agency/district ID} 12 | \item{school}{school ID} 13 | \item{race}{either native, asian, hispanic, black, or white} 14 | \item{n}{n of students by school and race} 15 | } 16 | } 17 | \usage{ 18 | schools05 19 | } 20 | \description{ 21 | Fake dataset used for examples. Loosely based on data provided by 22 | the National Center for Education Statistics, Common Core of Data, 23 | with information on U.S. primary schools in three U.S. 
states. 24 | The original data can be downloaded at \url{https://nces.ed.gov/ccd/}. 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/segcurve.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plots.R 3 | \name{segcurve} 4 | \alias{segcurve} 5 | \title{A visual representation of two-group segregation} 6 | \usage{ 7 | segcurve(data, group, unit, weight = NULL, segment = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{A data frame.} 11 | 12 | \item{group}{A categorical variable contained in \code{data}. 13 | Defines the first dimension over which segregation is computed.} 14 | 15 | \item{unit}{A categorical variable contained in \code{data}. 16 | Defines the second dimension over which segregation is computed.} 17 | 18 | \item{weight}{Numeric. (Default \code{NULL})} 19 | 20 | \item{segment}{A categorical variable contained in \code{data}. (Default \code{NULL}) 21 | If given, several segregation curves will be shown, one for each segment.} 22 | } 23 | \value{ 24 | Returns a ggplot2 object. 25 | } 26 | \description{ 27 | Produces one or several segregation curves, as defined in Duncan and Duncan (1955) 28 | } 29 | -------------------------------------------------------------------------------- /man/merge_units.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compression.R 3 | \name{merge_units} 4 | \alias{merge_units} 5 | \title{Creates a compressed dataset} 6 | \usage{ 7 | merge_units(compression, n_units = NULL, percent = NULL, parts = FALSE) 8 | } 9 | \arguments{ 10 | \item{compression}{A "segcompression" object returned by \link{compress}.} 11 | 12 | \item{n_units}{Determines the number of merges by specifying the number of 13 | units to remain in the compressed dataset. 14 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})} 15 | 16 | \item{percent}{Determines the number of merges by specifying the percentage 17 | of total segregation information retained in the compressed dataset. 18 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})} 19 | 20 | \item{parts}{(default: FALSE)} 21 | } 22 | \value{ 23 | Returns a data.table. 24 | } 25 | \description{ 26 | After running \link{compress}, this function creates a dataset where 27 | units are merged. 
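}
\examples{
\dontrun{
# Added sketch (not in the original file): compress schools00, inspect the
# scree plot, then keep a compressed dataset with 100 school units.
comp <- compress(schools00, "race", "school", weight = "n")
scree_plot(comp)
merged <- merge_units(comp, n_units = 100)
}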
28 | }
29 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://elbersb.com/segregation
2 |
3 | authors:
4 |   Benjamin Elbers:
5 |     href: https://elbersb.com
6 |
7 | template:
8 |   bootstrap: 5
9 |
10 | reference:
11 |   - title: Segregation indices
12 |     contents:
13 |       - dissimilarity
14 |       - exposure
15 |       - isolation
16 |       - mutual_total
17 |       - mutual_total_nested
18 |       - mutual_within
19 |       - mutual_local
20 |   - title: Visualizing segregation
21 |     contents:
22 |       - segcurve
23 |       - segplot
24 |   - title: Debiasing
25 |     contents:
26 |       - mutual_total_expected
27 |       - mutual_local_expected
28 |       - dissimilarity_expected
29 |   - title: Comparing differences
30 |     contents:
31 |       - mutual_difference
32 |       - ipf
33 |   - title: Compressing segregation
34 |     contents:
35 |       - compress
36 |       - merge_units
37 |       - get_crosswalk
38 |       - scree_plot
39 |   - title: Datasets
40 |     contents:
41 |       - school_ses
42 |       - schools00
43 |       - schools05
44 |   - title: Helper functions
45 |     contents:
46 |       - entropy
47 |       - matrix_to_long
48 |
--------------------------------------------------------------------------------
/man/get_crosswalk.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compression.R
3 | \name{get_crosswalk}
4 | \alias{get_crosswalk}
5 | \title{Create crosswalk after compression}
6 | \usage{
7 | get_crosswalk(compression, n_units = NULL, percent = NULL, parts = FALSE)
8 | }
9 | \arguments{
10 | \item{compression}{A "segcompression" object returned by \link{compress}.}
11 |
12 | \item{n_units}{Determines the number of merges by specifying the number of
13 | units to remain in the compressed dataset.
14 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})}
15 |
16 | \item{percent}{Determines the number of merges by specifying the percentage
17 | of total segregation information retained in the compressed dataset.
18 | Only \code{n_units} or \code{percent} must be given. (default: \code{NULL})}
19 |
20 | \item{parts}{(default: FALSE)}
21 | }
22 | \value{
23 | Returns a data.table.
24 | }
25 | \description{
26 | After running \link{compress}, this function creates a crosswalk table.
27 | Usually it is preferred to call \link{merge_units} directly.
28 | }
29 |
--------------------------------------------------------------------------------
/man/entropy.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/entropy.R
3 | \name{entropy}
4 | \alias{entropy}
5 | \title{Calculates the entropy of a distribution}
6 | \usage{
7 | entropy(data, group, within = NULL, weight = NULL, base = exp(1))
8 | }
9 | \arguments{
10 | \item{data}{A data frame.}
11 |
12 | \item{group}{A categorical variable or a vector of variables
13 | contained in \code{data}.}
14 |
15 | \item{within}{A categorical variable or a vector of variables
16 | contained in \code{data}.}
17 |
18 | \item{weight}{Numeric. (Default \code{NULL})}
19 |
20 | \item{base}{Base of the logarithm that is used in the entropy
21 | calculation. Defaults to the natural logarithm.}
22 | }
23 | \value{
24 | A single number, the entropy.
25 | }
26 | \description{
27 | Returns the entropy of the distribution defined by
28 | \code{group}.
29 | }
30 | \examples{
31 | d <- data.frame(cat = c("A", "B"), n = c(25, 75))
32 | entropy(d, "cat", weight = "n") # => .56
33 | # this is equivalent to -.25*log(.25)-.75*log(.75)
34 |
35 | d <- data.frame(cat = c("A", "B"), n = c(50, 50))
36 | # use base 2 for the logarithm, then entropy is maximized at 1
37 | entropy(d, "cat", weight = "n", base = 2) # => 1
38 | }
39 |
--------------------------------------------------------------------------------
/man/matrix_to_long.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/segregation.R
3 | \name{matrix_to_long}
4 | \alias{matrix_to_long}
5 | \title{Turns a contingency table into long format}
6 | \usage{
7 | matrix_to_long(
8 |   matrix,
9 |   group = "group",
10 |   unit = "unit",
11 |   weight = "n",
12 |   drop_zero = TRUE
13 | )
14 | }
15 | \arguments{
16 | \item{matrix}{A matrix, where the rows represent the units, and the
17 | columns represent the groups.}
18 |
19 | \item{group}{Variable name for group. (Default \code{group})}
20 |
21 | \item{unit}{Variable name for unit. (Default \code{unit})}
22 |
23 | \item{weight}{Variable name for frequency weight. (Default \code{n})}
24 |
25 | \item{drop_zero}{Drop unit-group combinations with zero weight. (Default \code{TRUE})}
26 | }
27 | \value{
28 | A data.table.
29 | }
30 | \description{
31 | Returns a data.table in long form, such that it is suitable
32 | for use in \link{mutual_total}, etc. Colnames and rownames of
33 | the matrix will be respected.
34 | }
35 | \examples{
36 | m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
37 | colnames(m) <- c("Black", "White")
38 | long <- matrix_to_long(m, group = "race", unit = "school")
39 | mutual_total(long, "race", "school", weight = "n")
40 | }
41 |
--------------------------------------------------------------------------------
/tests/testthat/test_exposure_isolation.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_exposure_isolation")
7 |
8 | test_that("two group case", {
9 |     two <- data.table::as.data.table(schools00)
10 |     two <- two[race %in% c("white", "black")]
11 |     exp <- exposure(two, "race", "school", "n")
12 |     # relationship: sum of xPy and yPx = 1
13 |     expect_equal(exp[, .(sum = sum(exposure)), by = .(of)][["sum"]], c(1, 1))
14 |     # relationship: xPy = yPx Y/X
15 |     white_total <- two[race == "white", sum(n)]
16 |     black_total <- two[race == "black", sum(n)]
17 |     expect_equal(
18 |         exp[of == "black" & to == "white", exposure],
19 |         exp[of == "white" & to == "black", exposure] * white_total / black_total
20 |     )
21 | })
22 |
23 | test_that("exposure", {
24 |     exp <- exposure(schools00, "race", "school", "n")
25 |     expect_equal(
26 |         exp[, .(sum = sum(exposure)), by = .(of)][["sum"]],
27 |         rep(1, 5)
28 |     )
29 | })
30 |
31 | test_that("exposure and isolation", {
32 |     exp <- exposure(schools00, "race", "school", "n")[of == to]
33 |     iso <- isolation(schools00, "race", "school", "n")
34 |     comp <- merge(exp, iso, by.x = "of", by.y = "race")
35 |     expect_equal(comp[["isolation"]], comp[["exposure"]])
36 | })
37 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 
| # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }} 24 | permissions: 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - uses: r-lib/actions/setup-pandoc@v2 30 | 31 | - uses: r-lib/actions/setup-r@v2 32 | with: 33 | use-public-rspm: true 34 | 35 | - uses: r-lib/actions/setup-r-dependencies@v2 36 | with: 37 | extra-packages: any::pkgdown, local::., any::tigris 38 | needs: website 39 | 40 | - name: Build site 41 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 42 | shell: Rscript {0} 43 | 44 | - name: Deploy to GitHub pages 🚀 45 | if: github.event_name != 'pull_request' 46 | uses: JamesIves/github-pages-deploy-action@v4.5.0 47 | with: 48 | clean: false 49 | branch: gh-pages 50 | folder: docs 51 | -------------------------------------------------------------------------------- /man/mutual_total_nested.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_total_nested} 4 | \alias{mutual_total_nested} 5 | \title{Calculates a nested decomposition of segregation for M and H} 6 | \usage{ 7 | mutual_total_nested(data, group, unit, weight = NULL, base = exp(1)) 8 | } 9 | \arguments{ 10 | \item{data}{A data frame.} 11 | 12 | \item{group}{A categorical variable or a vector of variables 13 | contained in \code{data}. Defines the first dimension 14 | over which segregation is computed.} 15 | 16 | \item{unit}{A vector of variables 17 | contained in \code{data}. Defines the levels at which 18 | the decomposition should be computed.} 19 | 20 | \item{weight}{Numeric. (Default \code{NULL})} 21 | 22 | \item{base}{Base of the logarithm that is used in the calculation. 23 | Defaults to the natural logarithm.} 24 | } 25 | \value{ 26 | Returns a data.table similar to \code{\link{mutual_total}}, 27 | but with column \code{between} and \code{within} that define 28 | the levels of nesting. 29 | } 30 | \description{ 31 | Returns the between-within decomposition defined by 32 | the sequence of variables in \code{unit}. 
33 | }
34 | \examples{
35 | mutual_total_nested(schools00, "race", c("state", "district", "school"),
36 |     weight = "n"
37 | )
38 | # This is a simpler way to run the following manually:
39 | # mutual_total(schools00, "race", "state", weight = "n")
40 | # mutual_total(schools00, "race", "district", within = "state", weight = "n")
41 | # mutual_total(schools00, "race", "school", within = c("state", "district"), weight = "n")
42 | }
43 |
--------------------------------------------------------------------------------
/man/compress.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/compression.R
3 | \name{compress}
4 | \alias{compress}
5 | \title{Compresses a data matrix based on mutual information (segregation)}
6 | \usage{
7 | compress(
8 |   data,
9 |   group,
10 |   unit,
11 |   weight = NULL,
12 |   neighbors = "local",
13 |   n_neighbors = 50,
14 |   max_iter = Inf
15 | )
16 | }
17 | \arguments{
18 | \item{data}{A data frame.}
19 |
20 | \item{group}{A categorical variable
21 | contained in \code{data}. Defines the first dimension
22 | over which segregation is computed.}
23 |
24 | \item{unit}{A categorical variable
25 | contained in \code{data}. Defines the second dimension
26 | over which segregation is computed.}
27 |
28 | \item{weight}{Numeric. Only frequency weights are allowed.
29 | (Default \code{NULL})}
30 |
31 | \item{neighbors}{Either a data frame or a character. If data frame, then
32 | it needs exactly two columns, where each row identifies
33 | a set of "neighbors" that may be merged.
34 | If "local", considers the \code{n_neighbors} closest neighbors
35 | in terms of local segregation.
36 | If "all", all units are considered as possible neighbors. This
37 | may be very time-consuming.}
38 |
39 | \item{n_neighbors}{Only relevant if \code{neighbors} is \code{"local"}.}
40 |
41 | \item{max_iter}{Maximum number of iterations (Default \code{Inf})}
42 | }
43 | \value{
44 | Returns a "segcompression" object.
45 | }
46 | \description{
47 | Given a data set that identifies suitable neighbors for merging,
48 | this function will merge units iteratively, where in each iteration
49 | the neighbors with the smallest reduction in terms of total M will be merged.
50 | }
51 |
--------------------------------------------------------------------------------
/.github/workflows/check-standard.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macos-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | CENSUS_API_KEY: ${{ secrets.CENSUS_API_KEY }} 31 | R_KEEP_PKG_SOURCE: yes 32 | 33 | steps: 34 | - uses: actions/checkout@v4 35 | 36 | - uses: r-lib/actions/setup-pandoc@v2 37 | 38 | - uses: r-lib/actions/setup-r@v2 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | http-user-agent: ${{ matrix.config.http-user-agent }} 42 | use-public-rspm: true 43 | 44 | - uses: r-lib/actions/setup-r-dependencies@v2 45 | with: 46 | extra-packages: any::rcmdcheck 47 | needs: check 48 | 49 | - uses: r-lib/actions/check-r-package@v2 50 | with: 51 | upload-snapshots: true 52 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 53 | -------------------------------------------------------------------------------- /R/entropy.R: -------------------------------------------------------------------------------- 1 | #' Calculates the entropy of a distribution 2 | #' 3 | #' Returns the entropy of the distribution defined by 4 | #' \code{group}. 5 | #' 6 | #' @param data A data frame. 7 | #' @param group A categorical variable or a vector of variables 8 | #' contained in \code{data}. 9 | #' @param within A categorical variable or a vector of variables 10 | #' contained in \code{data}. 11 | #' @param weight Numeric. (Default \code{NULL}) 12 | #' @param base Base of the logarithm that is used in the entropy 13 | #' calculation. Defaults to the natural logarithm. 14 | #' @return A single number, the entropy. 15 | #' @examples 16 | #' d <- data.frame(cat = c("A", "B"), n = c(25, 75)) 17 | #' entropy(d, "cat", weight = "n") # => .56 18 | #' # this is equivalent to -.25*log(.25)-.75*log(.75) 19 | #' 20 | #' d <- data.frame(cat = c("A", "B"), n = c(50, 50)) 21 | #' # use base 2 for the logarithm, then entropy is maximized at 1 22 | #' entropy(d, "cat", weight = "n", base = 2) # => 1 23 | #' @import data.table 24 | #' @export 25 | entropy <- function(data, group, within = NULL, weight = NULL, base = exp(1)) { 26 | # use provided weight 27 | if (!is.null(weight)) { 28 | data[, "freq"] <- data[[weight]] 29 | } else { 30 | data[, "freq"] <- 1 31 | } 32 | data.table::setDT(data) 33 | n_total <- sum(data[, "freq"]) 34 | 35 | if (is.null(within)) { 36 | vars <- group 37 | entropy_within <- 0 38 | } else { 39 | vars <- c(group, within) 40 | p_within <- data[, list(p = sum(freq)), by = within][["p"]] / n_total 41 | entropy_within <- -sum(p_within * logf(p_within, base)) 42 | } 43 | 44 | p <- data[, list(p = sum(freq)), by = vars][["p"]] / n_total 45 | -sum(p * logf(p, base)) - entropy_within 46 | } 47 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Ethnic/racial composition of schools for 2000/2001 2 | #' 3 | #' Fake dataset used for examples. 
#' Loosely based on data provided by
4 | #' the National Center for Education Statistics, Common Core of Data,
5 | #' with information on U.S. primary schools in three U.S. states.
6 | #' The original data can be downloaded at \url{https://nces.ed.gov/ccd/}.
7 | #'
8 | #' @format A data frame with 8,142 rows and 5 variables:
9 | #' \describe{
10 | #' \item{state}{either A, B, or C}
11 | #' \item{district}{school agency/district ID}
12 | #' \item{school}{school ID}
13 | #' \item{race}{either native, asian, hispanic, black, or white}
14 | #' \item{n}{n of students by school and race}
15 | #' }
16 | "schools00"
17 |
18 | #' Ethnic/racial composition of schools for 2005/2006
19 | #'
20 | #' Fake dataset used for examples. Loosely based on data provided by
21 | #' the National Center for Education Statistics, Common Core of Data,
22 | #' with information on U.S. primary schools in three U.S. states.
23 | #' The original data can be downloaded at \url{https://nces.ed.gov/ccd/}.
24 | #'
25 | #' @format A data frame with 8,013 rows and 5 variables:
26 | #' \describe{
27 | #' \item{state}{either A, B, or C}
28 | #' \item{district}{school agency/district ID}
29 | #' \item{school}{school ID}
30 | #' \item{race}{either native, asian, hispanic, black, or white}
31 | #' \item{n}{n of students by school and race}
32 | #' }
33 | "schools05"
34 |
35 | #' Student-level data including SES status
36 | #'
37 | #' Fake dataset used for examples. This is an individual-level
38 | #' dataset of students in schools.
39 | #'
40 | #' @format A data frame with 5,153 rows and 3 variables:
41 | #' \describe{
42 | #' \item{school_id}{school ID}
43 | #' \item{ethnic_group}{one of A, B, or C}
44 | #' \item{ses_quintile}{SES of the student (1 = lowest, 5 = highest)}
45 | #' }
46 | "school_ses"
47 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: segregation
2 | Type: Package
3 | Title: Entropy-Based Segregation Indices
4 | Version: 1.1.0.9000
5 | Authors@R: person("Benjamin", "Elbers", email = "be2239@columbia.edu",
6 |     role = c("aut", "cre"), comment = c(ORCID = "0000-0001-5392-3448"))
7 | Description: Computes segregation indices, including the Index of Dissimilarity,
8 |     as well as the information-theoretic indices developed by
9 |     Theil (1971), namely
10 |     the Mutual Information Index (M) and Theil's Information Index (H).
11 |     The M, further described by Mora and Ruiz-Castillo (2011)
12 |     and Frankel and Volij (2011),
13 |     is a measure of segregation that is highly decomposable. The package provides
14 |     tools to decompose the index by units and groups (local segregation),
15 |     and by within and between terms. The package also provides a method to decompose
16 |     differences in segregation as described by Elbers (2021) <doi:10.1177/0049124121986204>.
17 |     The package includes standard error estimation by bootstrapping, which also corrects for
18 |     small sample bias. The package also contains functions for visualizing segregation patterns.
19 | License: MIT + file LICENSE
20 | Depends: R (>= 3.5.0)
21 | Imports:
22 |     data.table,
23 |     checkmate,
24 |     Rcpp,
25 |     RcppProgress
26 | Encoding: UTF-8
27 | LazyData: true
28 | Suggests:
29 |     testthat,
30 |     covr,
31 |     knitr,
32 |     rmarkdown,
33 |     dplyr,
34 |     ggplot2,
35 |     scales,
36 |     tidycensus,
37 |     tigris,
38 |     rrapply,
39 |     dendextend,
40 |     patchwork
41 | URL: https://elbersb.github.io/segregation/
42 | BugReports: https://github.com/elbersb/segregation/issues
43 | RoxygenNote: 7.3.2
44 | VignetteBuilder: knitr
45 | SystemRequirements: C++17
46 | LinkingTo:
47 |     Rcpp,
48 |     RcppProgress
49 |
--------------------------------------------------------------------------------
/tests/testthat/test_entropy.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_entropy")
7 |
8 | test_that("custom log function", {
9 |     expect_equal(logf(2, exp(1)), log(2))
10 | })
11 |
12 | test_that("correct entropy calculation", {
13 |     expect_equal(entropy(data.frame(x = c(1)), "x"), 0)
14 |     expect_equal(entropy(data.frame(x = c(1, 2)), "x"), log(2))
15 |     expect_equal(entropy(data.frame(x = c(1, 2, 3)), "x"), log(3))
16 |
17 |     expect_equal(
18 |         entropy(data.frame(x = c(1, 2), n = c(10, 10)), "x", weight = "n"),
19 |         log(2)
20 |     )
21 |     expect_equal(
22 |         entropy(data.frame(x = c(1, 2), n = c(10, 10)), "x", weight = "n", base = 2),
23 |         1
24 |     )
25 |     expect_equal(
26 |         entropy(data.frame(x = c(1, 2), n = c(10, 30)), "x", weight = "n"),
27 |         .25 * log(1 / .25) + .75 * log(1 / .75)
28 |     )
29 | })
30 |
31 | test_that("conditional entropy calculation", {
32 |     # complete independence
33 |     df <- data.frame(
34 |         race = c("w", "w", "b", "b"),
35 |         district = c(1, 2, 1, 2),
36 |         n = c(10, 10, 10, 10)
37 |     )
38 |
39 |     expect_equal(entropy(df, "race", weight = "n"), log(2))
40 |     expect_equal(entropy(df, "district", weight = "n"), log(2))
41 |     expect_equal(entropy(df, c("race", "district"), weight = "n"), log(4))
42 |     expect_equal(entropy(df, "race", within = "district", weight = "n"), log(2))
43 |
44 |     # complete dependence
45 |     df2 <- data.frame(
46 |         race = c("w", "w", "b", "b"),
47 |         district = c("1", "2", "1", "2"),
48 |         n = c(10, 0, 0, 10)
49 |     )
50 |
51 |     expect_equal(entropy(df2, "race", weight = "n"), log(2))
52 |     expect_equal(entropy(df2, "district", weight = "n"), log(2))
53 |     expect_equal(entropy(df2, c("race", "district"), weight = "n"), log(2))
54 |     expect_equal(entropy(df2, "race", within = "district", weight = "n"), 0)
55 | })
56 |
--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: test-coverage.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | test-coverage: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::covr, any::xml2 28 | needs: coverage 29 | 30 | - name: Test coverage 31 | run: | 32 | cov <- covr::package_coverage( 33 | quiet = FALSE, 34 | clean = FALSE, 35 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 36 | ) 37 | print(cov) 38 | covr::to_cobertura(cov) 39 | shell: Rscript {0} 40 | 41 | - uses: codecov/codecov-action@v5 42 | with: 43 | # Fail if error if not on PR, or if on PR and token is given 44 | fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }} 45 | files: ./cobertura.xml 46 | plugins: noop 47 | disable_search: true 48 | token: ${{ secrets.CODECOV_TOKEN }} 49 | 50 | - name: Show testthat output 51 | if: always() 52 | run: | 53 | ## -------------------------------------------------------------------- 54 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 55 | shell: bash 56 | 57 | - name: Upload test results 58 | if: failure() 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: coverage-test-failures 62 | path: ${{ runner.temp }}/package 63 | -------------------------------------------------------------------------------- /man/dissimilarity_expected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_expected.R 3 | \name{dissimilarity_expected} 4 | \alias{dissimilarity_expected} 5 | \title{Calculates expected values when true segregation is zero} 6 | \usage{ 7 | dissimilarity_expected( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | fixed_margins = TRUE, 13 | n_bootstrap = 100 14 | ) 15 | } 16 | \arguments{ 17 | \item{data}{A data frame.} 18 | 19 | \item{group}{A categorical variable or a vector of variables 20 | contained in \code{data}. Defines the first dimension 21 | over which segregation is computed.} 22 | 23 | \item{unit}{A categorical variable or a vector of variables 24 | contained in \code{data}. Defines the second dimension 25 | over which segregation is computed.} 26 | 27 | \item{weight}{Numeric. (Default \code{NULL})} 28 | 29 | \item{fixed_margins}{Should the margins be fixed or simulated? (Default \code{TRUE})} 30 | 31 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 32 | } 33 | \value{ 34 | A data.table with one row, corresponding to the expected value of 35 | the D index when true segregation is zero. 36 | } 37 | \description{ 38 | When sample sizes are small, one group has a small proportion, or 39 | when there are many units, segregation indices are typically upwardly 40 | biased, even when true segregation is zero. This function simulates 41 | tables with zero segregation, given the marginals of the dataset, 42 | and calculates segregation. If the expected values are large, 43 | the interpretation of index scores might have to be adjusted. 
44 | }
45 | \examples{
46 | # build a smaller table, with 100 students distributed across
47 | # 10 schools, where one racial group has 10\% of the students
48 | small <- data.frame(
49 |     school = c(1:10, 1:10),
50 |     race = c(rep("r1", 10), rep("r2", 10)),
51 |     n = c(rep(1, 10), rep(9, 10))
52 | )
53 | dissimilarity_expected(small, "race", "school", weight = "n")
54 | # with an increase in sample size (n=1000), the values improve
55 | small$n <- small$n * 10
56 | dissimilarity_expected(small, "race", "school", weight = "n")
57 | }
58 |
--------------------------------------------------------------------------------
/tests/testthat/test_matrix_to_long.R:
--------------------------------------------------------------------------------
1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
2 |     return()
3 | }
4 |
5 | library("segregation")
6 | context("test_matrix_to_long")
7 |
8 | test_that("accept only matrix", {
9 |     a <- data.frame()
10 |     expect_error(matrix_to_long(a))
11 | })
12 |
13 | test_that("no names", {
14 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
15 |     long <- matrix_to_long(m)
16 |     expect_equal(names(long), c("unit", "group", "n"))
17 |     expect_equal(long$unit, as.character(rep(1:3, 2)))
18 |     expect_equal(long$group, as.character(c(1, 1, 1, 2, 2, 2)))
19 | })
20 |
21 | test_that("colnames only", {
22 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
23 |     colnames(m) <- c("A", "B")
24 |     long <- matrix_to_long(m)
25 |     expect_equal(names(long), c("unit", "group", "n"))
26 |     expect_equal(long$unit, as.character(rep(1:3, 2)))
27 |     expect_equal(long$group, as.character(c("A", "A", "A", "B", "B", "B")))
28 | })
29 |
30 | test_that("rownames only", {
31 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
32 |     rownames(m) <- c("S1", "S2", "S3")
33 |     long <- matrix_to_long(m)
34 |     expect_equal(names(long), c("unit", "group", "n"))
35 |     expect_equal(long$unit, rep(c("S1", "S2", "S3"), 2))
36 |     expect_equal(long$group, as.character(c(1, 1, 1, 2, 2, 2)))
37 | })
38 |
39 | test_that("rownames + colnames", {
40 |     m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3)
41 |     colnames(m) <- c("A", "B")
42 |     rownames(m) <- c("S1", "S2", "S3")
43 |     long <- matrix_to_long(m)
44 |     expect_equal(names(long), c("unit", "group", "n"))
45 |     expect_equal(long$unit, rep(c("S1", "S2", "S3"), 2))
46 |     expect_equal(long$group, as.character(c("A", "A", "A", "B", "B", "B")))
47 | })
48 |
49 | test_that("arguments", {
50 |     # drop zero
51 |     m <- matrix(c(10, 20, 30, 0, 20, 0), nrow = 3)
52 |     long1 <- matrix_to_long(m)
53 |     expect_equal(nrow(long1), 4)
54 |     long2 <- matrix_to_long(m, drop_zero = FALSE)
55 |     expect_equal(nrow(long2), 6)
56 |
57 |     # change names
58 |     long3 <- matrix_to_long(m, "race", "school", weight = "weight")
59 |     expect_equal(names(long3), c("school", "race", "weight"))
60 | })
61 |
--------------------------------------------------------------------------------
/man/segplot.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/plots.R
3 | \name{segplot}
4 | \alias{segplot}
5 | \title{A visual representation of segregation}
6 | \usage{
7 | segplot(
8 |   data,
9 |   group,
10 |   unit,
11 |   weight,
12 |   order = "segregation",
13 |   secondary_plot = NULL,
14 |   reference_distribution = NULL,
15 |   bar_space = 0,
16 |   hline = NULL
17 | )
18 | }
19 | \arguments{
20 | \item{data}{A data frame.}
21 |
22 | \item{group}{A categorical variable or a vector of variables
23 | contained in \code{data}. Defines the first dimension
24 | over which segregation is computed.}
25 |
26 | \item{unit}{A categorical variable or a vector of variables
27 | contained in \code{data}. Defines the second dimension
28 | over which segregation is computed.}
29 |
30 | \item{weight}{Numeric. (Default \code{NULL})}
31 |
32 | \item{order}{A character, either
33 | "segregation", "entropy", "majority", or "majority_fixed".
34 | Affects the ordering of the units.
35 | The horizontal ordering of the groups can be changed
36 | by using a factor variable for \code{group}.
37 | The difference between "majority" and "majority_fixed" is that the former
38 | will reorder the groups in such a way that the majority group actually comes first.
39 | If you want to control the ordering yourself, use "majority_fixed" and specify
40 | the \code{group} variable as a factor variable.}
41 |
42 | \item{secondary_plot}{If \code{NULL} (default), no secondary plot is drawn.
43 | If "segregation", a secondary plot is drawn that shows adjusted local segregation
44 | scores for each unit. If "cumulative", a secondary plot is drawn that shows
45 | the cumulative contribution of each unit toward the total H (calculated as the
46 | proportion of each unit times the adjusted local segregation of each unit).}
47 |
48 | \item{reference_distribution}{Specifies the reference distribution, given as
49 | a two-column data frame, to be plotted on the right.
50 | If order is \code{segregation}, then this reference distribution is
51 | also used to compute the local segregation scores.}
52 |
53 | \item{bar_space}{Specifies space between single units.}
54 |
55 | \item{hline}{Default \code{NULL}. If a color is specified,
56 | horizontal lines will be drawn where groups are separated.}
57 | }
58 | \value{
59 | Returns a ggplot2 or patchwork object.
60 | }
61 | \description{
62 | Produces a segregation plot.
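}
\examples{
\dontrun{
# Added sketch (not in the original file): plot segregation in schools00,
# with units ordered by local segregation.
segplot(schools00, "race", "school", weight = "n", order = "segregation")
}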
63 | } 64 | -------------------------------------------------------------------------------- /tests/testthat/test_dissimilarity.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_dissimilarity") 7 | 8 | test_that("correct calculations", { 9 | m0 <- matrix_to_long(matrix(c(100, 100, 100, 100, 100, 100), ncol = 2)) 10 | expect_equal(dissimilarity(m0, "group", "unit", weight = "n")$est[[1]], 0) 11 | 12 | m1 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 13 | m2 <- matrix_to_long(matrix(c(80, 80, 20, 20, 20, 20, 80, 80), ncol = 2)) 14 | expect_equal(dissimilarity(m1, "group", "unit", weight = "n")$est[[1]], .6) 15 | expect_equal(dissimilarity(m2, "group", "unit", weight = "n")$est[[1]], .6) 16 | 17 | m3 <- matrix_to_long(matrix(c(100, 100, 0, 0, 0, 0, 100, 100), ncol = 2)) 18 | expect_equal(dissimilarity(m3, "group", "unit", weight = "n")$est[[1]], 1) 19 | }) 20 | 21 | test_that("alternative calculation", { 22 | tab <- t(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 23 | div <- sweep(tab, 1, rowSums(tab), "/") 24 | d <- 1 / 2 * sum(apply(div, 2, segregation:::abs_diff)) 25 | m1 <- matrix_to_long(t(tab)) 26 | expect_equal(dissimilarity(m1, "group", "unit", weight = "n")$est[[1]], d) 27 | }) 28 | 29 | test_that("SE works", { 30 | m0 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 31 | d <- dissimilarity(m0, "group", "unit", weight = "n", se = TRUE) 32 | expect_equal(dim(d), c(1, 5)) 33 | expect_equal(d$se > 0, TRUE) 34 | expect_equal(dim(attr(d, "bootstrap")), c(100, 2)) 35 | }) 36 | 37 | test_that("names of columns", { 38 | m0 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2), 39 | group = "race", unit = "tract" 40 | ) 41 | 42 | d <- dissimilarity(m0, "race", "tract", weight = "n") 43 | expect_equal(dim(d), c(1, 2)) 44 | 45 | data.table::setDT(m0) 46 | d <- dissimilarity(m0, "race", "tract", weight = "n") 47 | expect_equal(dim(d), c(1, 2)) 48 | }) 49 | 50 | 51 | test_that("bootstrapping fails when sample size is non-integer", { 52 | m0 <- matrix_to_long(matrix(c(100.3, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 53 | expect_error(dissimilarity(m0, "group", "unit", weight = "n", se = TRUE)) 54 | }) 55 | 56 | test_that("gives error when group > 2", { 57 | m0 <- matrix_to_long(matrix(c(100, 60, 40, 10, 20, 40, 60, 100, 50), ncol = 3)) 58 | expect_error(dissimilarity(m0, "group", "unit", weight = "n")) 59 | }) 60 | -------------------------------------------------------------------------------- /man/mutual_local_expected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_expected.R 3 | \name{mutual_local_expected} 4 | \alias{mutual_local_expected} 5 | \title{Calculates expected local segregation scores when true segregation is zero} 6 | \usage{ 7 | mutual_local_expected( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | fixed_margins = TRUE, 13 | n_bootstrap = 100, 14 | base = exp(1) 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{A data frame.} 19 | 20 | \item{group}{A categorical variable or a vector of variables 21 | contained in \code{data}. Defines the first dimension 22 | over which segregation is computed.} 23 | 24 | \item{unit}{A categorical variable or a vector of variables 25 | contained in \code{data}. 
Defines the units for which local
26 | segregation indices are calculated.}
27 |
28 | \item{weight}{Numeric. (Default \code{NULL})}
29 |
30 | \item{fixed_margins}{Should the margins be fixed or simulated? (Default \code{TRUE})}
31 |
32 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})}
33 |
34 | \item{base}{Base of the logarithm that is used in the calculation.
35 | Defaults to the natural logarithm.}
36 | }
37 | \value{
38 | A data.table with two rows, corresponding to the expected values of
39 | segregation when true segregation is zero.
40 | }
41 | \description{
42 | When sample sizes are small, one group has a small proportion, or
43 | when there are many units, segregation indices are typically upwardly
44 | biased, even when true segregation is zero. This function simulates
45 | tables with zero segregation, given the marginals of the dataset,
46 | and calculates local segregation scores. If the expected values are large,
47 | the interpretation of index scores might have to be adjusted.
48 | }
49 | \examples{
50 | \dontrun{
51 | # the schools00 dataset has a large sample size, so expected segregation is close to zero
52 | mutual_local_expected(schools00, "race", "school", weight = "n")
53 |
54 | # but we can build a smaller table, with 100 students distributed across
55 | # 10 schools, where one racial group has 10\% of the students
56 | small <- data.frame(
57 |     school = c(1:10, 1:10),
58 |     race = c(rep("r1", 10), rep("r2", 10)),
59 |     n = c(rep(1, 10), rep(9, 10))
60 | )
61 | mutual_local_expected(small, "race", "school", weight = "n")
62 | # with an increase in sample size (n=1000), the values improve
63 | small$n <- small$n * 10
64 | mutual_local_expected(small, "race", "school", weight = "n")
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
3 |
4 | #include <Rcpp.h>
5 |
6 | using namespace Rcpp;
7 |
8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
9 | Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11 | #endif
12 |
13 | // compress_compute_cpp
14 | List compress_compute_cpp(std::string neighbors_option, StringMatrix m_neighbors, int n_neighbors, NumericMatrix m_data, std::vector<std::string> unit_names, int max_iter);
15 | RcppExport SEXP _segregation_compress_compute_cpp(SEXP neighbors_optionSEXP, SEXP m_neighborsSEXP, SEXP n_neighborsSEXP, SEXP m_dataSEXP, SEXP unit_namesSEXP, SEXP max_iterSEXP) {
16 | BEGIN_RCPP
17 |     Rcpp::RObject rcpp_result_gen;
18 |     Rcpp::RNGScope rcpp_rngScope_gen;
19 |     Rcpp::traits::input_parameter< std::string >::type neighbors_option(neighbors_optionSEXP);
20 |     Rcpp::traits::input_parameter< StringMatrix >::type m_neighbors(m_neighborsSEXP);
21 |     Rcpp::traits::input_parameter< int >::type n_neighbors(n_neighborsSEXP);
22 |     Rcpp::traits::input_parameter< NumericMatrix >::type m_data(m_dataSEXP);
23 |     Rcpp::traits::input_parameter< std::vector<std::string> >::type unit_names(unit_namesSEXP);
24 |     Rcpp::traits::input_parameter< int >::type max_iter(max_iterSEXP);
25 |     rcpp_result_gen = Rcpp::wrap(compress_compute_cpp(neighbors_option, m_neighbors, n_neighbors, m_data, unit_names, max_iter));
26 |     return rcpp_result_gen;
27 | END_RCPP
28 | }
29 | // get_crosswalk_cpp
30 | List get_crosswalk_cpp(std::vector<std::string> old_unit, std::vector<std::string> new_unit);
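// (Added comment, not in the generated file) Pattern of the generated glue:
// each exported function gets a SEXP-in/SEXP-out wrapper that converts inputs
// via Rcpp::traits::input_parameter<T> and wraps the C++ return value back
// into a SEXP with Rcpp::wrap(); BEGIN_RCPP/END_RCPP translate C++ exceptions
// into R errors.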
31 | RcppExport SEXP _segregation_get_crosswalk_cpp(SEXP old_unitSEXP, SEXP new_unitSEXP) {
32 | BEGIN_RCPP
33 |     Rcpp::RObject rcpp_result_gen;
34 |     Rcpp::RNGScope rcpp_rngScope_gen;
35 |     Rcpp::traits::input_parameter< std::vector<std::string> >::type old_unit(old_unitSEXP);
36 |     Rcpp::traits::input_parameter< std::vector<std::string> >::type new_unit(new_unitSEXP);
37 |     rcpp_result_gen = Rcpp::wrap(get_crosswalk_cpp(old_unit, new_unit));
38 |     return rcpp_result_gen;
39 | END_RCPP
40 | }
41 |
42 | static const R_CallMethodDef CallEntries[] = {
43 |     {"_segregation_compress_compute_cpp", (DL_FUNC) &_segregation_compress_compute_cpp, 6},
44 |     {"_segregation_get_crosswalk_cpp", (DL_FUNC) &_segregation_get_crosswalk_cpp, 2},
45 |     {NULL, NULL, 0}
46 | };
47 |
48 | RcppExport void R_init_segregation(DllInfo *dll) {
49 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
50 |     R_useDynamicSymbols(dll, FALSE);
51 | }
52 |
--------------------------------------------------------------------------------
/man/dissimilarity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/dissimilarity.R
3 | \name{dissimilarity}
4 | \alias{dissimilarity}
5 | \title{Calculates Index of Dissimilarity}
6 | \usage{
7 | dissimilarity(
8 |   data,
9 |   group,
10 |   unit,
11 |   weight = NULL,
12 |   se = FALSE,
13 |   CI = 0.95,
14 |   n_bootstrap = 100
15 | )
16 | }
17 | \arguments{
18 | \item{data}{A data frame.}
19 |
20 | \item{group}{A categorical variable or a vector of variables
21 | contained in \code{data}. Defines the first dimension
22 | over which segregation is computed. The D index only
23 | allows two distinct groups.}
24 |
25 | \item{unit}{A categorical variable or a vector of variables
26 | contained in \code{data}. Defines the second dimension
27 | over which segregation is computed.}
28 |
29 | \item{weight}{Numeric. (Default \code{NULL})}
30 |
31 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide
32 | standard errors and to apply bias correction. The bias that is reported
33 | has already been applied to the estimates (i.e. the reported estimates are "debiased")
34 | (Default \code{FALSE})}
35 |
36 | \item{CI}{If \code{se = TRUE}, compute the confidence (CI*100)\% confidence interval
37 | in addition to the bootstrap standard error.
38 | This is based on percentiles of the bootstrap distribution, and a valid interpretation
39 | relies on a larger number of bootstrap iterations. (Default \code{0.95})}
40 |
41 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})}
42 | }
43 | \value{
44 | Returns a data.table with one row. The column \code{est} contains
45 | the Index of Dissimilarity.
46 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
47 | the associated bootstrapped standard errors, an additional column \code{CI} contains
48 | the estimate confidence interval as a list column, an additional column \code{bias} contains
49 | the estimated bias, and the column \code{est} contains the bias-corrected estimates.
50 | }
51 | \description{
52 | Returns the total segregation between \code{group} and \code{unit} using
53 | the Index of Dissimilarity.
54 | } 55 | \examples{ 56 | # Example where D and H deviate 57 | m1 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 58 | m2 <- matrix_to_long(matrix(c(80, 80, 20, 20, 20, 20, 80, 80), ncol = 2)) 59 | dissimilarity(m1, "group", "unit", weight = "n") 60 | dissimilarity(m2, "group", "unit", weight = "n") 61 | } 62 | \references{ 63 | Otis Dudley Duncan and Beverly Duncan. 1955. "A Methodological Analysis of Segregation Indexes," 64 | American Sociological Review 20(2): 210-217. 65 | } 66 | -------------------------------------------------------------------------------- /man/mutual_total_expected.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_expected.R 3 | \name{mutual_total_expected} 4 | \alias{mutual_total_expected} 5 | \title{Calculates expected values when true segregation is zero} 6 | \usage{ 7 | mutual_total_expected( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | within = NULL, 13 | fixed_margins = TRUE, 14 | n_bootstrap = 100, 15 | base = exp(1) 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{A data frame.} 20 | 21 | \item{group}{A categorical variable or a vector of variables 22 | contained in \code{data}. Defines the first dimension 23 | over which segregation is computed.} 24 | 25 | \item{unit}{A categorical variable or a vector of variables 26 | contained in \code{data}. Defines the second dimension 27 | over which segregation is computed.} 28 | 29 | \item{weight}{Numeric. (Default \code{NULL})} 30 | 31 | \item{within}{Apply algorithm within each group defined by this variable, 32 | and report the weighted average. (Default \code{NULL})} 33 | 34 | \item{fixed_margins}{Should the margins be fixed or simulated? (Default \code{TRUE})} 35 | 36 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 37 | 38 | \item{base}{Base of the logarithm that is used in the calculation. 39 | Defaults to the natural logarithm.} 40 | } 41 | \value{ 42 | A data.table with two rows, corresponding to the expected values of 43 | segregation when true segregation is zero. 44 | } 45 | \description{ 46 | When sample sizes are small, one group has a small proportion, or 47 | when there are many units, segregation indices are typically upwardly 48 | biased, even when true segregation is zero. This function simulates 49 | tables with zero segregation, given the marginals of the dataset, 50 | and calculates segregation. If the expected values are large, 51 | the interpretation of index scores might have to be adjusted. 
52 | } 53 | \examples{ 54 | \dontrun{ 55 | # the schools00 dataset has a large sample size, so expected segregation is close to zero 56 | mutual_total_expected(schools00, "race", "school", weight = "n") 57 | 58 | # but we can build a smaller table, with 100 students distributed across 59 | # 10 schools, where one racial group has 10\% of the students 60 | small <- data.frame( 61 | school = c(1:10, 1:10), 62 | race = c(rep("r1", 10), rep("r2", 10)), 63 | n = c(rep(1, 10), rep(9, 10)) 64 | ) 65 | mutual_total_expected(small, "race", "school", weight = "n") 66 | # with an increase in sample size (n=1000), the values improve 67 | small$n <- small$n * 10 68 | mutual_total_expected(small, "race", "school", weight = "n") 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /R/exposure.R: -------------------------------------------------------------------------------- 1 | #' Calculates pairwise exposure indices 2 | #' 3 | #' Returns the pairwise exposure indices between groups 4 | #' 5 | #' @param data A data frame. 6 | #' @param group A categorical variable 7 | #' contained in \code{data}. Defines the first dimension 8 | #' over which segregation is computed. 9 | #' @param unit A vector of variables 10 | #' contained in \code{data}. Defines the second dimension 11 | #' over which segregation is computed. 12 | #' @param weight Numeric. (Default \code{NULL}) 13 | #' @return Returns a data.table with columns "of", "to", and 14 | #' "exposure". Read results as "exposure of group x to group y". 15 | #' @import data.table 16 | #' @export 17 | exposure <- function(data, group, unit, weight = NULL) { 18 | checkmate::assert_data_frame(data) 19 | checkmate::assert_character(group, len = 1) 20 | checkmate::assert_character(unit, min.len = 1) 21 | checkmate::assert_character(weight, null.ok = TRUE) 22 | checkmate::assert_names(names(data), must.include = c(group, unit, weight)) 23 | 24 | of <- prepare_data(data, group, unit, weight) 25 | to <- copy(of) 26 | of[, n_unit := sum(freq), by = unit] 27 | of[, n_group := sum(freq), by = group] 28 | setnames(of, "freq", "freq_of") 29 | setnames(of, group, "of") 30 | setnames(to, "freq", "freq_to") 31 | setnames(to, group, "to") 32 | d <- merge(of, to, by = unit, allow.cartesian = TRUE) 33 | exp <- d[, .(exposure = sum(freq_of * freq_to / (n_unit * n_group))), by = .(of, to)] 34 | exp[] 35 | } 36 | 37 | #' Calculates isolation indices 38 | #' 39 | #' Returns isolation index of each group 40 | #' 41 | #' @param data A data frame. 42 | #' @param group A categorical variable 43 | #' contained in \code{data}. Defines the first dimension 44 | #' over which segregation is computed. 45 | #' @param unit A vector of variables 46 | #' contained in \code{data}. Defines the second dimension 47 | #' over which segregation is computed. 48 | #' @param weight Numeric. (Default \code{NULL}) 49 | #' @return Returns a data.table with group column and isolation index. 
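#' @examples
#' # illustrative example: isolation of each racial group across schools,
#' # using the bundled schools00 dataset
#' isolation(schools00, "race", "school", weight = "n")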
50 | #' @import data.table 51 | #' @export 52 | isolation <- function(data, group, unit, weight = NULL) { 53 | checkmate::assert_data_frame(data) 54 | checkmate::assert_character(group, len = 1) 55 | checkmate::assert_character(unit, min.len = 1) 56 | checkmate::assert_character(weight, null.ok = TRUE) 57 | checkmate::assert_names(names(data), must.include = c(group, unit, weight)) 58 | 59 | d <- prepare_data(data, group, unit, weight) 60 | 61 | d[, n_unit := sum(freq), by = unit] 62 | d[, n_group := sum(freq), by = group] 63 | iso <- d[, .(isolation = sum(freq^2 / (n_unit * n_group))), by = group] 64 | iso[] 65 | } 66 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # segregation (development version) 2 | 3 | - breaking: rename `mutual_expected` to `mutual_total_expected` 4 | - add `mutual_local_expected` 5 | - ipf: decrease the default `precision` parameter, i.e. a stricter convergence criterion (longer runtime, but more precise results) 6 | - entropy: add 'within' argument 7 | 8 | # segregation 1.1.0 9 | 10 | - various improvements to compression algorithm 11 | - add dendrogram visualization 12 | - allow multiple curves in `segcurve` function 13 | - segplot: add optional 'secondary_plot' argument 14 | - segplot: remove 'title' argument 15 | - segplot: add optional 'hline' argument 16 | 17 | # segregation 1.0.0 18 | 19 | - add mutual_total_nested 20 | - add within argument to mutual_expected 21 | - add dissimilarity_expected 22 | - add suite of compression-related functions (some in C++) 23 | - add segplot function 24 | - add functions exposure and isolation 25 | - fix that roxygen2 problem 26 | 27 | # segregation 0.6.0 28 | 29 | - faster mutual_total(..., within) 30 | - updated docs 31 | - some minor bug fixes 32 | - some improved error messages 33 | 34 | # segregation 0.5.0 35 | 36 | - dissimilarity: support for the index of dissimilarity 37 | - add CI argument for confidence intervals 38 | - mutual_within: report ent_ratio instead of h_weight 39 | - matrix_to_long: convert contingency tables into long form 40 | - add introductory vignette 41 | 42 | # segregation 0.4.0 43 | 44 | - faster bootstrap 45 | - return bootstrap estimates as attr 46 | - add mutual_expected 47 | - apply bias-correction via bootstrap by default when se=TRUE 48 | 49 | # segregation 0.3.0 50 | 51 | - always return data.table 52 | - for ipf function, warn when groups/units are dropped 53 | - return sample size of source dataset for IPF 54 | - don't allow bootstrap when sample size is not an integer, but allow non-integer sample weights (which are unproblematic) 55 | - simplify precision parameter for ipf procedure 56 | - increase default bootstrap to 100 57 | - fix data.table issue (#3) 58 | 59 | # segregation 0.2.0 60 | 61 | - add "shapley" decomposition method, revisit other difference decomposition methods 62 | - better logging of bootstrap/IPF 63 | - several small fixes 64 | - add lintr package 65 | - add warning when attempting bootstrap with non-integer weights 66 | 67 | # segregation 0.1.0 68 | 69 | - switch group and unit definitions, to be consistent with the literature 70 | - add Theil's Information Index (H) 71 | - add entropy function 72 | - add mutual_within function to decompose weighted within indices 73 | - add "wide" option to mutual_local and mutual_within 74 | - add "ipf" (iterative proportional fitting) function and a difference decomposition based on IPF 75 | - "mrc_adjusted"
difference decomposition is defined only on overlap sample of units and groups 76 | - internal refactoring 77 | 78 | # segregation 0.0.1 79 | 80 | Initial release. 81 | -------------------------------------------------------------------------------- /data-raw/clean_data.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(haven) 3 | 4 | set.seed(179463) 5 | 6 | prepare <- function(d) { 7 | d %>% 8 | # only primary schools 9 | filter(level == 1) %>% 10 | na.omit() %>% 11 | filter(state %in% c("AL", "CO", "CT")) %>% 12 | mutate(state = recode(state, AL = "A", "CO" = "B", "CT" = "C")) %>% 13 | mutate(check = native + asian + hisp + black + white) %>% 14 | # only schools with >100 students 15 | filter(total == check, total > 100) %>% 16 | select(-total, -check, -level) %>% 17 | gather(key = race, value = n, -state, -district, -school) %>% 18 | filter(n > 0) %>% 19 | # sort 20 | arrange(state, district, school, race) 21 | } 22 | 23 | # NCES Common Core data for 2000-01 24 | rawd <- read_sas("sc001aai.zip") 25 | schools00 <- select(rawd, 26 | state = LSTATE00, district = LEAID, 27 | school = NCESSCH, level = LEVEL00, total = MEMBER00, 28 | native = AM00, asian = ASIAN00, hisp = HISP00, 29 | black = BLACK00, white = WHITE00 30 | ) %>% prepare 31 | schools00$n <- round(rnorm(nrow(schools00), schools00$n, sd=sqrt(schools00$n))) 32 | schools00 <- filter(schools00, n > 0) 33 | 34 | 35 | # NCES Common Core data for 2005-06 36 | rawd <- read_sas("sc051aai_sas.zip") 37 | schools05 <- select(rawd, 38 | state = LSTATE05, district = LEAID, 39 | school = NCESSCH, level = LEVEL05, total = MEMBER05, 40 | native = AM05, asian = ASIAN05, hisp = HISP05, 41 | black = BLACK05, white = WHITE05 42 | ) %>% prepare 43 | schools05$n <- round(rnorm(nrow(schools05), schools05$n, sd=sqrt(schools05$n))) 44 | schools05 <- filter(schools05, n > 0) 45 | 46 | # create new school and district identifiers 47 | district <- bind_rows( 48 | select(schools00, state, district), 49 | select(schools05, state, district)) %>% 50 | distinct() %>% 51 | group_by(state) %>% 52 | mutate(district_new=paste0(state, 1:n())) %>% 53 | select(district, district_new) 54 | schools00 <- left_join(schools00, district) 55 | schools05 <- left_join(schools05, district) 56 | 57 | school <- bind_rows(select(schools00, district_new, school), 58 | select(schools05, district_new, school)) %>% 59 | distinct() %>% 60 | group_by(district_new) %>% 61 | mutate(school_new=paste0(district_new, '_', 1:n())) %>% 62 | ungroup() %>% 63 | select(school, school_new) 64 | schools00 <- left_join(schools00, school) 65 | schools05 <- left_join(schools05, school) 66 | 67 | schools00 <- schools00 %>% 68 | select(state, district = district_new, school = school_new, 69 | race, n) %>% 70 | mutate_at(vars(state, district, school, race), as_factor) 71 | 72 | schools05 <- schools05 %>% 73 | select(state, district = district_new, school = school_new, 74 | race, n) %>% 75 | mutate_at(vars(state, district, school, race), as_factor) 76 | 77 | print(schools00) 78 | save(schools00, file = "../data/schools00.rda") 79 | print(schools05) 80 | save(schools05, file = "../data/schools05.rda") 81 | -------------------------------------------------------------------------------- /man/mutual_local.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_local} 4 | \alias{mutual_local} 5 | 
\title{Calculates local segregation scores based on M} 6 | \usage{ 7 | mutual_local( 8 | data, 9 | group, 10 | unit, 11 | weight = NULL, 12 | se = FALSE, 13 | CI = 0.95, 14 | n_bootstrap = 100, 15 | base = exp(1), 16 | wide = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{A data frame.} 21 | 22 | \item{group}{A categorical variable or a vector of variables 23 | contained in \code{data}. Defines the dimension 24 | over which segregation is computed.} 25 | 26 | \item{unit}{A categorical variable or a vector of variables 27 | contained in \code{data}. Defines the categories for which local 28 | segregation indices are calculated.} 29 | 30 | \item{weight}{Numeric. (Default \code{NULL})} 31 | 32 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 33 | standard errors and to apply bias correction. The bias that is reported 34 | has already been applied to the estimates (i.e. the reported estimates are "debiased"). 35 | (Default \code{FALSE})} 36 | 37 | \item{CI}{If \code{se = TRUE}, compute the (CI*100)% confidence interval 38 | in addition to the bootstrap standard error. 39 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 40 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 41 | 42 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 43 | 44 | \item{base}{Base of the logarithm that is used in the calculation. 45 | Defaults to the natural logarithm.} 46 | 47 | \item{wide}{Returns a wide dataframe instead of a long dataframe. 48 | (Default \code{FALSE})} 49 | } 50 | \value{ 51 | Returns a data.table with two rows for each category defined by \code{unit}, 52 | for a total of \code{2*(number of units)} rows. 53 | The column \code{est} contains two statistics that 54 | are provided for each unit: \code{ls}, the local segregation score, and 55 | \code{p}, the unit's proportion of the total number of cases. 56 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 57 | the associated bootstrapped standard errors, an additional column \code{CI} contains 58 | the confidence interval of the estimate as a list column, an additional column \code{bias} contains 59 | the estimated bias, and the column \code{est} contains the bias-corrected estimates. 60 | If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one 61 | row for each \code{unit}, and the associated statistics in separate columns. 62 | } 63 | \description{ 64 | Returns local segregation indices for each category defined 65 | by \code{unit}. 66 | } 67 | \examples{ 68 | # which schools are most segregated? 69 | (localseg <- mutual_local(schools00, "race", "school", 70 | weight = "n", wide = TRUE 71 | )) 72 | 73 | sum(localseg$p) # => 1 74 | 75 | # the sum of the weighted local segregation scores equals 76 | # total segregation 77 | sum(localseg$ls * localseg$p) # => .425 78 | mutual_total(schools00, "school", "race", weight = "n") # M => .425 79 | } 80 | \references{ 81 | Henri Theil. 1971. Principles of Econometrics. New York: Wiley. 82 | 83 | Ricardo Mora and Javier Ruiz-Castillo. 2011. 84 | "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
85 | } 86 | -------------------------------------------------------------------------------- /tests/testthat/test_plots.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("plots") 7 | 8 | skip_if(!requireNamespace("ggplot2", quietly = TRUE)) 9 | skip_if(!requireNamespace("patchwork", quietly = TRUE)) 10 | 11 | plot_majority <- segplot(schools00, "race", "school", weight = "n", order = "majority", hline = "white") 12 | plot_majority_fixed <- segplot(schools00, "race", "school", weight = "n", order = "majority_fixed") 13 | plot_seg <- segplot(schools00, "race", "school", weight = "n", order = "segregation") 14 | plot_entropy <- segplot(schools00, "race", "school", 15 | weight = "n", order = "entropy", 16 | bar_space = 0.1 17 | ) 18 | 19 | test_that("dimensions", { 20 | expect_equal(nrow(plot_majority$data), nrow(plot_seg$data)) 21 | expect_equal(nrow(plot_majority$data), nrow(plot_entropy$data)) 22 | expect_equal(nrow(plot_majority$data), nrow(plot_majority_fixed$data)) 23 | }) 24 | 25 | test_that("reference", { 26 | reference <- data.table::as.data.table(schools00) 27 | reference <- reference[, .(N = sum(n)), by = .(race)] 28 | reference[, p := N / sum(N)] 29 | expect_error(segplot(schools00, "race", "school", 30 | weight = "n", order = "segregation", 31 | reference_distribution = reference 32 | )) 33 | expect_error(segplot(schools00, "race", "school", 34 | weight = "n", order = "segregation", 35 | reference_distribution = reference[1:2, ] 36 | )) 37 | expect_error(segplot(schools00, "race", "school", 38 | weight = "n", order = "segregation", 39 | reference_distribution = 1 40 | )) 41 | 42 | reference[, N := NULL] 43 | sp <- segplot(schools00, "race", "school", 44 | weight = "n", order = "segregation" 45 | ) 46 | sp_ref <- segplot(schools00, "race", "school", 47 | weight = "n", order = "segregation", 48 | reference_distribution = reference 49 | ) 50 | # identical order 51 | expect_equal(sp$data[["unit"]], sp_ref$data[["unit"]]) 52 | }) 53 | 54 | test_that("secondary plot", { 55 | plot_2a <- segplot(schools00, "race", "school", 56 | weight = "n", 57 | order = "segregation", secondary_plot = "segregation" 58 | ) 59 | plot_2b <- segplot(schools00, "race", "school", 60 | weight = "n", 61 | order = "segregation", secondary_plot = "cumulative" 62 | ) 63 | 64 | expect_equal(plot_seg$data, plot_2a[[1]]$data) 65 | expect_equal(plot_seg$data, plot_2b[[1]]$data) 66 | expect_true("patchwork" %in% class(plot_2a)) 67 | expect_true("patchwork" %in% class(plot_2b)) 68 | }) 69 | 70 | test_that("segcurve", { 71 | expect_error(segcurve(schools00, "race", "school", weight = "n")) 72 | expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("a", "b"))) 73 | expect_error(segcurve(schools00, "race", "school", weight = "n", segment = c("state", "school"))) 74 | 75 | p1 <- segcurve(subset(schools00, race %in% c("white", "black")), 76 | "race", "school", 77 | weight = "n" 78 | ) 79 | p2 <- segcurve(subset(schools00, race %in% c("white", "asian")), 80 | "race", "school", 81 | weight = "n" 82 | ) 83 | p3 <- segcurve(subset(schools00, race %in% c("white", "asian")), 84 | "race", "school", 85 | weight = "n", 86 | segment = "state" 87 | ) 88 | expect_equal(p3$labels$colour, "state") 89 | }) 90 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_local.R: 
-------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_local") 7 | 8 | test_data <- data.frame( 9 | u = c(rep("a", 4), rep("b", 4)), 10 | g = rep(c(1, 2, 3, 4), 2), 11 | n = c(40, 20, 5, 1, 20, 40, 60, 80) 12 | ) 13 | 14 | local <- mutual_local(test_data, "u", "g", weight = "n") 15 | local2 <- mutual_local(test_data, "g", "u", weight = "n") 16 | localse <- mutual_local(test_data, "u", "g", weight = "n", se = TRUE, n_bootstrap = 10) 17 | localbase2 <- mutual_local(test_data, "g", "u", weight = "n", base = 2) 18 | 19 | test_that("local calculation works", { 20 | expect_equal(sum(local[stat == "p", est]), 1) 21 | expect_equal(sum(local2[local2$stat == "p", est]), 1) 22 | expect_equal(sum(localse[localse$stat == "p", est]), 1) 23 | 24 | expect_equal( 25 | sum(local[stat == "p", est] * local[stat == "ls", est]), 26 | sum(local2[local2$stat == "p", est] * local2[local2$stat == "ls", est]) 27 | ) 28 | expect_equal( 29 | sum(local[stat == "p", est] * local[stat == "ls", est]), 30 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est] 31 | ) 32 | expect_equal( 33 | sum(localbase2[stat == "p", est] * localbase2[stat == "ls", est]), 34 | mutual_total(test_data, "u", "g", weight = "n", base = 2)[stat == "M", est] 35 | ) 36 | }) 37 | 38 | test_that("return works", { 39 | expect_equal(nrow(local), 8) 40 | expect_equal(ncol(local), 3) 41 | }) 42 | 43 | test_that("bootstrapping works", { 44 | expect_equal(nrow(localse), 8) 45 | expect_equal(ncol(localse), 6) 46 | }) 47 | 48 | test_that("bootstrap attributes exists", { 49 | expect_equal(dim(attr(localse, "bootstrap")), c(10 * length(unique(test_data$g)) * 2, 3)) 50 | }) 51 | 52 | test_that("bootstrapping fails when sample size is non-integer", { 53 | test_data <- data.frame( 54 | u = c(rep("a", 4), rep("b", 4)), 55 | g = rep(c(1, 2, 3, 4), 2), 56 | n = c(40, 20, 5, 1, 20, 40, 60, 80.3) 57 | ) 58 | 59 | expect_error(mutual_local(test_data, "u", "g", weight = "n", se = TRUE)) 60 | # rescale 61 | test_data$n2 <- test_data$n / sum(test_data$n) * round(sum(test_data$n)) 62 | ret <- mutual_local(test_data, "u", "g", weight = "n2", se = TRUE) 63 | expect_equal(all(ret$se > 0), TRUE) 64 | }) 65 | 66 | test_that("option wide works", { 67 | nowide <- mutual_local(test_data, "u", "g", weight = "n") 68 | nowide_se <- mutual_local(test_data, "u", "g", 69 | weight = "n", 70 | se = TRUE, n_bootstrap = 10 71 | ) 72 | wide <- mutual_local(test_data, "u", "g", weight = "n", wide = TRUE) 73 | wide_se <- mutual_local(test_data, "u", "g", 74 | weight = "n", wide = TRUE, 75 | se = TRUE, n_bootstrap = 10 76 | ) 77 | 78 | expect_equal(ncol(nowide) + 3, ncol(nowide_se)) 79 | expect_equal(nrow(nowide), 8) 80 | expect_equal(nrow(nowide), nrow(nowide_se)) 81 | expect_equal(nrow(nowide), nrow(wide) * 2) 82 | expect_equal(ncol(wide) + 6, ncol(wide_se)) 83 | expect_equal(nrow(wide), nrow(wide_se)) 84 | 85 | expect_equal(nowide[stat == "ls", est], wide$ls) 86 | expect_equal(nowide[stat == "p", est], wide$p) 87 | 88 | total <- mutual_total(test_data, "u", "g", weight = "n") 89 | expect_equal( 90 | total[stat == "M", est], 91 | sum(nowide[stat == "ls", est] * nowide[stat == "p", "est"]) 92 | ) 93 | expect_equal(total[stat == "M", est], sum(wide$ls * wide$p)) 94 | expect_equal(total[stat == "H", est], sum(wide$ls * wide$p) / entropy(test_data, "u", weight = "n")) 95 | 96 | expect_equal(all(nowide_se[["se"]] > 0), TRUE) 97 | 
}) 98 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_local_expected.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_local_expected") 7 | 8 | data1 <- data.frame( 9 | u = rep(c(1, 2, 3, 4), 2), 10 | g = c(rep("a", 4), rep("b", 4)), 11 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 12 | stringsAsFactors = FALSE 13 | ) 14 | 15 | data2 <- data.frame( 16 | u = c(1:10, 1:10), 17 | g = c(rep("a", 10), rep("b", 10)), 18 | n = c(rep(1, 10), rep(9, 10)), 19 | stringsAsFactors = FALSE 20 | ) 21 | 22 | test_that("works both ways around", { 23 | data1a <- mutual_local_expected(data1, "u", "g", weight = "n") 24 | data1b <- mutual_local_expected(data1, "g", "u", weight = "n") 25 | expect_equal(nrow(data1a), 2) 26 | expect_equal(nrow(data1b), 4) 27 | expect_equal(data1a[, sum(est * p_mean)], data1b[, sum(est * p_mean)], tolerance = 0.01) 28 | 29 | data2a <- mutual_local_expected(data2, "u", "g", weight = "n") 30 | data2b <- mutual_local_expected(data2, "g", "u", weight = "n") 31 | expect_equal(nrow(data2a), 2) 32 | expect_equal(nrow(data2b), 10) 33 | expect_equal(data2a[, sum(est * p_mean)], data2b[, sum(est * p_mean)], tolerance = 0.01) 34 | }) 35 | 36 | test_that("works nested", { 37 | data1$superunit <- data1$u <= 3 38 | a <- mutual_local_expected(data1, "g", c("superunit", "u"), weight = "n") 39 | b <- mutual_local_expected(data1, "g", "u", weight = "n") 40 | 41 | expect_equal(a$est, b$est, tolerance = 0.01) 42 | expect_equal(a$se, b$se, tolerance = 0.01) 43 | expect_equal(length(a$est), 4) 44 | }) 45 | 46 | test_that("fixed margins = FALSE", { 47 | data1a <- mutual_local_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE) 48 | data1b <- mutual_local_expected(data1, "g", "u", weight = "n", fixed_margins = FALSE) 49 | expect_equal(nrow(data1a), 2) 50 | expect_equal(nrow(data1b), 4) 51 | expect_equal(data1a[, sum(est * p_mean)], data1b[, sum(est * p_mean)], tolerance = 0.01) 52 | 53 | data2a <- mutual_local_expected(data2, "u", "g", weight = "n", fixed_margins = FALSE) 54 | data2b <- mutual_local_expected(data2, "g", "u", weight = "n", fixed_margins = FALSE) 55 | expect_equal(nrow(data2a), 2) 56 | expect_equal(nrow(data2b), 10) 57 | expect_equal(data2a[, sum(est * p_mean)], data2b[, sum(est * p_mean)], tolerance = 0.01) 58 | }) 59 | 60 | test_that("aligns with mutual_total_expected", { 61 | expect_equal( 62 | mutual_local_expected(data1, "u", "g", weight = "n")[, sum(est * p_mean)], 63 | mutual_total_expected(data1, "u", "g", weight = "n")[stat == "M under 0", est], 64 | tolerance = 0.01 65 | ) 66 | 67 | expect_equal( 68 | mutual_local_expected(data2, "u", "g", weight = "n")[, sum(est * p_mean)], 69 | mutual_total_expected(data2, "u", "g", weight = "n")[stat == "M under 0", est], 70 | tolerance = 0.01 71 | ) 72 | 73 | expect_equal( 74 | mutual_local_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE)[, sum(est * p_mean)], 75 | mutual_total_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 76 | tolerance = 0.01 77 | ) 78 | 79 | expect_equal( 80 | mutual_local_expected(data2, "u", "g", weight = "n", fixed_margins = FALSE)[, sum(est * p_mean)], 81 | mutual_total_expected(data2, "u", "g", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 82 | tolerance = 0.01 83 | ) 84 | }) 85 | 86 | 87 | test_that("errors", { 88 | 
dat <- data.frame( 89 | u = rep(c(1, 2, 3, 4), 2), 90 | g = c(rep("a", 4), rep("b", 4)), 91 | n = c(40.2, 20, 5, 1, 20, 40, 60, 80), 92 | stringsAsFactors = FALSE 93 | ) 94 | 95 | expect_error( 96 | mutual_local_expected(dat, "g", "u", weight = "n"), 97 | "bootstrap with a total sample size" 98 | ) 99 | }) 100 | -------------------------------------------------------------------------------- /man/mutual_within.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_within} 4 | \alias{mutual_within} 5 | \title{Calculates detailed within-category segregation scores for M and H} 6 | \usage{ 7 | mutual_within( 8 | data, 9 | group, 10 | unit, 11 | within, 12 | weight = NULL, 13 | se = FALSE, 14 | CI = 0.95, 15 | n_bootstrap = 100, 16 | base = exp(1), 17 | wide = FALSE 18 | ) 19 | } 20 | \arguments{ 21 | \item{data}{A data frame.} 22 | 23 | \item{group}{A categorical variable or a vector of variables 24 | contained in \code{data}. Defines the first dimension 25 | over which segregation is computed.} 26 | 27 | \item{unit}{A categorical variable or a vector of variables 28 | contained in \code{data}. Defines the second dimension 29 | over which segregation is computed.} 30 | 31 | \item{within}{A categorical variable or a vector of variables 32 | contained in \code{data} that defines the within-segregation categories.} 33 | 34 | \item{weight}{Numeric. (Default \code{NULL})} 35 | 36 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 37 | standard errors and to apply bias correction. The bias that is reported 38 | has already been applied to the estimates (i.e. the reported estimates are "debiased"). 39 | (Default \code{FALSE})} 40 | 41 | \item{CI}{If \code{se = TRUE}, compute the (CI*100)% confidence interval 42 | in addition to the bootstrap standard error. 43 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 44 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 45 | 46 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 47 | 48 | \item{base}{Base of the logarithm that is used in the calculation. 49 | Defaults to the natural logarithm.} 50 | 51 | \item{wide}{Returns a wide dataframe instead of a long dataframe. 52 | (Default \code{FALSE})} 53 | } 54 | \value{ 55 | Returns a data.table with four rows for each category defined by \code{within}. 56 | The column \code{est} contains four statistics that 57 | are provided for each category: 58 | \code{M} is the within-category M, and \code{p} is the proportion of the category. 59 | Multiplying \code{M} and \code{p} gives the contribution of each within-category 60 | towards the total M. 61 | \code{H} is the within-category H, and \code{ent_ratio} provides the entropy ratio, 62 | defined as \code{EW/E}, where \code{EW} is the within-category entropy, 63 | and \code{E} is the overall entropy. 64 | Multiplying \code{H}, \code{p}, and \code{ent_ratio} gives the contribution of each within-category 65 | towards the total H. 66 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 67 | the associated bootstrapped standard errors, an additional column \code{CI} contains 68 | the confidence interval of the estimate as a list column, an additional column \code{bias} contains 69 | the estimated bias, and the column \code{est} contains the bias-corrected estimates.
70 | If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one 71 | row for each \code{within} category, and the associated statistics in separate columns. 72 | } 73 | \description{ 74 | Calculates the segregation between \code{group} and \code{unit} 75 | within each category defined by \code{within}. 76 | } 77 | \examples{ 78 | \dontrun{ 79 | (within <- mutual_within(schools00, "race", "school", 80 | within = "state", 81 | weight = "n", wide = TRUE 82 | )) 83 | # the M for state "A" is .409 84 | # manual calculation 85 | schools_A <- schools00[schools00$state == "A", ] 86 | mutual_total(schools_A, "race", "school", weight = "n") # M => .409 87 | 88 | # to recover the within M and H from the output, multiply 89 | # p * M and p * ent_ratio * H, respectively 90 | sum(within$p * within$M) # => .326 91 | sum(within$p * within$ent_ratio * within$H) # => .321 92 | # compare with: 93 | mutual_total(schools00, "race", "school", within = "state", weight = "n") 94 | } 95 | } 96 | \references{ 97 | Henri Theil. 1971. Principles of Econometrics. New York: Wiley. 98 | 99 | Ricardo Mora and Javier Ruiz-Castillo. 2011. 100 | "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194. 101 | } 102 | -------------------------------------------------------------------------------- /R/dissimilarity.R: -------------------------------------------------------------------------------- 1 | abs_diff <- function(x) { 2 | if (length(x) == 1) { 3 | abs(x) 4 | } else { 5 | abs(diff(x)) 6 | } 7 | } 8 | 9 | #' @import data.table 10 | dissimilarity_compute <- function(data, group, unit) { 11 | data[, n_group := sum(freq), by = group] 12 | est <- 1 / 2 * data[, abs_diff(freq / n_group), by = unit][, sum(V1)] 13 | data.table(stat = "D", est = est, stringsAsFactors = FALSE) 14 | } 15 | 16 | #' Calculates Index of Dissimilarity 17 | #' 18 | #' Returns the total segregation between \code{group} and \code{unit} using 19 | #' the Index of Dissimilarity. 20 | #' 21 | #' @param data A data frame. 22 | #' @param group A categorical variable or a vector of variables 23 | #' contained in \code{data}. Defines the first dimension 24 | #' over which segregation is computed. The D index only 25 | #' allows two distinct groups. 26 | #' @param unit A categorical variable or a vector of variables 27 | #' contained in \code{data}. Defines the second dimension 28 | #' over which segregation is computed. 29 | #' @param weight Numeric. (Default \code{NULL}) 30 | #' @param se If \code{TRUE}, the segregation estimates are bootstrapped to provide 31 | #' standard errors and to apply bias correction. The bias that is reported 32 | #' has already been applied to the estimates (i.e. the reported estimates are "debiased"). 33 | #' (Default \code{FALSE}) 34 | #' @param CI If \code{se = TRUE}, compute the (CI*100)% confidence interval 35 | #' in addition to the bootstrap standard error. 36 | #' This is based on percentiles of the bootstrap distribution, and a valid interpretation 37 | #' relies on a larger number of bootstrap iterations. (Default \code{0.95}) 38 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 39 | #' @return Returns a data.table with one row. The column \code{est} contains 40 | #' the Index of Dissimilarity.
41 | #' If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 42 | #' the associated bootstrapped standard errors, an additional column \code{CI} contains 43 | #' the confidence interval of the estimate as a list column, an additional column \code{bias} contains 44 | #' the estimated bias, and the column \code{est} contains the bias-corrected estimates. 45 | #' @references 46 | #' Otis Dudley Duncan and Beverly Duncan. 1955. "A Methodological Analysis of Segregation Indexes," 47 | #' American Sociological Review 20(2): 210-217. 48 | #' @examples 49 | #' # Example where D and H deviate 50 | #' m1 <- matrix_to_long(matrix(c(100, 60, 40, 0, 0, 40, 60, 100), ncol = 2)) 51 | #' m2 <- matrix_to_long(matrix(c(80, 80, 20, 20, 20, 20, 80, 80), ncol = 2)) 52 | #' dissimilarity(m1, "group", "unit", weight = "n") 53 | #' dissimilarity(m2, "group", "unit", weight = "n") 54 | #' @import data.table 55 | #' @export 56 | dissimilarity <- function(data, group, unit, weight = NULL, 57 | se = FALSE, CI = 0.95, n_bootstrap = 100) { 58 | stopifnot(CI > 0 & CI < 1) 59 | if (length(unique(data[[group]])) != 2) { 60 | stop("The D index only allows two distinct groups") 61 | } 62 | 63 | d <- prepare_data(data, group, unit, weight) 64 | ret <- dissimilarity_compute(d, group, unit) 65 | 66 | if (se == TRUE) { 67 | vars <- attr(d, "vars") 68 | n_total <- sum(d[["freq"]]) 69 | 70 | if (isTRUE(all.equal(n_total, round(n_total)))) { 71 | message(paste0(n_bootstrap, " bootstrap iterations on ", n_total, " observations")) 72 | } else { 73 | stop(paste0( 74 | "bootstrap with a total sample size that is not an integer is not allowed, ", 75 | "maybe scale your weights?" 76 | )) 77 | } 78 | # draw from a multinomial with weights specified by the cell counts 79 | draws <- stats::rmultinom(n_bootstrap, n_total, d[["freq"]] / n_total) 80 | 81 | boot_ret <- lapply(seq_len(n_bootstrap), function(i) { 82 | if (i %% 5 == 0) update_log(bs_n = i, bs_max = n_bootstrap) 83 | d[, freq := as.double(draws[, i])] 84 | dissimilarity_compute(d[freq > 0], group, unit) 85 | }) 86 | close_log() 87 | boot_ret <- rbindlist(boot_ret) 88 | ret <- bootstrap_summary(ret, boot_ret, "stat", CI) 89 | setattr(ret, "bootstrap", boot_ret) 90 | } 91 | ret 92 | } 93 | -------------------------------------------------------------------------------- /man/mutual_total.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual.R 3 | \name{mutual_total} 4 | \alias{mutual_total} 5 | \title{Calculates the Mutual Information Index M and Theil's Entropy Index H} 6 | \usage{ 7 | mutual_total( 8 | data, 9 | group, 10 | unit, 11 | within = NULL, 12 | weight = NULL, 13 | se = FALSE, 14 | CI = 0.95, 15 | n_bootstrap = 100, 16 | base = exp(1) 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{A data frame.} 21 | 22 | \item{group}{A categorical variable or a vector of variables 23 | contained in \code{data}. Defines the first dimension 24 | over which segregation is computed.} 25 | 26 | \item{unit}{A categorical variable or a vector of variables 27 | contained in \code{data}. Defines the second dimension 28 | over which segregation is computed.} 29 | 30 | \item{within}{A categorical variable or a vector of variables 31 | contained in \code{data}. The variable(s) should be a superset of either 32 | the \code{unit} or the \code{group} for the calculation to be meaningful.
33 | If provided, segregation is 34 | computed within the groups defined by the variable, and then averaged. 35 | (Default \code{NULL})} 36 | 37 | \item{weight}{Numeric. (Default \code{NULL})} 38 | 39 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 40 | standard errors and to apply bias correction. The bias that is reported 41 | has already been applied to the estimates (i.e. the reported estimates are "debiased"). 42 | (Default \code{FALSE})} 43 | 44 | \item{CI}{If \code{se = TRUE}, compute the (CI*100)% confidence interval 45 | in addition to the bootstrap standard error. 46 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 47 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 48 | 49 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 50 | 51 | \item{base}{Base of the logarithm that is used in the calculation. 52 | Defaults to the natural logarithm.} 53 | } 54 | \value{ 55 | Returns a data.table with two rows. The column \code{est} contains 56 | the Mutual Information Index, M, and Theil's Entropy Index, H. The H is 57 | the M divided by the \code{group} entropy. If \code{within} was given, 58 | M and H are weighted averages of the within-category segregation scores. 59 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 60 | the associated bootstrapped standard errors, an additional column \code{CI} contains 61 | the confidence interval of the estimate as a list column, an additional column \code{bias} contains 62 | the estimated bias, and the column \code{est} contains the bias-corrected estimates. 63 | } 64 | \description{ 65 | Returns the total segregation between \code{group} and \code{unit}. 66 | If \code{within} is given, calculates segregation within each 67 | \code{within} category separately, and takes the weighted average. 68 | Also see \code{\link{mutual_within}} for detailed within calculations. 69 | } 70 | \examples{ 71 | # calculate school racial segregation 72 | mutual_total(schools00, "school", "race", weight = "n") # M => .425 73 | 74 | # note that the definition of groups and units is arbitrary 75 | mutual_total(schools00, "race", "school", weight = "n") # M => .425 76 | 77 | # if groups or units are defined by a combination of variables, 78 | # vectors of variable names can be provided - 79 | # here there is no difference, because schools 80 | # are nested within districts 81 | mutual_total(schools00, "race", c("district", "school"), 82 | weight = "n" 83 | ) # M => .424 84 | 85 | # estimate standard errors and 95\% CI for M and H 86 | \dontrun{ 87 | mutual_total(schools00, "race", "school", 88 | weight = "n", 89 | se = TRUE, n_bootstrap = 1000 90 | ) 91 | 92 | # estimate segregation within school districts 93 | mutual_total(schools00, "race", "school", 94 | within = "district", weight = "n" 95 | ) # M => .087 96 | 97 | # estimate between-district racial segregation 98 | mutual_total(schools00, "race", "district", weight = "n") # M => .338 99 | # note that the sum of within-district and between-district 100 | # segregation equals total school-race segregation; 101 | # here, most segregation is between school districts 102 | } 103 | } 104 | \references{ 105 | Henri Theil. 1971. Principles of Econometrics. New York: Wiley. 106 | 107 | Ricardo Mora and Javier Ruiz-Castillo. 2011. 108 | "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
109 | } 110 | -------------------------------------------------------------------------------- /man/ipf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ipf.R 3 | \name{ipf} 4 | \alias{ipf} 5 | \title{Adjustment of marginal distributions using iterative proportional fitting} 6 | \usage{ 7 | ipf( 8 | source, 9 | target, 10 | group, 11 | unit, 12 | weight = NULL, 13 | max_iterations = 100, 14 | precision = 1e-07 15 | ) 16 | } 17 | \arguments{ 18 | \item{source}{A "source" data frame. The marginals of this 19 | dataset are adjusted to the marginals of \code{target}.} 20 | 21 | \item{target}{A "target" data frame. The function returns a dataset 22 | where the marginal distributions of \code{group} and \code{unit} categories 23 | are approximated by those of \code{target}.} 24 | 25 | \item{group}{A categorical variable or a vector of variables 26 | contained in \code{source} and \code{target}. Defines the first distribution 27 | for adjustment.} 28 | 29 | \item{unit}{A categorical variable or a vector of variables 30 | contained in \code{source} and \code{target}. Defines the second distribution 31 | for adjustment.} 32 | 33 | \item{weight}{Numeric. (Default \code{NULL})} 34 | 35 | \item{max_iterations}{Maximum number of iterations used for the IPF algorithm.} 36 | 37 | \item{precision}{Convergence criterion for the IPF algorithm. In every iteration, 38 | the ratios of the source and target marginals are calculated for every category of 39 | \code{group} and \code{unit}. The algorithm converges when all ratios are smaller 40 | than \code{1 + precision}.} 41 | } 42 | \value{ 43 | Returns a data frame that retains 44 | the association structure of \code{source} while approximating 45 | the marginal distributions for \code{group} and \code{unit} of \code{target}. 46 | The dataset identifies each combination of \code{group} and \code{unit}, 47 | and categories that only occur in either \code{source} or \code{target} are dropped. 48 | The adjusted frequency of each combination is given by the column \code{n}, 49 | while \code{n_target} and \code{n_source} contain the zero-adjusted frequencies 50 | in the target and source dataset, respectively. 51 | } 52 | \description{ 53 | Adjusts the marginal distributions for \code{group} and \code{unit} 54 | in \code{source} to the respective marginal distributions in \code{target}, using the iterative 55 | proportional fitting algorithm (IPF). 56 | } 57 | \details{ 58 | The algorithm works by scaling 59 | the marginal distribution of \code{group} in the \code{source} data frame towards the 60 | marginal distribution of \code{target}; then repeating this process for \code{unit}. The 61 | algorithm then keeps alternating between \code{group} and \code{unit} until the marginals 62 | of the adjusted data frame are within the allowed precision. This results in a dataset that 63 | retains the association structure of \code{source} while approximating 64 | the marginal distribution of \code{target}. If the number of \code{unit} and 65 | \code{group} categories is different in \code{source} and \code{target}, the returned data frame 66 | contains only the combinations of \code{unit} and \code{group} categories that occur in both datasets. 67 | Zero values are replaced by a small, non-zero number (1e-4). 68 | Note that the values returned sum to the observations of the source data frame, not the 69 | target data frame.
This is different from other IPF implementations, but ensures that the IPF 70 | does not change the number of observations. 71 | } 72 | \examples{ 73 | \dontrun{ 74 | # adjusts the marginals of group and unit categories so that 75 | # schools00 has similar marginals as schools05 76 | adj <- ipf(schools00, schools05, "race", "school", weight = "n") 77 | 78 | # check that the new "race" marginals are similar to the target marginals 79 | # (the same could be done for schools) 80 | aggregate(adj$n, list(adj$race), sum) 81 | aggregate(adj$n_target, list(adj$race), sum) 82 | 83 | # note that the adjusted dataset contains fewer 84 | # schools than either the source or the target dataset, 85 | # because the marginals are only defined for the overlap 86 | # of schools 87 | length(unique(schools00$school)) 88 | length(unique(schools05$school)) 89 | length(unique(adj$school)) 90 | } 91 | } 92 | \references{ 93 | W. E. Deming and F. F. Stephan. 1940. 94 | "On a Least Squares Adjustment of a Sampled Frequency Table 95 | When the Expected Marginal Totals are Known". 96 | Annals of Mathematical Statistics. 11 (4): 427–444. 97 | 98 | T. Karmel and M. Maclachlan. 1988. 99 | "Occupational Sex Segregation — Increasing or Decreasing?" Economic Record 64: 187-195. 100 | } 101 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_total_expected.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_total_expected") 7 | 8 | data1 <- data.frame( 9 | u = rep(c(1, 2, 3, 4), 2), 10 | g = c(rep("a", 4), rep("b", 4)), 11 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 12 | stringsAsFactors = FALSE 13 | ) 14 | 15 | data2 <- data.frame( 16 | u = c(1:10, 1:10), 17 | g = c(rep("a", 10), rep("b", 10)), 18 | n = c(rep(1, 10), rep(9, 10)), 19 | stringsAsFactors = FALSE 20 | ) 21 | 22 | test_that("works both ways around", { 23 | expect_equal( 24 | mutual_total_expected(data1, "u", "g", weight = "n")[stat == "M under 0", est], 25 | mutual_total_expected(data1, "g", "u", weight = "n")[stat == "M under 0", est], 26 | tolerance = 0.01 27 | ) 28 | 29 | expect_equal( 30 | mutual_total_expected(data2, "u", "g", weight = "n")[stat == "M under 0", est], 31 | mutual_total_expected(data2, "g", "u", weight = "n")[stat == "M under 0", est], 32 | tolerance = 0.05 33 | ) 34 | }) 35 | 36 | test_that("works nested", { 37 | data1$superunit <- data1$u <= 3 38 | a <- mutual_total_expected(data1, "g", c("superunit", "u"), weight = "n") 39 | b <- mutual_total_expected(data1, "g", "u", weight = "n") 40 | 41 | expect_equal(a$est, b$est, tolerance = 0.01) 42 | expect_equal(a$se, b$se, tolerance = 0.01) 43 | expect_equal(length(a$est), 2) 44 | }) 45 | 46 | test_that("fixed margins = FALSE", { 47 | expect_equal( 48 | mutual_total_expected(data1, "u", "g", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 49 | mutual_total_expected(data1, "g", "u", weight = "n", fixed_margins = FALSE)[stat == "M under 0", est], 50 | tolerance = 0.01 51 | ) 52 | }) 53 | 54 | test_that("within argument", { 55 | within <- mutual_total_expected(school_ses, "ethnic_group", "school_id", within = "ses_quintile") 56 | # manually 57 | d <- data.table::as.data.table(school_ses) 58 | manually <- d[, mutual_total_expected(.SD, "ethnic_group", "school_id"), by = .(ses_quintile)] 59 | p <- d[, .(p = .N), by = .(ses_quintile)][, .(ses_quintile, p = p / sum(p))] 60 | 
manually <- merge(manually, p) 61 | manually <- manually[, .(est_manual = sum(est * p)), by = .(stat)] 62 | compare <- merge(within, manually) 63 | expect_equal(compare$est, compare$est_manual, tolerance = 0.01) 64 | }) 65 | 66 | 67 | test_that("dissimilarity", { 68 | expect_error(dissimilarity_expected(data1, "u", "g", weight = "n")) 69 | expect_equal(dissimilarity_expected(data1, "g", "u", n_bootstrap = 500, weight = "n")$est, 70 | 0.098, 71 | tolerance = 0.1 72 | ) 73 | expect_equal( 74 | dissimilarity_expected(data1, "g", "u", 75 | n_bootstrap = 500, weight = "n", 76 | fixed_margins = FALSE 77 | )$est, 78 | 0.1003, 79 | tolerance = 0.1 80 | ) 81 | }) 82 | 83 | test_that("dissimilarity - Winship 1977", { 84 | # see table 2 85 | mat <- matrix(c(rep(1, 1000), rep(9, 1000)), ncol = 2) 86 | d <- matrix_to_long(mat) 87 | expect_equal( 88 | dissimilarity_expected(d, "group", "unit", weight = "n")$est, 89 | 0.387, 90 | tolerance = 0.1 91 | ) 92 | 93 | mat <- matrix(c(rep(5, 1000), rep(5, 1000)), ncol = 2) 94 | d <- matrix_to_long(mat) 95 | expect_equal( 96 | dissimilarity_expected(d, "group", "unit", weight = "n")$est, 97 | 0.246, 98 | tolerance = 0.1 99 | ) 100 | 101 | mat <- matrix(c(rep(10, 1000), rep(90, 1000)), ncol = 2) 102 | d <- matrix_to_long(mat) 103 | expect_equal( 104 | dissimilarity_expected(d, "group", "unit", weight = "n")$est, 105 | 0.131, 106 | tolerance = 0.1 107 | ) 108 | }) 109 | 110 | test_that("errors", { 111 | dat <- data.frame( 112 | u = rep(c(1, 2, 3, 4), 2), 113 | g = c(rep("a", 4), rep("b", 4)), 114 | n = c(40.2, 20, 5, 1, 20, 40, 60, 80), 115 | stringsAsFactors = FALSE 116 | ) 117 | 118 | expect_error( 119 | mutual_total_expected(dat, "g", "u", weight = "n"), 120 | "bootstrap with a total sample size" 121 | ) 122 | expect_error( 123 | dissimilarity_expected(dat, "g", "u", weight = "n"), 124 | "bootstrap with a total sample size" 125 | ) 126 | 127 | dat_within <- data.frame( 128 | u = c(rep("a", 4), rep("b", 4)), 129 | g = rep(c(1, 2), 4), 130 | supergroup = rep(c(12, 12, 34, 34), 2), 131 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 132 | stringsAsFactors = FALSE 133 | ) 134 | 135 | expect_message(mutual_total_expected(schools00, "race", "school", 136 | within = "district", weight = "n", 137 | n_bootstrap = 10 138 | ), "singleton items") 139 | }) 140 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_within.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_within") 7 | 8 | test_data <- data.frame( 9 | u = c(rep("a", 4), rep("b", 4)), 10 | g = rep(c(1, 2, 3, 4), 2), 11 | supergroup = rep(c(12, 12, 34, 34), 2), 12 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 13 | stringsAsFactors = FALSE 14 | ) 15 | 16 | test_that("dimensions and bootstrapping", { 17 | within <- mutual_within(test_data, "u", "g", 18 | within = "supergroup", weight = "n" 19 | ) 20 | expect_equal(dim(within), c(2 * 4, 3)) 21 | 22 | within_se <- mutual_within(test_data, "u", "g", 23 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 24 | ) 25 | expect_equal(dim(within_se), c(2 * 4, 6)) 26 | }) 27 | 28 | test_that("bootstrap attributes exists", { 29 | within_se <- mutual_within(test_data, "u", "g", 30 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 31 | ) 32 | 33 | expect_equal(dim(attr(within_se, "bootstrap")), c(10 * length(unique(test_data$supergroup)) * 4, 3)) 
34 | }) 35 | 36 | test_that("bootstrapping fails when sample size is non-integer", { 37 | test_data <- data.frame( 38 | u = c(rep("a", 4), rep("b", 4)), 39 | g = rep(c(1, 2, 3, 4), 2), 40 | supergroup = rep(c(12, 12, 34, 34), 2), 41 | n = c(40.7, 20, 5, 1, 20.5, 40, 60, 80), 42 | stringsAsFactors = FALSE 43 | ) 44 | 45 | expect_error(mutual_within(test_data, "u", "g", 46 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 47 | )) 48 | # rescale 49 | test_data$n2 <- test_data$n / sum(test_data$n) * round(sum(test_data$n)) 50 | ret <- mutual_within(test_data, "u", "g", 51 | within = "supergroup", weight = "n2", se = TRUE, n_bootstrap = 10 52 | ) 53 | expect_equal(dim(ret), c(2 * 4, 6)) 54 | }) 55 | 56 | test_that("between + within = total", { 57 | total <- mutual_total(test_data, "u", "g", within = "supergroup", weight = "n") 58 | m <- total[stat == "M", est] 59 | h <- total[stat == "H", est] 60 | 61 | within <- mutual_within(test_data, "u", "g", within = "supergroup", weight = "n") 62 | within <- unstack(within, form = est ~ stat) 63 | 64 | expect_equal(m, sum(within$p * within$M)) 65 | expect_equal(h, sum(within$p * within$ent_ratio * within$H)) 66 | 67 | within_wide <- mutual_within(test_data, "u", "g", 68 | within = "supergroup", 69 | weight = "n", wide = T 70 | ) 71 | expect_equal(sum(within$p * within$M), sum(within_wide$p * within_wide$M)) 72 | expect_equal( 73 | sum(within$p * within$ent_ratio * within$H), 74 | sum(within_wide$p * within_wide$ent_ratio * within_wide$H) 75 | ) 76 | 77 | # H is between 0 and 1 78 | expect_equal(all(within$H >= 0 & within$H <= 1), TRUE) 79 | expect_equal(all(h >= 0 & h <= 1), TRUE) 80 | }) 81 | 82 | test_that("option wide works", { 83 | nowide <- mutual_within(test_data, "u", "g", 84 | within = "supergroup", weight = "n" 85 | ) 86 | nowide_se <- mutual_within(test_data, "u", "g", 87 | within = "supergroup", weight = "n", se = TRUE, n_bootstrap = 10 88 | ) 89 | wide <- mutual_within(test_data, "u", "g", 90 | within = "supergroup", weight = "n", wide = TRUE 91 | ) 92 | wide_se <- mutual_within(test_data, "u", "g", 93 | within = "supergroup", weight = "n", wide = TRUE, se = TRUE, n_bootstrap = 10 94 | ) 95 | 96 | expect_equal(ncol(nowide) + 3, ncol(nowide_se)) 97 | expect_equal(nrow(nowide), 2 * 4) 98 | expect_equal(nrow(nowide), nrow(nowide_se)) 99 | 100 | expect_equal(ncol(wide) + 3 * 4, ncol(wide_se)) 101 | expect_equal(nrow(wide), 2) 102 | expect_equal(nrow(wide), nrow(wide_se)) 103 | 104 | expect_equal(nowide[stat == "M", est], wide$M) 105 | expect_equal(nowide[stat == "p", est], wide$p) 106 | expect_equal(nowide[stat == "H", est], wide$H) 107 | expect_equal(nowide[stat == "ent_ratio", est], wide$ent_ratio) 108 | 109 | total <- mutual_total(test_data, "u", "g", within = "supergroup", weight = "n") 110 | expect_equal( 111 | total[stat == "M", est], 112 | sum(nowide[stat == "M", est] * nowide[stat == "p", est]) 113 | ) 114 | expect_equal(total[stat == "M", est], sum(wide$M * wide$p)) 115 | expect_equal( 116 | total[stat == "M", est], 117 | sum(wide$H * wide$p * wide$ent_ratio * entropy(test_data, "u", weight = "n")) 118 | ) 119 | 120 | expect_equal( 121 | total[stat == "H", est], 122 | sum(nowide[stat == "H", est] * 123 | nowide[stat == "p", est] * 124 | nowide[stat == "ent_ratio", est]) 125 | ) 126 | expect_equal(total[stat == "H", est], sum(wide$H * wide$p * wide$ent_ratio)) 127 | 128 | expect_equal(all(nowide_se[["se"]] > 0), TRUE) 129 | }) 130 | -------------------------------------------------------------------------------- 
/tests/testthat/test_ipf.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_ipf") 7 | 8 | test_that("different precisions", { 9 | # reduce to overlap sample 10 | schools00_r <- schools00[schools00$school %in% schools05$school, ] 11 | schools05_r <- schools05[schools05$school %in% schools00$school, ] 12 | 13 | for (precision in c(.1, .01, .001)) { 14 | adj <- ipf(schools00_r, schools05_r, "race", "school", weight = "n", precision = precision) 15 | 16 | # check that the new "race" marginals are similar to the target marginals 17 | new <- aggregate(adj$n, list(adj$race), sum) 18 | old <- aggregate(schools05_r$n, list(schools05_r$race), sum) 19 | old <- old[match(new$Group.1, old$Group.1), ] 20 | new <- new$x / sum(new$x) 21 | old <- old$x / sum(old$x) 22 | expect_true(all(abs(new - old) < precision)) 23 | 24 | # check that the new "school" marginals are similar to the target marginals 25 | new <- aggregate(adj$n, list(adj$school), sum) 26 | old <- aggregate(schools05_r$n, list(schools05_r$school), sum) 27 | old <- old[match(new$Group.1, old$Group.1), ] 28 | new <- new$x / sum(new$x) 29 | old <- old$x / sum(old$x) 30 | expect_true(all(abs(new - old) < precision)) 31 | } 32 | }) 33 | 34 | test_that("warn if iterations are too low", { 35 | expect_error( 36 | suppressWarnings( 37 | ipf(schools00, schools05, "race", "school", 38 | weight = "n", 39 | precision = .00001, max_iterations = 1 40 | ) 41 | ) 42 | ) 43 | }) 44 | 45 | test_that("gives sames results as mutual_difference", { 46 | diff <- mutual_difference(schools00, schools05, 47 | group = "race", unit = "school", 48 | weight = "n", method = "km", precision = 0.000001 49 | ) 50 | # what changed from 2000 to 2005? 
51 | # first reduce to overlap sample 52 | schools00_r <- schools00[schools00$school %in% schools05$school, ] 53 | schools05_r <- schools05[schools05$school %in% schools00$school, ] 54 | M_00 <- mutual_total(schools00_r, "race", "school", weight = "n")[stat == "M", est] 55 | M_05 <- mutual_total(schools05_r, "race", "school", weight = "n")[stat == "M", est] 56 | # adjust the 2000 margins to the 2005 margins 57 | # if only the margins changed, then this would explain all the difference 58 | adj_00 <- ipf(schools00_r, schools05_r, "race", "school", weight = "n", precision = 0.00001) 59 | M_margins <- mutual_total(adj_00, "race", "school", weight = "n")[stat == "M", est] 60 | structural_change <- M_05 - M_margins 61 | # test 62 | expect_equal(diff[stat == "M1", est] + diff[stat == "removals", est], M_00) 63 | expect_equal(diff[stat == "M2", est] - diff[stat == "additions", est], M_05) 64 | expect_equal(structural_change, diff[stat == "structural", est], tolerance = .001) 65 | expect_equal( 66 | M_05 - M_00 - structural_change, 67 | diff[stat %in% c("unit_marginal", "group_marginal", "interaction"), sum(est)], 68 | tolerance = .001 69 | ) 70 | }) 71 | 72 | test_that("example from Karmel & Maclachlan 1988", { 73 | source <- data.frame( 74 | occ = rep(c(1, 2, 3), 2), 75 | gender = c(rep("male", 3), rep("female", 3)), 76 | n = c(100, 50, 100, 50, 50, 50) 77 | ) 78 | target <- data.frame( 79 | occ = rep(c(1, 2, 3), 2), 80 | gender = c(rep("male", 3), rep("female", 3)), 81 | n = c(125, 100, 100, 100, 100, 75) 82 | ) 83 | adj <- ipf(source, target, "occ", "gender", "n", precision = 0.0000000001) 84 | # K-M report on the scale of the target distribution 85 | adj$n <- round(adj$n / sum(adj$n) * sum(adj$n_target), 1) 86 | expect_equal(adj[adj$gender == "male" & adj$occ == 1, "n"][[1]], 134.7) 87 | expect_equal(adj[adj$gender == "male" & adj$occ == 2, "n"][[1]], 85.5) 88 | expect_equal(adj[adj$gender == "male" & adj$occ == 3, "n"][[1]], 104.8) 89 | expect_equal(adj[adj$gender == "female" & adj$occ == 1, "n"][[1]], 90.3) 90 | expect_equal(adj[adj$gender == "female" & adj$occ == 2, "n"][[1]], 114.5) 91 | expect_equal(adj[adj$gender == "female" & adj$occ == 3, "n"][[1]], 70.2) 92 | 93 | expect_equal( 94 | sum(adj[adj$gender == "male", "n"]), 95 | sum(adj[adj$gender == "male", "n_target"]) 96 | ) 97 | expect_equal( 98 | sum(adj[adj$gender == "female", "n"]), 99 | sum(adj[adj$gender == "female", "n_target"]) 100 | ) 101 | expect_equal(sum(adj[adj$occ == 1, "n"]), sum(adj[adj$occ == 1, "n_target"])) 102 | expect_equal(sum(adj[adj$occ == 2, "n"]), sum(adj[adj$occ == 2, "n_target"])) 103 | expect_equal(sum(adj[adj$occ == 3, "n"]), sum(adj[adj$occ == 3, "n_target"])) 104 | }) 105 | 106 | test_that("warning about units and groups being dropped", { 107 | expect_warning(ipfd <- ipf(schools00, schools05, "race", "school", weight = "n")) 108 | expect_equal(sum(ipfd$n), sum(ipfd$n_source)) 109 | }) 110 | 111 | test_that("returns same number of observations as before", { 112 | # schools are dropped here 113 | suppressWarnings(ipfd <- ipf(schools00, schools05, "race", "school", weight = "n")) 114 | expect_equal(sum(ipfd$n), sum(ipfd$n_source)) 115 | 116 | # reduce to overlap sample, because then by definition the counts are identical 117 | schools00_r <- schools00[schools00$school %in% schools05$school, ] 118 | schools05_r <- schools05[schools05$school %in% schools00$school, ] 119 | 120 | ipfd <- ipf(schools00_r, schools05_r, "race", "school", weight = "n") 121 | expect_equal(sum(ipfd$n), sum(ipfd$n_source)) 122 | 
expect_equal(sum(schools00_r$n), sum(ipfd$n)) 123 | expect_equal(sum(schools00_r$n), sum(ipfd$n_source)) 124 | expect_equal(sum(schools05_r$n), sum(ipfd$n_target)) 125 | }) 126 | -------------------------------------------------------------------------------- /vignettes/faq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "FAQ" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{FAQ} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | 16 | data.table::setDTthreads(1) 17 | 18 | # skip this vignette on CRAN etc. 19 | BUILD_VIGNETTE <- identical(Sys.getenv("BUILD_VIGNETTE"), "true") 20 | knitr::opts_chunk$set(eval = BUILD_VIGNETTE) 21 | 22 | library("dplyr") 23 | library("data.table") 24 | library("tigris") 25 | library("segregation") 26 | options(tigris_use_cache = TRUE) 27 | schools00 <- schools00 28 | ``` 29 | 30 | ## Can index X be added to the package? 31 | 32 | Adding new segregation indices is usually not much trouble. Please 33 | [open an issue](https://github.com/elbersb/segregation/issues) on GitHub 34 | to request that an index be added. 35 | 36 | ## How can I compute indices for different areas at once? 37 | 38 | If you use the `dplyr` package, one pattern that works well is 39 | to use `group_modify`. Here, we compute the pairwise Black-White 40 | dissimilarity index for each state separately: 41 | 42 | ```{r} 43 | library("segregation") 44 | library("dplyr") 45 | 46 | schools00 %>% 47 | filter(race %in% c("black", "white")) %>% 48 | group_by(state) %>% 49 | group_modify(~ dissimilarity( 50 | data = .x, 51 | group = "race", 52 | unit = "school", 53 | weight = "n" 54 | )) 55 | ``` 56 | 57 | A similar pattern also works well with `data.table`: 58 | 59 | ```{r} 60 | library("data.table") 61 | 62 | schools00 <- as.data.table(schools00) 63 | schools00[ 64 | race %in% c("black", "white"), 65 | dissimilarity(data = .SD, group = "race", unit = "school", weight = "n"), 66 | by = .(state) 67 | ] 68 | ``` 69 | 70 | To compute many decompositions at once, it's easiest 71 | to combine the data for the two time points. For instance, 72 | here's a `dplyr` solution to decompose the state-specific 73 | M indices between 2000 and 2005: 74 | 75 | ```{r} 76 | # helper function for decomposition 77 | diff <- function(df, group) { 78 | data1 <- filter(df, year == 2000) 79 | data2 <- filter(df, year == 2005) 80 | mutual_difference(data1, data2, group = "race", unit = "school", weight = "n") 81 | } 82 | 83 | # add year indicators 84 | schools00$year <- 2000 85 | schools05$year <- 2005 86 | combine <- bind_rows(schools00, schools05) 87 | 88 | combine %>% 89 | group_by(state) %>% 90 | group_modify(diff) %>% 91 | head(5) 92 | ``` 93 | 94 | Again, here's a `data.table` solution: 95 | 96 | ```{r} 97 | setDT(combine) 98 | combine[, diff(.SD), by = .(state)] %>% head(5) 99 | ``` 100 | 101 | ## How can I use Census data from `tidycensus` to compute segregation indices? 102 | 103 | Here are a few examples thanks to [Kyle Walker](https://twitter.com/kyle_e_walker/status/1392188844724809728), the author of the [tidycensus](https://walker-data.com/tidycensus/articles/basic-usage.html) package. 
104 | 105 | First, download the data: 106 | 107 | ```{r} 108 | library("tidycensus") 109 | 110 | cook_data <- get_acs( 111 | geography = "tract", 112 | variables = c( 113 | white = "B03002_003", 114 | black = "B03002_004", 115 | asian = "B03002_006", 116 | hispanic = "B03002_012" 117 | ), 118 | state = "IL", 119 | county = "Cook" 120 | ) 121 | ``` 122 | 123 | Because this data is in "long" format, it's easy to compute segregation indices: 124 | 125 | ```{r} 126 | # compute index of dissimilarity 127 | cook_data %>% 128 | filter(variable %in% c("black", "white")) %>% 129 | dissimilarity( 130 | group = "variable", 131 | unit = "GEOID", 132 | weight = "estimate" 133 | ) 134 | 135 | # compute multigroup M/H indices 136 | cook_data %>% 137 | mutual_total( 138 | group = "variable", 139 | unit = "GEOID", 140 | weight = "estimate" 141 | ) 142 | ``` 143 | 144 | Producing a map of local segregation scores is also straightforward: 145 | 146 | ```{r fig.width=7, fig.height=7} 147 | library("tigris") 148 | library("ggplot2") 149 | 150 | local_seg <- mutual_local(cook_data, 151 | group = "variable", 152 | unit = "GEOID", 153 | weight = "estimate", 154 | wide = TRUE 155 | ) 156 | 157 | # download shapefile 158 | seg_geom <- tracts("IL", "Cook", cb = TRUE, progress_bar = FALSE) %>% 159 | left_join(local_seg, by = "GEOID") 160 | 161 | ggplot(seg_geom, aes(fill = ls)) + 162 | geom_sf(color = NA) + 163 | coord_sf(crs = 3435) + 164 | scale_fill_viridis_c() + 165 | theme_void() + 166 | labs( 167 | title = "Local segregation scores for Cook County, IL", 168 | fill = NULL 169 | ) 170 | ``` 171 | 172 | ## Can I compute local segregation scores for the H index? 173 | 174 | See [this paper](https://osf.io/preprints/socarxiv/3juyc) for more information. The short 175 | answer is that you can divide the local segregation scores of the M index by the entropy 176 | of the group distribution. The proportion-weighted average of these scores then equals the H index, as 177 | the H index is just the M index divided by the entropy of the group distribution. 178 | 179 | Here's an example: 180 | 181 | ```{r} 182 | (mutual_total(schools00, "race", "school", weight = "n")) 183 | 184 | local <- mutual_local(schools00, "race", "school", weight = "n", wide = TRUE) 185 | (local[, sum(p * ls)]) # same as M index above 186 | local[, ls_H := ls / entropy(schools00, "race", weight = "n")] 187 | (local[, sum(p * ls_H)]) # same as H index above 188 | ``` 189 | 190 | 191 | ## How can I compute margins-adjusted local segregation scores? 192 | 193 | When using `mutual_difference`, supply `method = "shapley_detailed"` 194 | to get two different local segregation scores that are margins-adjusted 195 | (one comes from adjusting the margins forward, the other from adjusting 196 | them backward). 
By averaging them we can create a single margins-adjusted 197 | local segregation score: 198 | 199 | ```{r} 200 | diff <- mutual_difference(schools00, schools05, "race", "school", 201 | weight = "n", method = "shapley_detailed" 202 | ) 203 | 204 | diff[stat %in% c("ls_diff1", "ls_diff2"), 205 | .(ls_diff_adjusted = mean(est)), 206 | by = .(school) 207 | ] 208 | ``` 209 | -------------------------------------------------------------------------------- /tests/testthat/test_compression.R: -------------------------------------------------------------------------------- 1 | if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_compression") 7 | 8 | subset <- schools00[1:50, ] 9 | data.table::setDT(subset) 10 | n_schools <- length(unique(subset$school)) 11 | 12 | all_neighbors <- unique(subset$school) 13 | all_neighbors <- expand.grid(a = all_neighbors, b = all_neighbors) 14 | res_all <- compress(subset, "race", "school", weight = "n", neighbors = all_neighbors) 15 | 16 | test_that("result is the same with no neighbors given", { 17 | res2 <- compress(subset, "race", "school", neighbors = "all", weight = "n") 18 | expect_equal(res_all$iterations, res2$iterations) 19 | }) 20 | 21 | test_that("compress works", { 22 | # 16 merges 23 | expect_equal(nrow(res_all$iterations), 16) 24 | # the M value declines continuously 25 | expect_equal(all(res_all$iterations$M[2:16] < res_all$iterations$M[1:15]), TRUE) 26 | # the number of units is correct 27 | expect_equal(res_all$iterations$N_units[[1]], n_schools - 1) 28 | }) 29 | 30 | test_that("print", { 31 | expect_output(print(res_all), "17 units") 32 | expect_output(print(res_all), "Threshold 99%") 33 | }) 34 | 35 | test_that("get_crosswalk works", { 36 | expect_error( 37 | get_crosswalk(schools00), 38 | "either n_units or percent has to be given" 39 | ) 40 | expect_error( 41 | get_crosswalk(res_all, n_units = -1), 42 | "n_units is out of bounds" 43 | ) 44 | expect_error( 45 | get_crosswalk(res_all, n_units = 20), 46 | "n_units is out of bounds" 47 | ) 48 | expect_error( 49 | get_crosswalk(res_all, n_units = 3, percent = 0.2), 50 | "only n_units or percent has to be given" 51 | ) 52 | 53 | expect_equal(nrow(get_crosswalk(res_all, n_units = 1)), n_schools) 54 | expect_equal(nrow(get_crosswalk(res_all, n_units = 5)), n_schools) 55 | expect_equal(nrow(get_crosswalk(res_all, n_units = 15)), n_schools) 56 | expect_equal(nrow(get_crosswalk(res_all, percent = 0.1)), n_schools) 57 | expect_equal(nrow(get_crosswalk(res_all, percent = 0.6)), n_schools) 58 | expect_equal(nrow(get_crosswalk(res_all, percent = 0.9)), n_schools) 59 | expect_equal(as.character(get_crosswalk(res_all, n_units = 1)$new), rep("M1", 17)) 60 | }) 61 | 62 | test_that("parts", { 63 | # get_crosswalk 64 | res_no_parts <- get_crosswalk(res_all, percent = 0.6) 65 | res_parts <- get_crosswalk(res_all, percent = 0.6, parts = TRUE) 66 | expect_equal(names(res_no_parts), c("school", "new")) 67 | expect_equal(names(res_parts), c("school", "new", "parts")) 68 | expect_equal(res_no_parts, res_parts[, -"parts"]) 69 | 70 | res_no_parts <- get_crosswalk(res_all, percent = 0.99) 71 | res_parts <- get_crosswalk(res_all, percent = 0.99, parts = TRUE) 72 | expect_equal(res_no_parts, res_parts[, -"parts"]) 73 | 74 | expect_true(all(!is.na(res_parts[grepl("^M", new)][["parts"]]))) 75 | 76 | # merge_units 77 | merged_no_parts <- merge_units(res_all, percent = .8) 78 | merged_parts <- merge_units(res_all, percent = .8, parts = TRUE) 79 | 
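# apart from the extra parts column, the merged data should be identical: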
expect_equal(merged_no_parts, merged_parts[, -"parts"]) 80 | expect_equal(names(merged_no_parts), c("school", "race", "n")) 81 | expect_equal(names(merged_parts), c("school", "race", "n", "parts")) 82 | }) 83 | 84 | test_that("compress edge case", { 85 | res_edge <- compress(subset, "race", "school", neighbors = "all", weight = "n", max_iter = 1) 86 | expect_equal(nrow(get_crosswalk(res_edge, n_units = 16)), n_schools) 87 | }) 88 | 89 | test_that("merge_units", { 90 | merged <- merge_units(res_all, percent = 0.8) 91 | new_units_cw <- sort(unique(get_crosswalk(res_all, percent = 0.8)$new)) 92 | new_units_merged <- sort(unique(merged$school)) 93 | expect_equal(new_units_cw, new_units_merged) 94 | }) 95 | 96 | test_that("percent works", { 97 | M_full <- mutual_total(subset, "race", "school", weight = "n")[stat == "M"][["est"]] 98 | 99 | for (pct in seq(0.1, 0.9, by = 0.05)) { 100 | merged_pct <- merge_units(res_all, percent = pct) 101 | M_pct <- mutual_total(merged_pct, "race", "school", weight = "n")[stat == "M"][["est"]] 102 | pct_M <- M_pct / M_full 103 | expect_true(pct_M > pct) 104 | expect_equal(res_all$iterations[N_units == merged_pct[, uniqueN(school)]][["pct_M"]], pct_M) 105 | } 106 | }) 107 | 108 | test_that("merge_units edge case", { 109 | res_edge <- compress(subset, "race", "school", neighbors = "all", weight = "n", max_iter = 1) 110 | merged <- merge_units(res_edge, n_units = 16) 111 | # replicate manual merge 112 | units <- c(res_edge$iterations$old_unit, res_edge$iterations$new_unit) 113 | merged_manually <- subset[school %in% units, .(n = sum(n)), by = .(race)] 114 | merged_algo <- merged[school == "M1" & n != 0][, -"school"] 115 | expect_equal(merged_manually, merged_algo) 116 | }) 117 | 118 | test_that("scree plot", { 119 | if (requireNamespace("ggplot2", quietly = TRUE)) { 120 | plot <- scree_plot(res_all) 121 | expect_equal(nrow(plot$data), n_schools) 122 | 123 | plot <- scree_plot(res_all, tail = 3) 124 | expect_equal(nrow(plot$data), 3) 125 | } 126 | }) 127 | 128 | test_that("data set names", { 129 | subset <- schools00[1:50, ] 130 | data.table::setDT(subset) 131 | names(subset) <- c("state", "district", "unit", "group", "n") 132 | res <- compress(subset, "group", "unit", neighbors = "all", weight = "n") 133 | 134 | expect_equal(nrow(res$iterations), 16) 135 | }) 136 | 137 | test_that("local neighbors", { 138 | subset <- schools00[1:500, ] 139 | data.table::setDT(subset) 140 | res_local <- compress(subset, "race", "school", neighbors = "local", n_neighbors = 100, weight = "n") 141 | res_local_small <- compress(subset, "race", "school", neighbors = "local", n_neighbors = 5, weight = "n") 142 | res_all <- compress(subset, "race", "school", neighbors = "all", weight = "n") 143 | 144 | expect_equal(res_local$iterations$old_unit, res_all$iterations$old_unit) 145 | expect_true( 146 | res_local_small$iterations[pct_M > 0.99][.N][["N_units"]] > 147 | res_local$iterations[pct_M > 0.99][.N][["N_units"]] 148 | ) 149 | }) 150 | 151 | test_that("dendrogram", { 152 | dend <- as.dendrogram(res_all) 153 | expect_equal(attr(dend, "height"), res_all$iterations$M[[1]]) 154 | expect_equal(attr(dend, "members"), length(unique(res_all$data$school))) 155 | 156 | res_limited <- compress(subset, "race", "school", weight = "n", max_iter = 5) 157 | expect_error(as.dendrogram(res_limited)) 158 | }) 159 | -------------------------------------------------------------------------------- /tests/testthat/test_mutual_total.R: -------------------------------------------------------------------------------- 1 | 
if (!identical(Sys.getenv("NOT_CRAN"), "true")) { 2 | return() 3 | } 4 | 5 | library("segregation") 6 | context("test_mutual_total") 7 | 8 | test_data <- data.frame( 9 | u = c(rep("a", 4), rep("b", 4)), 10 | g = rep(c(1, 2, 3, 4), 2), 11 | supergroup = rep(c(12, 12, 34, 34), 2), 12 | n = c(40, 20, 5, 1, 20, 40, 60, 80), 13 | stringsAsFactors = FALSE 14 | ) 15 | 16 | test_that("mutual M works both ways around", { 17 | expect_equal( 18 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], 19 | mutual_total(test_data, "g", "u", weight = "n")[stat == "M", est] 20 | ) 21 | 22 | expanded <- test_data[rep(seq_len(nrow(test_data)), test_data$n), 1:3] 23 | expect_equal( 24 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], 25 | mutual_total(expanded, "u", "g")[stat == "M", est] 26 | ) 27 | 28 | expect_equal( 29 | mutual_total(expanded, "u", "g")[stat == "M", est], 30 | mutual_total(expanded, "g", "u")[stat == "M", est] 31 | ) 32 | 33 | expect_equal( 34 | mutual_total(test_data, "u", "g", weight = "n", base = 2)[stat == "M", est], 35 | mutual_total(test_data, "g", "u", weight = "n", base = 2)[stat == "M", est] 36 | ) 37 | 38 | expect_equal( 39 | mutual_total(test_data, "u", c("supergroup", "g"), weight = "n")[stat == "M", est], 40 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est] 41 | ) 42 | 43 | expect_equal( 44 | mutual_total(test_data, "u", c("supergroup", "g"), weight = "n")[stat == "M", est], 45 | mutual_total(test_data, c("supergroup", "g"), "u", weight = "n")[stat == "M", est] 46 | ) 47 | }) 48 | 49 | test_that("between + within = total", { 50 | expect_equal( 51 | mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], 52 | mutual_total(test_data, "u", "supergroup", weight = "n")[stat == "M", est] + 53 | mutual_total(test_data, "u", "g", within = "supergroup", weight = "n")[stat == "M", est] 54 | ) 55 | expect_equal( 56 | mutual_total(test_data, "u", "g", weight = "n")[stat == "H", est], 57 | mutual_total(test_data, "u", "supergroup", weight = "n")[stat == "H", est] + 58 | mutual_total(test_data, "u", "g", within = "supergroup", weight = "n")[stat == "H", est] 59 | ) 60 | }) 61 | 62 | p_12 <- sum(test_data[test_data$supergroup == 12, "n"]) / sum(test_data$n) 63 | p_34 <- sum(test_data[test_data$supergroup == 34, "n"]) / sum(test_data$n) 64 | test_that("within estimates are correct", { 65 | d_12 <- test_data[test_data$supergroup == 12, ] 66 | d_34 <- test_data[test_data$supergroup == 34, ] 67 | expect_equal( 68 | p_12 * mutual_total(d_12, "u", "g", weight = "n")[stat == "M", est] + 69 | p_34 * mutual_total(d_34, "u", "g", weight = "n")[stat == "M", est], 70 | mutual_total(test_data, "u", "g", within = "supergroup", weight = "n")[stat == "M", est] 71 | ) 72 | # this decomposition does not exist in the same way for H 73 | }) 74 | 75 | test_that("H is correct", { 76 | ret <- mutual_total(test_data, "u", "g", weight = "n") 77 | expect_equal(ret[stat == "H", est] >= 0 & ret[stat == "H", est] <= 1, TRUE) 78 | }) 79 | 80 | test_that("bootstrapping works", { 81 | ret <- mutual_total(test_data, "u", "g", weight = "n", se = TRUE, n_bootstrap = 10) 82 | expect_equal(dim(ret), c(2, 5)) 83 | expect_equal(all(ret$se > 0), TRUE) 84 | 85 | ret <- mutual_total(test_data, "u", "g", 86 | weight = "n", se = TRUE, n_bootstrap = 10, 87 | within = "supergroup" 88 | ) 89 | expect_equal(dim(ret), c(2, 5)) 90 | expect_equal(all(ret$se > 0), TRUE) 91 | }) 92 | 93 | test_that("bootstrap attribute exists", { 94 | ret <- mutual_total(test_data, "u", "g", 
weight = "n", se = TRUE, n_bootstrap = 10) 95 | expect_equal(dim(attr(ret, "bootstrap")), c(2 * 10, 2)) 96 | }) 97 | 98 | test_that("bootstrapping fails when sample size is non-integer", { 99 | test_data <- data.frame( 100 | u = c(rep("a", 4), rep("b", 4)), 101 | g = rep(c(1, 2, 3, 4), 2), 102 | n = c(40, 20, 5, 1.8, 20, 40, 60, 80), 103 | stringsAsFactors = FALSE 104 | ) 105 | 106 | expect_error(mutual_total(test_data, "u", "g", weight = "n", se = TRUE, n_bootstrap = 10)) 107 | # rescale 108 | test_data$n2 <- test_data$n / sum(test_data$n) * round(sum(test_data$n)) 109 | ret <- mutual_total(test_data, "u", "g", weight = "n2", se = TRUE, n_bootstrap = 10) 110 | expect_equal(dim(ret), c(2, 5)) 111 | expect_equal(all(ret$se > 0), TRUE) 112 | }) 113 | 114 | 115 | test_data <- data.frame( 116 | u = c(rep("a", 4), rep("b", 4)), 117 | g = rep(c(1, 2, 3, 4), 2), 118 | n = c(40, 0, 0, 0, 0, 0, 0, 40) 119 | ) 120 | 121 | test_that("zero weights no problem", { 122 | expect_equal(dim(mutual_total(test_data, "u", "g", 123 | weight = "n", 124 | se = TRUE, n_bootstrap = 10 125 | )), c(2, 5)) 126 | expect_equal(dim(mutual_total(test_data, "u", "g", weight = "n")), c(2, 2)) 127 | expect_equal(mutual_total(test_data, "u", "g", weight = "n")[stat == "M", est], log(2)) 128 | expect_equal(mutual_total(test_data, "u", "g", weight = "n")[stat == "H", est], 1) 129 | 130 | test_data2 <- copy(test_data) 131 | test_data2$g <- as.factor(test_data2$g) 132 | expect_equal( 133 | mutual_total(test_data, "u", "g", weight = "n")[["est"]], 134 | mutual_total(test_data2, "u", "g", weight = "n")[["est"]] 135 | ) 136 | }) 137 | 138 | test_that("gives errors", { 139 | expect_error(mutual_total("test_data", "u", "g", weight = "n"), "not a data.frame") 140 | expect_error( 141 | mutual_total(test_data[test_data$u == "c", ], "u", "g", weight = "n"), 142 | "data.frame is empty" 143 | ) 144 | 145 | expect_error(mutual_total(test_data, "u2", "g", weight = "n"), "u2 not in data.frame") 146 | expect_error(mutual_total(test_data, "u2", "g2", weight = "n"), "u2, g2 not in data.frame") 147 | expect_error(mutual_total(test_data, "u2", "g2", weight = "n2"), "u2, g2, n2 not in data.frame") 148 | 149 | test_data_constant1 <- data.frame(u = c(rep("a", 4), rep("b", 4)), g = 1) 150 | expect_error(mutual_total(test_data_constant1, "g", "u"), "group variable is constant") 151 | test_data_constant2 <- data.frame(g = c(rep("a", 4), rep("b", 4)), u = 1) 152 | expect_error(mutual_total(test_data_constant2, "g", "u"), "unit variable is constant") 153 | }) 154 | 155 | test_that("debiasing works correctly", { 156 | nose <- mutual_total(test_data, "u", "g", weight = "n") 157 | withse <- mutual_total(test_data, "u", "g", weight = "n", se = TRUE) 158 | expect_equal(nose$est, withse$est + withse$bias) 159 | }) 160 | -------------------------------------------------------------------------------- /vignettes/plotting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Visualizing and compressing segregation" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Visualizing and compressing segregation} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | 16 | data.table::setDTthreads(1) 17 | 18 | # skip this vignette on CRAN etc. 
19 | BUILD_VIGNETTE <- identical(Sys.getenv("BUILD_VIGNETTE"), "true") 20 | knitr::opts_chunk$set(eval = BUILD_VIGNETTE) 21 | 22 | library("segregation") 23 | ``` 24 | 25 | The package provides the functions `segcurve()` and `segplot()` to visualize segregation. 26 | These functions return simple ggplots, which can then be further styled and themed. 27 | For the `segplot()` function, it is often interesting to also compress the segregation 28 | information that is contained in large datasets. How to do this using 29 | the functions `compress()` and `merge_units()` is also described below, and in more 30 | detail [in this working paper](https://osf.io/preprints/socarxiv/ruw4g/). 31 | 32 | ## Segregation curve 33 | 34 | The segregation curve was first introduced by [Duncan and Duncan (1955)](https://www.jstor.org/stable/2088328). 35 | The function `segcurve()` provides a simple way of plotting one or several segregation curves: 36 | 37 | ```{r} 38 | segcurve(subset(schools00, race %in% c("white", "asian")), 39 | "race", "school", 40 | weight = "n", 41 | segment = "state" # leave this out to produce a single curve 42 | ) 43 | ``` 44 | 45 | In this case, state `A` is the most segregated, while states `B` and `C` are similarly segregated, 46 | but at a lower level. Segregation curves are closely related to the index of dissimilarity, and 47 | here this corresponds to the following index values: 48 | 49 | ```{r} 50 | # converting to data.table makes this easier 51 | data.table::as.data.table(schools00)[ 52 | race %in% c("white", "asian"), 53 | dissimilarity(.SD, "race", "school", weight = "n"), 54 | by = .(state) 55 | ] 56 | ``` 57 | 58 | 59 | ## Segplot 60 | 61 | ::: {.alert .alert-primary} 62 | Please consider citing the following paper if you use segplot: Benjamin Elbers and Rob Gruijters. 2023. "[Segplot: A New Method for Visualizing Patterns of Multi-Group Segregation](https://doi.org/10.1016/j.rssm.2023.100860)." Research in Social Stratification and Mobility. 63 | ::: 64 | 65 | The function `segplot()` is provided to generate segplots. Segplots are described in more 66 | detail [in this working paper](https://osf.io/preprints/socarxiv/ruw4g/). 67 | The function requires the dataset, the group and unit variables, and, where needed, 68 | a weight variable (`n` in this case). 69 | 70 | Several arguments customize the look of the segplot. The argument `order` determines how the units are sorted: 71 | by default, the units of the segplot are ordered by their local segregation score, 72 | but it is also possible to order them by entropy (i.e., diversity) or by share 73 | of the majority population. This last option can be useful for the two-group case. 74 | The argument `bar_space` can be used to increase the space between the units 75 | from the default of zero space between bars. When plotting a subset of the dataset, 76 | the reference distribution shown on the right of the segplot can be changed by 77 | supplying a two-column data frame to the `reference_distribution` argument. 78 | One column of this frame should contain the group identifiers, and 79 | the other should include the reference proportion of each group. 
80 | 81 | Examples of how to use these arguments are given below: 82 | 83 | ```{r} 84 | sch <- subset(schools00, state == "A") 85 | 86 | # basic segplot 87 | segplot(sch, "race", "school", weight = "n") 88 | 89 | # order by majority group (white in this case) 90 | segplot(sch, "race", "school", weight = "n", order = "majority") 91 | 92 | # increase the space between bars 93 | # (has to be very low here because there are many schools in this dataset) 94 | segplot(sch, "race", "school", weight = "n", bar_space = 0.0005) 95 | 96 | # change the reference distribution 97 | # (here, we just use an equalized distribution across the five groups) 98 | (ref <- data.frame(race = unique(schools00$race), p = rep(0.2, 5))) 99 | segplot(sch, "race", "school", 100 | weight = "n", 101 | reference_distribution = ref 102 | ) 103 | ``` 104 | 105 | It is also possible to add a secondary plot that shows the adjusted local segregation scores: 106 | 107 | ```{r} 108 | segplot(sch, "race", "school", weight = "n", secondary_plot = "segregation") 109 | ``` 110 | 111 | ## Compressing segregation information 112 | 113 | The compression workflow involves three steps. 114 | First, it is important to decide which units should be permitted to merge: 115 | for residential segregation, we may only want to allow neighboring units 116 | (such as tracts) to be mergeable. In this case, the first step consists 117 | of compiling a data frame with exactly two columns, where each row identifies 118 | a pair of neighboring units. In other cases, we may want to allow all units 119 | to be mergeable, in principle. However, this can be very time-consuming as 120 | it requires each unit to be compared to all others at every step of the merging 121 | operation. To speed up compression, we therefore implement an option that 122 | allows units to be merged only within a window of "neighboring" units, 123 | where the definition of each window is based on similarities in local segregation. 124 | Hence, for a given unit, only the `n_neighbors` most similar units are considered 125 | for merging at every step. 126 | Smaller `n_neighbors` values will result in faster run times, but increase 127 | the probability of non-optimal merges. The method of merging can be 128 | specified in the `compress()` function by supplying the argument `neighbors`. 129 | 130 | The second step is then to run the actual compression algorithm using `compress()`. 131 | For this example, we choose to compress based on a relatively small window: 132 | 133 | ```{r, results='hide'} 134 | # compression based on window of 20 'neighboring' units 135 | # in terms of local segregation (alternatively, neighbors can be a data frame) 136 | comp <- compress(sch, "race", "school", 137 | weight = "n", neighbors = "local", n_neighbors = 20 138 | ) 139 | ``` 140 | 141 | After running `compress()`—which can take some time depending on how 142 | many neighbors need to be considered—the output summarizes the compression 143 | that can be achieved: 144 | 145 | ```{r} 146 | comp 147 | ``` 148 | 149 | The results indicate that 99% of the segregation information can be retained with 150 | only 98 units (out of 560 in the original dataset), 95% with only 24 units, 151 | and 90% with 10 units. The percentage of information retained at each iteration 152 | is recorded in the data frame `comp$iterations`. 
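For a quick look, the first few iterations can be printed (a minimal illustration; besides the merged unit pairs, `comp$iterations` also records, among other things, the M value and the number of remaining units after each merge):

```{r}
head(comp$iterations)
```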
153 | This data frame can also be used to generate a plot that shows the relationship 154 | between the number of merges and the loss in segregation information: 155 | 156 | ```{r} 157 | scree_plot(comp) 158 | ``` 159 | 160 | Another way to learn more about the compression is to visualize the information as a dendrogram: 161 | 162 | ```{r} 163 | dend <- as.dendrogram(comp) 164 | plot(dend, leaflab = "none") 165 | ``` 166 | 167 | The third step is to create a new dataset based on the desired level of compression. 168 | This can be achieved using the function `merge_units()`, and either `n_units` or `percent` 169 | can be specified to indicate the desired level of compression. 170 | 171 | ```{r} 172 | sch_compressed <- merge_units(comp, n_units = 15) 173 | # or, for instance: merge_units(comp, percent = 0.80) 174 | head(sch_compressed) 175 | ``` 176 | 177 | The compressed dataset has the same format as the original dataset 178 | and can now be used to produce another segplot, e.g. 179 | 180 | ```{r} 181 | segplot(sch_compressed, "race", "school", weight = "n", secondary_plot = "segregation") 182 | ``` 183 | -------------------------------------------------------------------------------- /man/mutual_difference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mutual_difference.R 3 | \name{mutual_difference} 4 | \alias{mutual_difference} 5 | \title{Decomposes the difference between two M indices} 6 | \usage{ 7 | mutual_difference( 8 | data1, 9 | data2, 10 | group, 11 | unit, 12 | weight = NULL, 13 | method = "shapley", 14 | se = FALSE, 15 | CI = 0.95, 16 | n_bootstrap = 100, 17 | base = exp(1), 18 | ... 19 | ) 20 | } 21 | \arguments{ 22 | \item{data1}{A data frame with same structure as \code{data2}.} 23 | 24 | \item{data2}{A data frame with same structure as \code{data1}.} 25 | 26 | \item{group}{A categorical variable or a vector of variables 27 | contained in \code{data}. Defines the first dimension 28 | over which segregation is computed.} 29 | 30 | \item{unit}{A categorical variable or a vector of variables 31 | contained in \code{data}. Defines the second dimension 32 | over which segregation is computed.} 33 | 34 | \item{weight}{Numeric. (Default \code{NULL})} 35 | 36 | \item{method}{Either "shapley" (the default), "km" (Karmel and Maclachlan method), or 37 | "mrc" (Mora and Ruiz-Castillo method).} 38 | 39 | \item{se}{If \code{TRUE}, the segregation estimates are bootstrapped to provide 40 | standard errors and to apply bias correction. The bias that is reported 41 | has already been applied to the estimates (i.e. the reported estimates are "debiased") 42 | (Default \code{FALSE})} 43 | 44 | \item{CI}{If \code{se = TRUE}, compute the confidence (CI*100)% confidence interval 45 | in addition to the bootstrap standard error. 46 | This is based on percentiles of the bootstrap distribution, and a valid interpretation 47 | relies on a larger number of bootstrap iterations. (Default \code{0.95})} 48 | 49 | \item{n_bootstrap}{Number of bootstrap iterations. (Default \code{100})} 50 | 51 | \item{base}{Base of the logarithm that is used in the calculation. 52 | Defaults to the natural logarithm.} 53 | 54 | \item{...}{Only used for additional arguments when 55 | when \code{method} is set to \code{shapley} or \code{km}. See \link{ipf} for details.} 56 | } 57 | \value{ 58 | Returns a data.table with columns \code{stat} and \code{est}. 
The data frame contains 59 | the following rows defined by \code{stat}: 60 | \code{M1} contains the M for \code{data1}. 61 | \code{M2} contains the M for \code{data2}. 62 | \code{diff} is the difference between \code{M2} and \code{M1}. 63 | The sum of the five rows following \code{diff} equals \code{diff}. 64 | 65 | \code{additions} contains the change in M induced by \code{unit} and \code{group} categories 66 | present in \code{data2} but not \code{data1}, and \code{removals} the reverse. 67 | 68 | All methods return the following three terms: 69 | \code{unit_marginal} is the contribution of unit composition differences. 70 | \code{group_marginal} is the contribution of group composition differences. 71 | \code{structural} is the contribution unexplained by the marginal changes, i.e. the structural 72 | difference. Note that the interpretation of these terms depends on the exact method used. 73 | 74 | When using "km", one additional row is returned: 75 | \code{interaction} is the contribution of differences in the joint marginal distribution 76 | of \code{unit} and \code{group}. 77 | 78 | When "shapley_detailed" is used, an additional column "unit" is returned, along with 79 | six additional rows for each unit that is present in both \code{data1} and \code{data2}. 80 | The six rows have the following meaning: 81 | \code{p1} (\code{p2}) is the proportion of the unit in \code{data1} (\code{data2}) 82 | once non-intersecting units/groups have been removed. The changes in local linkage are 83 | given by \code{ls_diff1} and \code{ls_diff2}, and their average is given by 84 | \code{ls_diff_mean}. The row named \code{total} 85 | summarizes the contribution of 86 | the unit towards structural change 87 | using the formula \code{.5 * p1 * ls_diff1 + .5 * p2 * ls_diff2}. 88 | The sum of all "total" components equals structural change. 89 | 90 | If \code{se} is set to \code{TRUE}, an additional column \code{se} contains 91 | the associated bootstrapped standard errors, an additional column \code{CI} contains 92 | the estimated confidence interval as a list column, an additional column \code{bias} contains 93 | the estimated bias, and the column \code{est} contains the bias-corrected estimates. 94 | } 95 | \description{ 96 | Uses one of three methods to decompose the difference between two M indices: 97 | (1) "shapley" / "shapley_detailed": a method based on the Shapley decomposition 98 | with a few advantages over the Karmel-Maclachlan method 99 | (recommended and the default, Deutsch et al. 2009), 100 | (2) "km": the method based on Karmel-Maclachlan (1988), 101 | (3) "mrc": the method developed by Mora and Ruiz-Castillo (2009). 102 | All methods have been extended to account for missing units/groups in either data input. 103 | } 104 | \details{ 105 | The Shapley method is an improvement over the Karmel-Maclachlan method (Deutsch et al. 2009). 106 | It is based on several margins-adjusted data inputs 107 | and yields symmetrical results (i.e. \code{data1} and \code{data2} can be switched). 108 | When "shapley_detailed" is used, the structural component is further decomposed into 109 | the contributions of individual units. 110 | 111 | The Karmel-Maclachlan method (Karmel and Maclachlan 1988) adjusts 112 | the margins of \code{data1} to be similar to the margins of \code{data2}. This process 113 | is not symmetrical. 114 | 115 | The Shapley and Karmel-Maclachlan methods are based on iterative proportional fitting (IPF), 116 | first introduced by Deming and Stephan (1940). 
117 | Depending on the size of the dataset, this may take a few seconds (see \link{ipf} for details). 118 | 119 | The method developed by Mora and Ruiz-Castillo (2009) uses an algebraic approach to estimate the 120 | size of the components. This will often yield substantively different results from the Shapley 121 | and Karmel-Maclachlan methods. Note that this method is not symmetric in terms of what is 122 | defined as \code{group} and \code{unit} categories, which may yield contradictory results. 123 | 124 | A problem arises when there are \code{group} and/or \code{unit} categories in \code{data1} 125 | that are not present in \code{data2} (or vice versa). 126 | All methods estimate the difference only 127 | for categories that are present in both datasets, and additionally report 128 | the change in M that is induced by these cases as 129 | \code{additions} (present in \code{data2}, but not in \code{data1}) and 130 | \code{removals} (present in \code{data1}, but not in \code{data2}). 131 | } 132 | \examples{ 133 | \dontrun{ 134 | # decompose the difference in school segregation between 2000 and 2005, 135 | # using the Shapley method 136 | mutual_difference(schools00, schools05, 137 | group = "race", unit = "school", 138 | weight = "n", method = "shapley", precision = .1 139 | ) 140 | # => the structural component is close to zero, thus most change is in the marginals. 141 | # This method gives identical results when we switch the unit and group definitions, 142 | # and when we switch the data inputs. 143 | 144 | # the Karmel-Maclachlan method is similar, but only adjusts the data in the forward direction... 145 | mutual_difference(schools00, schools05, 146 | group = "school", unit = "race", 147 | weight = "n", method = "km", precision = .1 148 | ) 149 | 150 | # ...this means that the results won't be identical when we switch the data inputs 151 | mutual_difference(schools05, schools00, 152 | group = "school", unit = "race", 153 | weight = "n", method = "km", precision = .1 154 | ) 155 | 156 | # the MRC method indicates a much higher structural change... 157 | mutual_difference(schools00, schools05, 158 | group = "race", unit = "school", 159 | weight = "n", method = "mrc" 160 | ) 161 | 162 | # ...and is not symmetric 163 | mutual_difference(schools00, schools05, 164 | group = "school", unit = "race", 165 | weight = "n", method = "mrc" 166 | ) 167 | } 168 | } 169 | \references{ 170 | W. E. Deming, F. F. Stephan. 1940. "On a Least Squares Adjustment of a Sampled Frequency Table 171 | When the Expected Marginal Totals are Known." 172 | The Annals of Mathematical Statistics 11(4): 427-444. 173 | 174 | T. Karmel and M. Maclachlan. 1988. 175 | "Occupational Sex Segregation — Increasing or Decreasing?" Economic Record 64: 187-195. 176 | 177 | R. Mora and J. Ruiz-Castillo. 2009. "The Invariance Properties of the 178 | Mutual Information Index of Multigroup Segregation." Research on Economic Inequality 17: 33-53. 179 | 180 | J. Deutsch, Y. Flückiger, and J. Silber. 2009. 181 | "Analyzing Changes in Occupational Segregation: The Case of Switzerland (1970–2000)." 182 | Research on Economic Inequality 17: 171–202. 
183 | } 184 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: gfm 5 | editor_options: 6 | markdown: 7 | wrap: 72 8 | --- 9 | 10 | 11 | 12 | ```{r, echo = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>", 16 | fig.path = "man/figures/README-" 17 | ) 18 | options(scipen = 999) 19 | options(digits = 3) 20 | set.seed(69839) 21 | ``` 22 | 23 | # segregation 24 | 25 | [![CRAN 26 | Version](https://www.r-pkg.org/badges/version/segregation)](https://CRAN.R-project.org/package=segregation) 27 | [![R build 28 | status](https://github.com/elbersb/segregation/workflows/R-CMD-check/badge.svg)](https://github.com/elbersb/segregation/actions) 29 | [![Coverage 30 | status](https://codecov.io/gh/elbersb/segregation/branch/master/graph/badge.svg)](https://app.codecov.io/github/elbersb/segregation?branch=master) 31 | 32 | An R package to calculate, visualize, and decompose various segregation indices. 33 | The package currently supports 34 | 35 | - the Mutual Information Index (M), 36 | - Theil's Information Index (H), 37 | - the index of Dissimilarity (D), 38 | - the isolation and exposure indices. 39 | 40 | Find more information in `vignette("segregation")` 41 | and the [documentation](https://elbersb.de/segregation). 42 | 43 | The package also supports 44 | 45 | - [standard error and confidence interval estimation via bootstrapping](https://elbersb.com/public/posts/2021-11-24-segregation-bias/), 46 | which also corrects for small sample bias 47 | - decomposition of the M and H indices (within/between, local segregation) 48 | - decomposing differences in total segregation over time (Elbers 2020) 49 | - [segregation visualizations](https://elbersb.github.io/segregation/articles/plotting.html) (segregation curves and 'segplots') 50 | 51 | Most methods return [tidy](https://vita.had.co.nz/papers/tidy-data.html) 52 | [data.tables](https://rdatatable.gitlab.io/data.table/) for easy 53 | post-processing and plotting. For speed, the package uses the [`data.table`](https://rdatatable.gitlab.io/data.table/) 54 | package internally, and implements some functions in C++. 55 | 56 | Most of the procedures implemented in this package are described in more 57 | detail [in this SMR 58 | paper](https://journals.sagepub.com/doi/full/10.1177/0049124121986204) 59 | ([Preprint](https://osf.io/preprints/socarxiv/ya7zs/)) and [in this 60 | working paper](https://osf.io/preprints/socarxiv/ruw4g/). 61 | 62 | ## Usage 63 | 64 | The package provides an easy way to calculate segregation measures, 65 | based on the Mutual Information Index (M) and Theil's Entropy Index (H). 66 | 67 | ```{r} 68 | library(segregation) 69 | 70 | # example dataset with fake data provided by the package 71 | mutual_total(schools00, "race", "school", weight = "n") 72 | ``` 73 | 74 | Standard errors in all functions can be estimated via bootstrapping. 
This 75 | will also apply bias-correction to the estimates: 76 | 77 | ```{r} 78 | mutual_total(schools00, "race", "school", 79 | weight = "n", 80 | se = TRUE, CI = 0.90, n_bootstrap = 500 81 | ) 82 | ``` 83 | 84 | Decompose segregation into a between-state and a within-state term (the 85 | sum of these equals total segregation): 86 | 87 | ```{r} 88 | # between states 89 | mutual_total(schools00, "race", "state", weight = "n") 90 | 91 | # within states 92 | mutual_total(schools00, "race", "school", within = "state", weight = "n") 93 | ``` 94 | 95 | Local segregation (`ls`) is a decomposition by units or groups (here 96 | racial groups). This function also supports standard error and CI 97 | estimation. The sum of the proportion-weighted local segregation scores 98 | equals M: 99 | 100 | ```{r} 101 | local <- mutual_local(schools00, 102 | group = "school", unit = "race", weight = "n", 103 | se = TRUE, CI = 0.90, n_bootstrap = 500, wide = TRUE 104 | ) 105 | local[, c("race", "ls", "p", "ls_CI")] 106 | sum(local$p * local$ls) 107 | ``` 108 | 109 | Decompose the difference in M between 2000 and 2005, using iterative 110 | proportional fitting (IPF) and the Shapley decomposition (see Elbers 111 | 2021 for details): 112 | 113 | ```{r} 114 | mutual_difference(schools00, schools05, 115 | group = "race", unit = "school", 116 | weight = "n", method = "shapley" 117 | ) 118 | ``` 119 | 120 | Show a segplot: 121 | 122 | ```{r segplot} 123 | segplot(schools00, group = "race", unit = "school", weight = "n") 124 | ``` 125 | 126 | Find more information in the 127 | [documentation](https://elbersb.github.io/segregation/). 128 | 129 | ## How to install 130 | 131 | To install the package from CRAN, use 132 | 133 | ```{r eval=FALSE} 134 | install.packages("segregation") 135 | ``` 136 | 137 | To install the development version, use 138 | 139 | ```{r eval=FALSE} 140 | devtools::install_github("elbersb/segregation") 141 | ``` 142 | 143 | ## Citation 144 | 145 | If you use this package for your research, please cite one of the following papers: 146 | 147 | - Elbers, Benjamin (2021). A Method for Studying Differences in Segregation 148 | Across Time and Space. Sociological Methods & Research. 149 | 150 | 151 | - Elbers, Benjamin and Rob Gruijters (2023). Segplot: A New Method for Visualizing Patterns of Multi-Group Segregation. 152 | 153 | 154 | ## Some additional resources 155 | 156 | - The book *Analyzing US Census Data: Methods, Maps, and Models in R* 157 | by Kyle E. Walker contains [a discussion of this 158 | package](https://walker-data.com/census-r/modeling-us-census-data.html#indices-of-segregation-and-diversity), 159 | and is a great resource for anyone working with spatial data, 160 | especially U.S. Census data. 161 | - A paper that makes use of this package: [Did Residential Racial 162 | Segregation in the U.S. Really Increase? An Analysis Accounting for 163 | Changes in Racial 164 | Diversity](https://elbersb.com/public/posts/2021-07-23-segregation-increase/) 165 | ([Code and Data](https://osf.io/mg9q4/)) 166 | - Some of the analyses [in this 167 | article](https://multimedia.tijd.be/diversiteit/) by the Belgian 168 | newspaper *De Tijd* used the package. 169 | - The analyses of [this article in the Wall Street 170 | Journal](https://www.wsj.com/articles/chicago-vs-dallas-why-the-north-lags-behind-the-south-and-west-in-racial-integration-11657936680) 171 | were produced using this package. 172 | 173 | ## References on entropy-based segregation indices 174 | 175 | Deutsch, J., Flückiger, Y. & Silber, J. 
(2009). Analyzing Changes in 176 | Occupational Segregation: The Case of Switzerland (1970--2000), in: Yves 177 | Flückiger, Sean F. Reardon, Jacques Silber (eds.) Occupational and 178 | Residential Segregation (Research on Economic Inequality, Volume 17), 179 | 171--202. 180 | 181 | DiPrete, T. A., Eller, C. C., Bol, T., & van de Werfhorst, H. G. (2017). 182 | School-to-Work Linkages in the United States, Germany, and France. 183 | American Journal of Sociology, 122(6), 1869-1938. 184 | 185 | 186 | Elbers, B. (2021). A Method for Studying Differences in Segregation 187 | Across Time and Space. Sociological Methods & Research. 188 | 189 | 190 | Forster, A. G., & Bol, T. (2017). Vocational education and employment 191 | over the life course using a new measure of occupational specificity. 192 | Social Science Research, 70, 176-197. 193 | 194 | 195 | Theil, H. (1971). Principles of Econometrics. New York: Wiley. 196 | 197 | Frankel, D. M., & Volij, O. (2011). Measuring school segregation. 198 | Journal of Economic Theory, 146(1), 1-38. 199 | 200 | 201 | Mora, R., & Ruiz-Castillo, J. (2003). Additively decomposable 202 | segregation indexes. The case of gender segregation by occupations and 203 | human capital levels in Spain. The Journal of Economic Inequality, 1(2), 204 | 147-179. 205 | 206 | Mora, R., & Ruiz-Castillo, J. (2009). The Invariance Properties of the 207 | Mutual Information Index of Multigroup Segregation, in: Yves Flückiger, 208 | Sean F. Reardon, Jacques Silber (eds.) Occupational and Residential 209 | Segregation (Research on Economic Inequality, Volume 17), 33-53. 210 | 211 | Mora, R., & Ruiz-Castillo, J. (2011). Entropy-based Segregation Indices. 212 | Sociological Methodology, 41(1), 159--194. 213 | 214 | 215 | Van Puyenbroeck, T., De Bruyne, K., & Sels, L. (2012). More than 'Mutual 216 | Information': Educational and sectoral gender segregation and their 217 | interaction on the Flemish labor market. Labour Economics, 19(1), 1-8. 218 | 219 | 220 | Watts, M. The Use and Abuse of Entropy Based Segregation Indices. 221 | Working Paper. URL: 222 | 223 | -------------------------------------------------------------------------------- /R/segregation.R: -------------------------------------------------------------------------------- 1 | #' segregation: Entropy-based segregation indices 2 | #' 3 | #' Calculate and decompose entropy-based, multigroup segregation indices, with a focus 4 | #' on the Mutual Information Index (M) and Theil's Information Index (H). 5 | #' Provides tools to decompose the measures by groups and units, and by within 6 | #' and between terms. Includes standard error estimation by bootstrapping. 
7 | #' 8 | #' @seealso \url{https://elbersb.com/segregation} 9 | #' 10 | #' @docType package 11 | #' @name segregation 12 | #' @keywords internal 13 | "_PACKAGE" 14 | 15 | #' @importFrom Rcpp sourceCpp 16 | #' @import RcppProgress 17 | #' @useDynLib segregation, .registration = TRUE 18 | NULL 19 | 20 | globalVariables(c( 21 | "V1", "V2", "cond1", "cond2", "entropy_cond", "entropy_cond1", "entropy_cond2", "entropyw", 22 | "est", "freq", "freq1", "freq2", "freq_orig1", "freq_orig2", 23 | "ls_diff_mean", "ls_diff1", "ls_diff2", "ls_unit", "bias", "boot_est", "est_debiased", 24 | "n", "n_group", "n_group_target", "n_source", "n_target", "n_unit", "n_unit_target", 25 | "n_within_group", "p", "p_exp", "p1", "p2", "p_group", "p_group_g_unit", "p_group_g_unit1", 26 | "p_group_g_unit2", "p_group_s", "p_group_t", "p_unit", "p_unit1", "p_unit2", "p_unit_s", 27 | "p_unit_t", "p_within", "sumcond1", "sumcond2", "total", "unit1", "unit2", 28 | ".", "..base", "..fixed_margins", "..group", "..n_bootstrap", "..unit", "se", "stat", 29 | "M", "N_units", "i.freq1", "i.freq2", "iter", "ls1", "ls2", "p_unit_g_group1", 30 | "p_unit_g_group2", "pair", "pct_M", 31 | "x", "xend", "y", "yend", "xmax", "xmin", "ymax", "ymin", "..cols", "p_overall", 32 | "freq_of", "freq_to", 33 | "cumul_prob_1", "cumul_prob_2", "group1", "group2", "pct_group_1", 34 | ".data", "new_unit", "old_unit" 35 | )) 36 | 37 | # log 38 | 39 | log_env <- new.env() 40 | assign("n_printed", 0, envir = log_env) 41 | 42 | update_log <- function(bs_n = NULL, bs_max = NULL, ipf_n = NULL, ipf_max = NULL) { 43 | if (!is.null(bs_n)) assign("bs_n", bs_n, envir = log_env) 44 | if (!is.null(bs_max)) assign("bs_max", bs_max, envir = log_env) 45 | if (!is.null(ipf_n)) assign("ipf_n", ipf_n, envir = log_env) 46 | if (!is.null(ipf_max)) assign("ipf_max", ipf_max, envir = log_env) 47 | 48 | if (!is.null(get("bs_n", envir = log_env)) && !is.null(get("ipf_n", envir = log_env))) { 49 | text <- paste0("[", "Bootstrap ", get("bs_n", envir = log_env), "/", 50 | get("bs_max", envir = log_env), 51 | " IPF ", get("ipf_n", envir = log_env), "/", 52 | get("ipf_max", envir = log_env), "] ", 53 | collapse = "" 54 | ) 55 | } else if (!is.null(get("bs_n", envir = log_env))) { 56 | text <- paste0("[", "Bootstrap ", get("bs_n", envir = log_env), 57 | "/", get("bs_max", envir = log_env), "] ", 58 | collapse = "" 59 | ) 60 | } else if (!is.null(get("ipf_n", envir = log_env))) { 61 | text <- paste0("[", "IPF ", get("ipf_n", envir = log_env), 62 | "/", get("ipf_max", envir = log_env), "] ", 63 | collapse = "" 64 | ) 65 | } 66 | 67 | clear_log() 68 | assign("n_printed", nchar(text), envir = log_env) 69 | cat(text, file = stderr()) 70 | } 71 | 72 | update_log_progress <- function(text) { 73 | assign("n_printed", get("n_printed", envir = log_env) + nchar(text), envir = log_env) 74 | cat(text, file = stderr()) 75 | } 76 | 77 | clear_log <- function() { 78 | utils::flush.console() 79 | 80 | if (get("n_printed", envir = log_env) > 0) { 81 | cat("\r", paste0(rep(" ", times = get("n_printed", envir = log_env)), collapse = ""), 82 | "\r", 83 | file = stderr() 84 | ) 85 | 86 | assign("n_printed", 0, envir = log_env) 87 | } 88 | } 89 | 90 | close_log <- function() { 91 | clear_log() 92 | assign("bs_n", NULL, envir = log_env) 93 | assign("bs_max", NULL, envir = log_env) 94 | assign("ipf_n", NULL, envir = log_env) 95 | assign("ipf_max", NULL, envir = log_env) 96 | } 97 | close_log() 98 | 99 | # helpers 100 | 101 | logf <- function(v, base = exp(1)) { 102 | logged <- log(v, base = base) 103 | 
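    # by convention, 0 * log(0) counts as 0 in the entropy formulas, so the
    # non-finite values that log() produces for zero inputs are zeroed out next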
logged[!is.finite(logged)] <- 0 104 | logged 105 | } 106 | 107 | #' @import data.table 108 | prepare_data <- function(data, group, unit, weight, within = NULL) { 109 | if ("data.frame" %in% class(data)) { 110 | if (nrow(data) == 0) { 111 | stop("data.frame is empty") 112 | } 113 | test_vars <- c(group, unit, weight, within) 114 | test_vars <- test_vars[!test_vars %in% names(data)] 115 | if (length(test_vars) > 0) { 116 | test_vars <- paste0(test_vars, collapse = ", ") 117 | stop(paste0("variable(s) ", test_vars, " not in data.frame")) 118 | } 119 | } else { 120 | stop("not a data.frame") 121 | } 122 | vars <- c(group, unit) 123 | 124 | # create a copy 125 | data <- as.data.table(data) 126 | 127 | # check whether there is variation 128 | n_groups <- nrow(data[, .N, by = group]) 129 | n_units <- nrow(data[, .N, by = unit]) 130 | if (n_groups == 1) stop("Cannot compute segregation: the group variable is constant") 131 | if (n_units == 1) stop("Cannot compute segregation: the unit variable is constant") 132 | 133 | # use provided weight or weight of 1 134 | weight_no_conflict <- weight 135 | if (!is.null(weight_no_conflict)) { 136 | if (weight_no_conflict == "weight") { 137 | data[, freq := as.double(weight)] 138 | } else { 139 | data[, freq := as.double(get(weight_no_conflict))] 140 | } 141 | } else { 142 | data[, freq := 1] 143 | } 144 | 145 | if (!is.null(within)) { 146 | vars <- c(vars, within) 147 | } 148 | 149 | # drop unused factor levels - these can lead to problems downstream 150 | for (var in vars) { 151 | if (is.factor(data[[var]])) { 152 | data[[var]] <- droplevels(data[[var]]) 153 | } 154 | } 155 | 156 | # collapse on vars, and select only positive weights 157 | data <- data[freq > 0, list(freq = sum(freq)), by = vars] 158 | setattr(data, "vars", vars) 159 | setkey(data, NULL) 160 | data 161 | } 162 | 163 | 164 | #' @import data.table 165 | add_local <- function(data, group_var, unit_var, base, weight = "freq") { 166 | n_total <- sum(data[, get(weight)]) 167 | # generate unit and group totals 168 | data[, n_unit := sum(get(weight)), by = unit_var] 169 | data[, n_group := sum(get(weight)), by = group_var] 170 | # generate unit and group proportions and the 171 | # conditional probability of being in any group given the unit 172 | data[, `:=`( 173 | p_unit = n_unit / n_total, 174 | p_group = n_group / n_total, 175 | p_group_g_unit = get(weight) / n_unit 176 | )] 177 | # calculate local linkage, i.e. log(cond.) * log(cond./marginal) 178 | data[, ls_unit := sum(p_group_g_unit * logf(p_group_g_unit / p_group, base)), 179 | by = unit_var 180 | ] 181 | } 182 | 183 | #' @import data.table 184 | bootstrap_summary <- function(ret, boot_ret, cols, CI) { 185 | setnames(boot_ret, "est", "boot_est") 186 | ret <- merge(ret, boot_ret, by = cols, sort = FALSE) 187 | # create a "debiased" version of bootstrap estimates 188 | ret[, est_debiased := 2 * est - boot_est, by = cols] 189 | pct <- c((1 - CI) / 2, 1 - (1 - CI) / 2) 190 | # estimate the "debiased" mean, standard error, CI, and quantify bias 191 | ret <- ret[, list( 192 | est = mean(est_debiased), 193 | se = stats::sd(est_debiased), 194 | CI = list(stats::quantile(est_debiased, pct)), 195 | bias = first(est) - mean(est_debiased) 196 | ), by = cols] 197 | ret[] 198 | } 199 | 200 | #' Turns a contingency table into long format 201 | #' 202 | #' Returns a data.table in long form, such that it is suitable 203 | #' for use in \link{mutual_total}, etc. Colnames and rownames of 204 | #' the matrix will be respected. 
205 | #' 206 | #' @param matrix A matrix, where the rows represent the units, and the 207 | #' columns represent the groups. 208 | #' @param group Variable name for group. (Default \code{group}) 209 | #' @param unit Variable name for unit. (Default \code{unit}) 210 | #' @param weight Variable name for frequency weight. (Default \code{weight}) 211 | #' @param drop_zero Drop unit-group combinations with zero weight. (Default \code{TRUE}) 212 | #' @return A data.table. 213 | #' @examples 214 | #' m <- matrix(c(10, 20, 30, 30, 20, 10), nrow = 3) 215 | #' colnames(m) <- c("Black", "White") 216 | #' long <- matrix_to_long(m, group = "race", unit = "school") 217 | #' mutual_total(long, "race", "school", weight = "n") 218 | #' @import data.table 219 | #' @export 220 | matrix_to_long <- function(matrix, group = "group", unit = "unit", 221 | weight = "n", drop_zero = TRUE) { 222 | if (!is.matrix(matrix)) stop("matrix needs to be a matrix object") 223 | if (is.null(rownames(matrix))) rownames(matrix) <- seq_len(nrow(matrix)) 224 | if (is.null(colnames(matrix))) colnames(matrix) <- seq_len(ncol(matrix)) 225 | d <- as.data.table(matrix, keep.rownames = unit) 226 | long <- melt(d, 227 | id.vars = unit, 228 | variable.name = group, 229 | variable.factor = FALSE, 230 | value.name = weight 231 | ) 232 | if (drop_zero == TRUE) { 233 | ind <- long[[weight]] > 0 234 | long[ind] 235 | } else { 236 | long 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # segregation 4 | 5 | [![CRAN 6 | Version](https://www.r-pkg.org/badges/version/segregation)](https://CRAN.R-project.org/package=segregation) 7 | [![R build 8 | status](https://github.com/elbersb/segregation/workflows/R-CMD-check/badge.svg)](https://github.com/elbersb/segregation/actions) 9 | [![Coverage 10 | status](https://codecov.io/gh/elbersb/segregation/branch/master/graph/badge.svg)](https://app.codecov.io/github/elbersb/segregation?branch=master) 11 | 12 | An R package to calculate, visualize, and decompose various segregation 13 | indices. The package currently supports 14 | 15 | - the Mutual Information Index (M), 16 | - Theil’s Information Index (H), 17 | - the index of Dissimilarity (D), 18 | - the isolation and exposure indices. 19 | 20 | Find more information in `vignette("segregation")` and the 21 | [documentation](https://elbersb.de/segregation). 22 | 23 | The package also supports 24 | 25 | - [standard error and confidence interval estimation via 26 | bootstrapping](https://elbersb.com/public/posts/2021-11-24-segregation-bias/), 27 | which also corrects for small sample bias 28 | - decomposition of the M and H indices (within/between, local 29 | segregation) 30 | - decomposing differences in total segregation over time (Elbers 2020) 31 | - [segregation 32 | visualizations](https://elbersb.github.io/segregation/articles/plotting.html) 33 | (segregation curves and ‘segplots’) 34 | 35 | Most methods return [tidy](https://vita.had.co.nz/papers/tidy-data.html) 36 | [data.tables](https://rdatatable.gitlab.io/data.table/) for easy 37 | post-processing and plotting. For speed, the package uses the 38 | [`data.table`](https://rdatatable.gitlab.io/data.table/) package 39 | internally, and implements some functions in C++. 
40 | 41 | Most of the procedures implemented in this package are described in more 42 | detail [in this SMR 43 | paper](https://journals.sagepub.com/doi/full/10.1177/0049124121986204) 44 | ([Preprint](https://osf.io/preprints/socarxiv/ya7zs/)) and [in this 45 | working paper](https://osf.io/preprints/socarxiv/ruw4g/). 46 | 47 | ## Usage 48 | 49 | The package provides an easy way to calculate segregation measures, 50 | based on the Mutual Information Index (M) and Theil’s Entropy Index (H). 51 | 52 | ``` r 53 | library(segregation) 54 | 55 | # example dataset with fake data provided by the package 56 | mutual_total(schools00, "race", "school", weight = "n") 57 | #> stat est 58 | #> 59 | #> 1: M 0.426 60 | #> 2: H 0.419 61 | ``` 62 | 63 | Standard errors in all functions can be estimated via bootstrapping. This 64 | will also apply bias-correction to the estimates: 65 | 66 | ``` r 67 | mutual_total(schools00, "race", "school", 68 | weight = "n", 69 | se = TRUE, CI = 0.90, n_bootstrap = 500 70 | ) 71 | #> 500 bootstrap iterations on 877739 observations 72 | #> stat est se CI bias 73 | #> 74 | #> 1: M 0.422 0.000775 0.421,0.423 0.00361 75 | #> 2: H 0.415 0.000712 0.414,0.416 0.00356 76 | ``` 77 | 78 | Decompose segregation into a between-state and a within-state term (the 79 | sum of these equals total segregation): 80 | 81 | ``` r 82 | # between states 83 | mutual_total(schools00, "race", "state", weight = "n") 84 | #> stat est 85 | #> 86 | #> 1: M 0.0992 87 | #> 2: H 0.0977 88 | 89 | # within states 90 | mutual_total(schools00, "race", "school", within = "state", weight = "n") 91 | #> stat est 92 | #> 93 | #> 1: M 0.326 94 | #> 2: H 0.321 95 | ``` 96 | 97 | Local segregation (`ls`) is a decomposition by units or groups (here 98 | racial groups). This function also supports standard error and CI 99 | estimation. The sum of the proportion-weighted local segregation scores 100 | equals M: 101 | 102 | ``` r 103 | local <- mutual_local(schools00, 104 | group = "school", unit = "race", weight = "n", 105 | se = TRUE, CI = 0.90, n_bootstrap = 500, wide = TRUE 106 | ) 107 | #> 500 bootstrap iterations on 877739 observations 108 | local[, c("race", "ls", "p", "ls_CI")] 109 | #> race ls p ls_CI 110 | #> 111 | #> 1: asian 0.591 0.02255 0.582,0.601 112 | #> 2: black 0.876 0.19017 0.873,0.879 113 | #> 3: hisp 0.771 0.15167 0.767,0.775 114 | #> 4: white 0.183 0.62810 0.182,0.184 115 | #> 5: native 1.352 0.00751 1.32,1.38 116 | sum(local$p * local$ls) 117 | #> [1] 0.422 118 | ``` 119 | 120 | Decompose the difference in M between 2000 and 2005, using iterative 121 | proportional fitting (IPF) and the Shapley decomposition (see Elbers 122 | 2021 for details): 123 | 124 | ``` r 125 | mutual_difference(schools00, schools05, 126 | group = "race", unit = "school", 127 | weight = "n", method = "shapley" 128 | ) 129 | #> stat est 130 | #> 131 | #> 1: M1 0.42554 132 | #> 2: M2 0.41339 133 | #> 3: diff -0.01215 134 | #> 4: additions -0.00341 135 | #> 5: removals -0.01141 136 | #> 6: group_marginal 0.01787 137 | #> 7: unit_marginal -0.01171 138 | #> 8: structural -0.00349 139 | ``` 140 | 141 | Show a segplot: 142 | 143 | ``` r 144 | segplot(schools00, group = "race", unit = "school", weight = "n") 145 | ``` 146 | 147 | ![](man/figures/README-segplot-1.png) 148 | 149 | Find more information in the 150 | [documentation](https://elbersb.github.io/segregation/). 
151 | 152 | ## How to install 153 | 154 | To install the package from CRAN, use 155 | 156 | ``` r 157 | install.packages("segregation") 158 | ``` 159 | 160 | To install the development version, use 161 | 162 | ``` r 163 | devtools::install_github("elbersb/segregation") 164 | ``` 165 | 166 | ## Citation 167 | 168 | If you use this package for your research, please cite one of the 169 | following papers: 170 | 171 | - Elbers, Benjamin (2021). A Method for Studying Differences in 172 | Segregation Across Time and Space. Sociological Methods & Research. 173 | 174 | 175 | - Elbers, Benjamin and Rob Gruijters (2023). Segplot: A New Method for 176 | Visualizing Patterns of Multi-Group Segregation. 177 | 178 | 179 | ## Some additional resources 180 | 181 | - The book *Analyzing US Census Data: Methods, Maps, and Models in R* 182 | by Kyle E. Walker contains [a discussion of this 183 | package](https://walker-data.com/census-r/modeling-us-census-data.html#indices-of-segregation-and-diversity), 184 | and is a great resource for anyone working with spatial data, 185 | especially U.S. Census data. 186 | - A paper that makes use of this package: [Did Residential Racial 187 | Segregation in the U.S. Really Increase? An Analysis Accounting for 188 | Changes in Racial 189 | Diversity](https://elbersb.com/public/posts/2021-07-23-segregation-increase/) 190 | ([Code and Data](https://osf.io/mg9q4/)) 191 | - Some of the analyses [in this 192 | article](https://multimedia.tijd.be/diversiteit/) by the Belgian 193 | newspaper *De Tijd* used the package. 194 | - The analyses of [this article in the Wall Street 195 | Journal](https://www.wsj.com/articles/chicago-vs-dallas-why-the-north-lags-behind-the-south-and-west-in-racial-integration-11657936680) 196 | were produced using this package. 197 | 198 | ## References on entropy-based segregation indices 199 | 200 | Deutsch, J., Flückiger, Y. & Silber, J. (2009). Analyzing Changes in 201 | Occupational Segregation: The Case of Switzerland (1970–2000), in: Yves 202 | Flückiger, Sean F. Reardon, Jacques Silber (eds.) Occupational and 203 | Residential Segregation (Research on Economic Inequality, Volume 17), 204 | 171–202. 205 | 206 | DiPrete, T. A., Eller, C. C., Bol, T., & van de Werfhorst, H. G. (2017). 207 | School-to-Work Linkages in the United States, Germany, and France. 208 | American Journal of Sociology, 122(6), 1869-1938. 209 | 210 | 211 | Elbers, B. (2021). A Method for Studying Differences in Segregation 212 | Across Time and Space. Sociological Methods & Research. 213 | 214 | 215 | Forster, A. G., & Bol, T. (2017). Vocational education and employment 216 | over the life course using a new measure of occupational specificity. 217 | Social Science Research, 70, 176-197. 218 | 219 | 220 | Theil, H. (1971). Principles of Econometrics. New York: Wiley. 221 | 222 | Frankel, D. M., & Volij, O. (2011). Measuring school segregation. 223 | Journal of Economic Theory, 146(1), 1-38. 224 | 225 | 226 | Mora, R., & Ruiz-Castillo, J. (2003). Additively decomposable 227 | segregation indexes. The case of gender segregation by occupations and 228 | human capital levels in Spain. The Journal of Economic Inequality, 1(2), 229 | 147-179. 230 | 231 | Mora, R., & Ruiz-Castillo, J. (2009). The Invariance Properties of the 232 | Mutual Information Index of Multigroup Segregation, in: Yves Flückiger, 233 | Sean F. Reardon, Jacques Silber (eds.) Occupational and Residential 234 | Segregation (Research on Economic Inequality, Volume 17), 33-53. 
235 | 236 | Mora, R., & Ruiz-Castillo, J. (2011). Entropy-based Segregation Indices. 237 | Sociological Methodology, 41(1), 159–194. 238 | 239 | 240 | Van Puyenbroeck, T., De Bruyne, K., & Sels, L. (2012). More than ‘Mutual 241 | Information’: Educational and sectoral gender segregation and their 242 | interaction on the Flemish labor market. Labour Economics, 19(1), 1-8. 243 | 244 | 245 | Watts, M. The Use and Abuse of Entropy Based Segregation Indices. 246 | Working Paper. URL: 247 | 248 | -------------------------------------------------------------------------------- /R/ipf.R: -------------------------------------------------------------------------------- 1 | #' Adjustment of marginal distributions using iterative proportional fitting 2 | #' 3 | #' Adjusts the marginal distributions for \code{group} and \code{unit} 4 | #' in \code{source} to the respective marginal distributions in \code{target}, using the iterative 5 | #' proportional fitting algorithm (IPF). 6 | #' 7 | #' The algorithm works by scaling 8 | #' the marginal distribution of \code{group} in the \code{source} data frame towards the 9 | #' marginal distribution of \code{target}; then repeating this process for \code{unit}. The 10 | #' algorithm then keeps alternating between \code{group} and \code{unit} until the marginals 11 | #' of the adjusted data frame are within the allowed precision. This results in a dataset that 12 | #' retains the association structure of \code{source} while approximating 13 | #' the marginal distribution of \code{target}. If the number of \code{unit} and 14 | #' \code{group} categories is different in \code{source} and \code{target}, the returned data frame 15 | #' contains only the combinations of \code{unit} and \code{group} categories that occur in both datasets. 16 | #' Zero values are replaced by a small, non-zero number (1e-10, or smaller if the data contain frequencies below that). 17 | #' Note that the values returned sum to the number of observations in the source data frame, not the 18 | #' target data frame. This is different from other IPF implementations, but ensures that the IPF 19 | #' does not change the number of observations. 20 | #' 21 | #' @param source A "source" data frame. The marginals of this 22 | #' dataset are adjusted to the marginals of \code{target}. 23 | #' @param target A "target" data frame. The function returns a dataset 24 | #' where the marginal distributions of \code{group} and \code{unit} categories 25 | #' are approximated by those of \code{target}. 26 | #' @param group A categorical variable or a vector of variables 27 | #' contained in \code{source} and \code{target}. Defines the first distribution 28 | #' for adjustment. 29 | #' @param unit A categorical variable or a vector of variables 30 | #' contained in \code{source} and \code{target}. Defines the second distribution 31 | #' for adjustment. 32 | #' @param weight Numeric. (Default \code{NULL}) 33 | #' @param max_iterations Maximum number of iterations used for the IPF algorithm. 34 | #' @param precision Convergence criterion for the IPF algorithm. In every iteration, 35 | #' the absolute difference between the adjusted marginals and the target marginals is calculated 36 | #' for every category of \code{group} and \code{unit}. The algorithm converges when all 37 | #' differences are smaller than \code{precision}. 38 | #' @return Returns a data frame that retains 39 | #' the association structure of \code{source} while approximating 40 | #' the marginal distributions for \code{group} and \code{unit} of \code{target}.
41 | #' The dataset identifies each combination of \code{group} and \code{unit}, 42 | #' and categories that only occur in either \code{source} or \code{target} are dropped. 43 | #' The adjusted frequency of each combination is given by the column \code{n}, 44 | #' while \code{n_target} and \code{n_source} contain the zero-adjusted frequencies 45 | #' in the target and source dataset, respectively. 46 | #' @references 47 | #' W. E. Deming and F. F. Stephan. 1940. 48 | #' "On a Least Squares Adjustment of a Sampled Frequency Table 49 | #' When the Expected Marginal Totals are Known". 50 | #' Annals of Mathematical Statistics. 11 (4): 427–444. 51 | #' 52 | #' T. Karmel and M. Maclachlan. 1988. 53 | #' "Occupational Sex Segregation — Increasing or Decreasing?" Economic Record 64: 187-195. 54 | #' @examples 55 | #' \dontrun{ 56 | #' # adjusts the marginals of group and unit categories so that 57 | #' # schools00 has similar marginals as schools05 58 | #' adj <- ipf(schools00, schools05, "race", "school", weight = "n") 59 | #' 60 | #' # check that the new "race" marginals are similar to the target marginals 61 | #' # (the same could be done for schools) 62 | #' aggregate(adj$n, list(adj$race), sum) 63 | #' aggregate(adj$n_target, list(adj$race), sum) 64 | #' 65 | #' # note that the adjusted dataset contains fewer 66 | #' # schools than either the source or the target dataset, 67 | #' # because the marginals are only defined for the overlap 68 | #' # of schools 69 | #' length(unique(schools00$school)) 70 | #' length(unique(schools05$school)) 71 | #' length(unique(adj$school)) 72 | #' } 73 | #' @import data.table 74 | #' @export 75 | ipf <- function(source, target, group, unit, weight = NULL, 76 | max_iterations = 100, precision = 1e-07) { 77 | d1 <- prepare_data(source, group, unit, weight) 78 | d2 <- prepare_data(target, group, unit, weight) 79 | 80 | common_data <- create_common_data(d1, d2, group, unit) 81 | update_log(ipf_n = 1, ipf_max = 1) 82 | ret <- ipf_compute(common_data, group, unit, max_iterations, precision) 83 | close_log() 84 | 85 | as.data.frame(ret) 86 | } 87 | 88 | #' @import data.table 89 | create_common_data <- function(d1, d2, group, unit, suppress_warnings = FALSE, fill_na = 1e-10) { 90 | # generate the crossproduct of common groups and units to 91 | # preserve all possible combinations 92 | # we reselect to make sure that by removing some unit/group, we don't create any new 93 | # combinations that don't exist. 
There might be rare cases where the loop needs to be repeated 94 | # even further, but then the table is probably too sparse anyway 95 | common_group <- fintersect(d1[, group, with = FALSE], d2[, group, with = FALSE]) 96 | common_unit <- fintersect( 97 | merge(d1, common_group)[, unit, with = FALSE], 98 | merge(d2, common_group)[, unit, with = FALSE] 99 | ) 100 | common_group <- fintersect( 101 | merge(d1, common_unit)[, group, with = FALSE], 102 | merge(d2, common_unit)[, group, with = FALSE] 103 | ) 104 | common_group$key <- 1 105 | common_unit$key <- 1 106 | common <- merge(common_unit, common_group, allow.cartesian = TRUE) 107 | common[, "key" := NULL] 108 | 109 | # this is optional because mutual_difference handles this internally -- no need 110 | # to print a warning here 111 | if (suppress_warnings == FALSE) { 112 | group_removed_d1 <- fsetdiff(d1[, group, with = FALSE], common_group[, group, with = FALSE]) 113 | group_removed_d2 <- fsetdiff(d2[, group, with = FALSE], common_group[, group, with = FALSE]) 114 | unit_removed_d1 <- fsetdiff(d1[, unit, with = FALSE], common_unit[, unit, with = FALSE]) 115 | unit_removed_d2 <- fsetdiff(d2[, unit, with = FALSE], common_unit[, unit, with = FALSE]) 116 | 117 | if (nrow(group_removed_d1) > 0) { 118 | warning(paste0( 119 | "IPF procedure removed ", nrow(group_removed_d1), " group categories from source, ", 120 | "this likely reduced the sample size (check columns n_source and n_target)" 121 | )) 122 | } 123 | if (nrow(group_removed_d2) > 0) { 124 | warning(paste0( 125 | "IPF procedure removed ", nrow(group_removed_d2), " group categories from target, ", 126 | "this likely reduced the sample size (check columns n_source and n_target)" 127 | )) 128 | } 129 | if (nrow(unit_removed_d1) > 0) { 130 | warning(paste0( 131 | "IPF procedure removed ", nrow(unit_removed_d1), " units from source, ", 132 | "this likely reduced the sample size (check columns n_source and n_target)" 133 | )) 134 | } 135 | if (nrow(unit_removed_d2) > 0) { 136 | warning(paste0( 137 | "IPF procedure removed ", nrow(unit_removed_d2), " units from target, ", 138 | "this likely reduced the sample size (check columns n_source and n_target)" 139 | )) 140 | } 141 | } 142 | 143 | # make sure that fill_na is much smaller than the smallest frequency 144 | min_freq <- min(c(d1[["freq"]], d2[["freq"]])) 145 | if (fill_na > min_freq) { 146 | fill_na <- min_freq / 1000 147 | } 148 | 149 | # join original frequencies 150 | common <- merge(common, d1, by = c(group, unit), all.x = TRUE) 151 | setnames(common, "freq", "freq_orig1") 152 | common <- merge(common, d2, by = c(group, unit), all.x = TRUE) 153 | setnames(common, "freq", "freq_orig2") 154 | # NA in freq1 and freq2 are replaced by a small non-zero number 155 | common[, freq1 := ifelse(is.na(freq_orig1), fill_na, freq_orig1)] 156 | common[, freq2 := ifelse(is.na(freq_orig2), fill_na, freq_orig2)] 157 | common[, freq_orig1 := ifelse(is.na(freq_orig1), 0, freq_orig1)] 158 | common[, freq_orig2 := ifelse(is.na(freq_orig2), 0, freq_orig2)] 159 | common 160 | } 161 | 162 | 163 | #' @import data.table 164 | ipf_compute <- function(data, group, unit, 165 | max_iterations = 100, precision = .001, 166 | only_group = FALSE, only_unit = FALSE) { 167 | # work with relative weights 168 | data[, p1 := freq1 / sum(freq1)] 169 | data[, p2 := freq2 / sum(freq2)] 170 | data[, p_group_s := sum(p1), by = group] 171 | data[, p_unit_s := sum(p1), by = unit] 172 | 173 | if (only_group == TRUE) { 174 | data[, p_group_t := sum(p2), by = group] 175 | data[,
p_unit_t := sum(p1), by = unit] 176 | } else if (only_unit == TRUE) { 177 | data[, p_group_t := sum(p1), by = group] 178 | data[, p_unit_t := sum(p2), by = unit] 179 | } else { 180 | data[, p_group_t := sum(p2), by = group] 181 | data[, p_unit_t := sum(p2), by = unit] 182 | } 183 | 184 | # IPF algorithm 185 | converged <- FALSE 186 | start_i <- sample(2, 1) # start randomly with either row or column adjustment 187 | max_iterations <- max_iterations - 1 + start_i # offset so max_iterations is still the same 188 | 189 | for (i in start_i:(max_iterations * 2)) { 190 | if (i %% 2 == 0) { 191 | data[, `:=`(p1 = p1 * p_unit_t / p_unit_s)] 192 | } else { 193 | data[, `:=`(p1 = p1 * p_group_t / p_group_s)] 194 | } 195 | data[, p1 := p1 / sum(p1)] 196 | 197 | if (i %% 5 == 0) { 198 | update_log_progress("#") 199 | } 200 | 201 | data[, p_group_s := sum(p1), by = group] 202 | data[, p_unit_s := sum(p1), by = unit] 203 | 204 | group_diff <- data[, list(first(p_group_s), first(p_group_t)), by = group][, abs(V1 - V2)] 205 | unit_diff <- data[, list(first(p_unit_s), first(p_unit_t)), by = unit][, abs(V1 - V2)] 206 | 207 | if (all(group_diff <= precision) && all(unit_diff <= precision)) { 208 | converged <- TRUE 209 | break 210 | } 211 | } 212 | 213 | if (!converged) { 214 | close_log() 215 | stop("IPF did not converge; lower precision or increase max_iterations.") 216 | } 217 | 218 | # adjust ipf-adjusted count to sample size of source 219 | data[, n := p1 / sum(p1) * sum(freq1)] 220 | setnames(data, "freq1", "n_source") 221 | setnames(data, "freq2", "n_target") 222 | 223 | data[, c( 224 | "p_unit_t", "p_unit_s", "p_group_t", "p_group_s", 225 | "freq_orig1", "freq_orig2", "p1", "p2" 226 | ) := NULL] 227 | setkey(data, NULL) 228 | data 229 | } 230 | -------------------------------------------------------------------------------- /R/plots.R: -------------------------------------------------------------------------------- 1 | #' A visual representation of segregation 2 | #' 3 | #' Produces a segregation plot. 4 | #' 5 | #' @param data A data frame. 6 | #' @param group A categorical variable 7 | #' contained in \code{data}. Defines the first dimension 8 | #' over which segregation is computed. 9 | #' @param unit A categorical variable 10 | #' contained in \code{data}. Defines the second dimension 11 | #' over which segregation is computed. 12 | #' @param weight Numeric. (Default \code{NULL}) 13 | #' @param order A character, either 14 | #' "segregation", "entropy", "majority", or "majority_fixed". 15 | #' Affects the ordering of the units. 16 | #' The horizontal ordering of the groups can be changed 17 | #' by using a factor variable for \code{group}. 18 | #' The difference between "majority" and "majority_fixed" is that the former 19 | #' will reorder the groups in such a way that the majority group actually comes first. 20 | #' If you want to control the ordering yourself, use "majority_fixed" and specify 21 | #' the \code{group} variable as a factor variable. 22 | #' @param secondary_plot If \code{NULL} (default), no secondary plot is drawn. 23 | #' If "segregation", a secondary plot is drawn that shows adjusted local segregation 24 | #' scores for each unit. If "cumulative", a secondary plot is drawn that shows 25 | #' the cumulative contribution of each unit toward the total H (calculated as the 26 | #' proportion of each unit times the adjusted local segregation of each unit).
27 | #' @param reference_distribution Specifies the reference distribution, given as 28 | #' a two-column data frame, to be plotted on the right. 29 | #' If order is \code{segregation}, then this reference distribution is 30 | #' also used to compute the local segregation scores. 31 | #' @param bar_space Specifies space between single units. 32 | #' @param hline Default \code{NULL}. If a color is specified, 33 | #' horizontal lines will be drawn where groups are separated. 34 | #' @return Returns a ggplot2 or patchwork object. 35 | #' @import data.table 36 | #' @export 37 | segplot <- function(data, group, unit, weight, 38 | order = "segregation", secondary_plot = NULL, 39 | reference_distribution = NULL, 40 | bar_space = 0, hline = NULL) { 41 | if (!requireNamespace("patchwork", quietly = TRUE)) { 42 | stop("Please install patchwork to use this function") 43 | } 44 | if (!requireNamespace("ggplot2", quietly = TRUE)) { 45 | stop("Please install ggplot2 to use this function") 46 | } 47 | 48 | checkmate::assert_character(group, len = 1) 49 | checkmate::assert_character(unit, len = 1) 50 | d <- prepare_data(data, group, unit, weight) 51 | # easier if renamed 52 | setnames(d, group, "group") 53 | setnames(d, unit, "unit") 54 | 55 | # check other arguments 56 | checkmate::assert_choice(order, c("segregation", "entropy", "majority", "majority_fixed")) 57 | checkmate::assert_choice(secondary_plot, c("segregation", "cumulative"), null.ok = TRUE) 58 | checkmate::assert_data_frame(reference_distribution, ncols = 2, nrows = d[, uniqueN(group)], null.ok = TRUE) 59 | checkmate::assert_numeric(bar_space, len = 1, lower = 0) 60 | checkmate::assert_character(hline, len = 1, null.ok = TRUE) 61 | 62 | d[, unit := as.character(unit)] 63 | d[, p := freq / sum(freq), by = .(unit)] 64 | d[, p_unit := sum(freq), by = .(unit)] 65 | N <- d[, first(p_unit), by = .(unit)][, sum(V1)] 66 | d[, p_unit := p_unit / N] 67 | 68 | # overall 69 | if (is.null(reference_distribution)) { 70 | overall <- d[, .(freq = sum(freq)), by = .(group)] 71 | overall[, p := freq / sum(freq)] 72 | } else { 73 | stopifnot(names(reference_distribution) == c(group, "p")) 74 | overall <- as.data.table(reference_distribution) 75 | setnames(overall, group, "group") 76 | } 77 | # order by size 78 | setorder(overall, -p) 79 | group_order <- overall[["group"]] 80 | if (order == "majority") { 81 | # if majority, we always force a reordering by group size 82 | d[, group := as.character(group)] 83 | overall[, group := as.character(group)] 84 | d[, group := factor(group, levels = group_order)] 85 | overall[, group := factor(group, levels = group_order)] 86 | } 87 | 88 | wide <- data.table::dcast(d[, -"freq"], p_unit + unit ~ group, value.var = "p", fill = 0) 89 | 90 | if (order == "segregation") { 91 | ls <- merge(d, overall[, .(group, p_overall = p)], by = "group", all.x = TRUE) 92 | ls <- ls[, .(ls = sum(p * logf(p / p_overall))), by = .(unit)] 93 | wide <- merge(ls, wide, by = "unit") 94 | setorder(wide, -ls) 95 | } else if (order == "entropy") { 96 | ent <- d[, .(entropy = entropy(.SD, "group", weight = "freq")), by = .(unit)] 97 | wide <- merge(ent, wide, by = "unit") 98 | setorder(wide, entropy) 99 | } else if (order %in% c("majority", "majority_fixed")) { 100 | if (is.factor(d[["group"]])) { 101 | group_order <- d[, levels(group)] 102 | } 103 | setorderv(wide, 104 | c(group_order[[1]], utils::tail(group_order, 1)), 105 | order = c(1, -1) 106 | ) 107 | } 108 | 109 | # format units 110 | wide[, xmin := cumsum(p_unit) - p_unit] 111 | wide[, xmax := 
cumsum(p_unit)] 112 | wide[, xmin := xmin + (.I - 1) * bar_space] 113 | wide[, xmax := xmax + (.I - 1) * bar_space] 114 | d <- merge(d, wide[, .(unit, xmin, xmax)], by = "unit") 115 | setorderv(d, c("xmin", "group")) 116 | d[, ymin := cumsum(p) - p, by = .(unit)] 117 | d[, ymax := cumsum(p), by = .(unit)] 118 | 119 | if (bar_space == 0) { 120 | breaks <- c(wide[["xmin"]], wide[, max(xmax)]) 121 | } else { 122 | breaks <- c() 123 | } 124 | 125 | # format overall 126 | overall[, xmin := wide[, max(xmax)] + 0.05] 127 | overall[, xmax := wide[, max(xmax)] + 0.10] 128 | setorderv(overall, "group") 129 | overall[, ymin := cumsum(p) - p] 130 | overall[, ymax := cumsum(p)] 131 | 132 | combine <- rbindlist(list(d, overall), use.names = TRUE, fill = TRUE) 133 | plot <- ggplot2::ggplot( 134 | combine, 135 | ggplot2::aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax) 136 | ) + 137 | ggplot2::geom_vline(xintercept = wide[, max(xmax)]) + 138 | ggplot2::geom_vline(xintercept = wide[, max(xmax) + 0.05]) + 139 | ggplot2::geom_rect(ggplot2::aes(fill = .data[["group"]])) + 140 | ggplot2::scale_x_continuous(breaks = breaks, expand = c(0, 0)) + 141 | ggplot2::scale_y_continuous( 142 | labels = scales::percent_format(), 143 | expand = c(0, 0), 144 | sec.axis = ggplot2::dup_axis() 145 | ) + 146 | ggplot2::guides(fill = ggplot2::guide_legend(reverse = TRUE)) + 147 | ggplot2::theme_bw() + 148 | ggplot2::theme( 149 | panel.grid = ggplot2::element_blank(), 150 | axis.text.x = ggplot2::element_blank(), 151 | legend.position = "right" 152 | ) + 153 | ggplot2::labs(fill = NULL) 154 | 155 | if (!is.null(hline)) { 156 | plot <- plot + ggplot2::geom_hline(yintercept = overall[1:(.N - 1), ymax], color = hline) 157 | } 158 | 159 | if (order == "segregation" && is.null(secondary_plot)) { 160 | plot <- plot + ggplot2::labs(x = "< more segregated | less segregated >") 161 | } else if (order == "segregation") { 162 | entropy <- entropy(overall, "group", weight = "p") 163 | H_index <- wide[, sum(ls * p_unit) / entropy] 164 | 165 | if (secondary_plot == "segregation") { 166 | wide[, stat := ls / entropy] 167 | label <- "adj. 
LS" 168 | } else if (secondary_plot == "cumulative") { 169 | wide[, stat := cumsum(p_unit * ls / entropy)] 170 | wide[, stat := max(stat) - stat + min(stat)] 171 | label <- "Cumulative" 172 | } 173 | 174 | stat_segments_h <- wide[, .(unit, x = ifelse(.I == 1, xmin, xmin - bar_space), xend = xmax, y = stat, yend = stat)] 175 | stat_segments_v <- wide[, .(unit, x = xmax, xend = xmax, y = stat, yend = shift(stat, type = "lead"))] 176 | stat_segments <- rbindlist(list(stat_segments_h, stat_segments_v)) 177 | sub_plot <- ggplot2::ggplot( 178 | mapping = ggplot2::aes(x = x, y = y, xend = xend, yend = yend) 179 | ) + 180 | ggplot2::geom_vline(xintercept = wide[, max(xmax)], linewidth = 0.2) + 181 | ggplot2::geom_hline(yintercept = H_index, color = "orange") + 182 | ggplot2::geom_segment(data = stats::na.omit(stat_segments)) + 183 | ggplot2::theme_bw() + 184 | ggplot2::scale_x_continuous( 185 | limits = c(0, overall[, max(xmax)]), expand = c(0, 0), 186 | labels = function(x) scales::label_percent(1)(1 - x) 187 | ) + 188 | ggplot2::scale_y_continuous(sec.axis = ggplot2::dup_axis( 189 | breaks = H_index, 190 | labels = function(x) paste0("H = ", round(H_index, 3)) 191 | )) + 192 | ggplot2::theme( 193 | panel.grid.major.x = ggplot2::element_blank(), 194 | panel.grid.minor.x = ggplot2::element_blank(), 195 | panel.grid.minor.y = ggplot2::element_blank(), 196 | axis.text.x = ggplot2::element_blank(), 197 | axis.ticks.x = ggplot2::element_blank(), 198 | axis.title.y.right = ggplot2::element_blank() 199 | ) + 200 | ggplot2::labs(y = label, x = "< more segregated | less segregated >") 201 | 202 | plot <- plot / sub_plot + patchwork::plot_layout(heights = c(4, 1)) 203 | } 204 | 205 | plot 206 | } 207 | 208 | 209 | #' A visual representation of two-group segregation 210 | #' 211 | #' Produces one or several segregation curves, as defined in Duncan and Duncan (1955) 212 | #' 213 | #' @param data A data frame. 214 | #' @param group A categorical variable contained in \code{data}. 215 | #' Defines the first dimension over which segregation is computed. 216 | #' @param unit A categorical variable contained in \code{data}. 217 | #' Defines the second dimension over which segregation is computed. 218 | #' @param weight Numeric. (Default \code{NULL}) 219 | #' @param segment A categorical variable contained in \code{data}. (Default \code{NULL}) 220 | #' If given, several segregation curves will be shown, one for each segment. 221 | #' @return Returns a ggplot2 object. 
222 | #' @import data.table 223 | #' @export 224 | segcurve <- function(data, group, unit, weight = NULL, segment = NULL) { 225 | if (!requireNamespace("ggplot2", quietly = TRUE)) { 226 | stop("Please install ggplot2 to use this function") 227 | } 228 | 229 | stopifnot(length(group) == 1) 230 | stopifnot(length(unit) == 1) 231 | d <- prepare_data(data, group, unit, weight, within = segment) 232 | # easier if renamed 233 | setnames(d, group, "group") 234 | setnames(d, unit, "unit") 235 | if (is.null(segment)) { 236 | d[["segment"]] <- 1 237 | } else { 238 | stopifnot(length(segment) == 1) 239 | setnames(d, segment, "segment") 240 | d[["segment"]] <- as.factor(d[["segment"]]) 241 | } 242 | 243 | if (d[, uniqueN(group)] != 2) { 244 | stop("requires exactly two groups") 245 | } 246 | 247 | wide <- dcast(d, segment + unit ~ group, value.var = "freq", fill = 0) 248 | group_names <- names(wide)[3:4] 249 | setnames(wide, group_names, c("group1", "group2")) 250 | wide[, pct_group_1 := group1 / (group1 + group2)] 251 | setorder(wide, segment, pct_group_1) 252 | wide[, cumul_prob_1 := cumsum(group1) / sum(group1), by = .(segment)] 253 | wide[, cumul_prob_2 := cumsum(group2) / sum(group2), by = .(segment)] 254 | # need to add line through origin 255 | wide <- wide[, .(segment, cumul_prob_1, cumul_prob_2)] 256 | zeros <- wide[, .(cumul_prob_1 = 0, cumul_prob_2 = 0), by = .(segment)] 257 | wide <- rbindlist(list(wide, zeros)) 258 | 259 | p <- ggplot2::ggplot(wide, ggplot2::aes(x = cumul_prob_2, y = cumul_prob_1)) + 260 | ggplot2::annotate(geom = "segment", x = 0, y = 0, xend = 1, yend = 1, colour = "darkgray") + 261 | ggplot2::scale_x_continuous(labels = scales::percent_format(accuracy = 1)) + 262 | ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + 263 | ggplot2::labs( 264 | x = paste("Cumulative % ", group_names[2]), 265 | y = paste("Cumulative % ", group_names[1]) 266 | ) + 267 | ggplot2::coord_fixed() 268 | 269 | if (is.null(segment)) { 270 | p <- p + ggplot2::geom_line() 271 | } else { 272 | p <- p + 273 | ggplot2::geom_line(ggplot2::aes(color = segment)) + 274 | ggplot2::labs(color = segment) 275 | } 276 | 277 | return(p) 278 | } 279 | -------------------------------------------------------------------------------- /R/mutual_expected.R: -------------------------------------------------------------------------------- 1 | expected_compute <- function(index, d, group_var, unit_var, 2 | fixed_margins, n_bootstrap, base) { 3 | n_group <- d[, sum(freq), by = group_var][, V1] 4 | by_unit <- d[, list(n = sum(freq)), by = unit_var] # to keep the order 5 | n_unit <- by_unit[, n] 6 | if (length(n_group) == 1 || length(n_unit) == 1) { 7 | if (index == "mh") { 8 | return(data.table( 9 | stat = c("M under 0", "H under 0"), 10 | est = c(NA_real_, NA_real_), se = c(NA_real_, NA_real_) 11 | )) 12 | } else { 13 | return(data.table( 14 | stat = "D under 0", 15 | est = NA_real_, se = NA_real_ 16 | )) 17 | } 18 | } 19 | p_group <- n_group / sum(n_group) 20 | p_unit <- n_unit / sum(n_unit) 21 | 22 | if (fixed_margins == TRUE) { 23 | # take the margins from the table provided 24 | entropy_group <- -sum(p_group * logf(p_group, base)) 25 | sim_fixed <- stats::r2dtable(n_bootstrap, n_group, n_unit) 26 | } else { 27 | # simulate margins using a multinomial model 28 | group_margins <- stats::rmultinom(n_bootstrap, sum(n_group), p_group) 29 | unit_margins <- stats::rmultinom(n_bootstrap, sum(n_unit), p_unit) 30 | } 31 | 32 | boot_ret <- lapply(seq_len(n_bootstrap), function(i) { 33 | if (i %% 5 == 0) 
update_log(bs_n = i, bs_max = n_bootstrap) 34 | 35 | if (index %in% c("ls", "mh")) { 36 | if (fixed_margins == TRUE) { 37 | p <- sim_fixed[[i]] / sum(sim_fixed[[i]]) 38 | # margins are fixed, just copy over 39 | p_group_sim <- p_group 40 | p_unit_sim <- p_unit 41 | entropy_group_sim <- entropy_group 42 | } else { 43 | sim <- stats::r2dtable(1, group_margins[, i], unit_margins[, i])[[1]] 44 | p <- sim / sum(sim) 45 | # margins are simulated, calculate new margins 46 | p_group_sim <- rowSums(p) 47 | p_unit_sim <- colSums(p) 48 | entropy_group_sim <- -sum(p_group_sim * logf(p_group_sim, base)) 49 | } 50 | 51 | # calculate local segregation scores per unit 52 | p_within_unit <- sweep(p, 2, p_unit_sim, "/") 53 | ls <- apply(p_within_unit, 2, function(x) sum(x * logf(x / p_group_sim, base))) 54 | 55 | if (index == "mh") { 56 | m <- sum(p_unit_sim * ls) 57 | h <- m / entropy_group_sim 58 | data.table( 59 | stat = c("M under 0", "H under 0"), 60 | est = c(m, h) 61 | ) 62 | } else { 63 | ls_data <- copy(by_unit) 64 | ls_data[, n := NULL] 65 | ls_data[, ls := ls] 66 | ls_data[, p := p_unit_sim] 67 | ls_data 68 | } 69 | } else { 70 | if (fixed_margins == TRUE) { 71 | tab <- sim_fixed[[i]] 72 | } else { 73 | tab <- stats::r2dtable(1, group_margins[, i], unit_margins[, i])[[1]] 74 | } 75 | 76 | div <- sweep(tab, 1, rowSums(tab), "/") 77 | d <- 1 / 2 * sum(apply(div, 2, abs_diff)) 78 | data.table( 79 | stat = c("D under 0"), 80 | est = d 81 | ) 82 | } 83 | }) 84 | close_log() 85 | 86 | res <- data.table::rbindlist(boot_ret) 87 | if (index == "ls") { 88 | res[, .(stat = "LS under 0", est = mean(ls), se = stats::sd(ls), p_mean = mean(p)), by = unit_var] 89 | } else { 90 | res[, list(est = mean(est), se = stats::sd(est)), by = .(stat)] 91 | } 92 | } 93 | 94 | #' Calculates expected values when true segregation is zero 95 | #' 96 | #' When sample sizes are small, one group has a small proportion, or 97 | #' when there are many units, segregation indices are typically upwardly 98 | #' biased, even when true segregation is zero. This function simulates 99 | #' tables with zero segregation, given the marginals of the dataset, 100 | #' and calculates segregation. If the expected values are large, 101 | #' the interpretation of index scores might have to be adjusted. 102 | #' 103 | #' @param data A data frame. 104 | #' @param group A categorical variable or a vector of variables 105 | #' contained in \code{data}. Defines the first dimension 106 | #' over which segregation is computed. 107 | #' @param unit A categorical variable or a vector of variables 108 | #' contained in \code{data}. Defines the second dimension 109 | #' over which segregation is computed. 110 | #' @param weight Numeric. (Default \code{NULL}) 111 | #' @param within Apply algorithm within each group defined by this variable, 112 | #' and report the weighted average. (Default \code{NULL}) 113 | #' @param fixed_margins Should the margins be fixed or simulated? (Default \code{TRUE}) 114 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 115 | #' @param base Base of the logarithm that is used in the calculation. 116 | #' Defaults to the natural logarithm. 117 | #' @return A data.table with two rows, corresponding to the expected values of 118 | #' segregation when true segregation is zero. 
119 | #' @examples 120 | #' \dontrun{ 121 | #' # the schools00 dataset has a large sample size, so expected segregation is close to zero 122 | #' mutual_total_expected(schools00, "race", "school", weight = "n") 123 | #' 124 | #' # but we can build a smaller table, with 100 students distributed across 125 | #' # 10 schools, where one racial group has 10% of the students 126 | #' small <- data.frame( 127 | #' school = c(1:10, 1:10), 128 | #' race = c(rep("r1", 10), rep("r2", 10)), 129 | #' n = c(rep(1, 10), rep(9, 10)) 130 | #' ) 131 | #' mutual_total_expected(small, "race", "school", weight = "n") 132 | #' # with an increase in sample size (n=1000), the values improve 133 | #' small$n <- small$n * 10 134 | #' mutual_total_expected(small, "race", "school", weight = "n") 135 | #' } 136 | #' @import data.table 137 | #' @export 138 | mutual_total_expected <- function(data, group, unit, weight = NULL, 139 | within = NULL, fixed_margins = TRUE, 140 | n_bootstrap = 100, base = exp(1)) { 141 | d <- prepare_data(data, group, unit, weight, within) 142 | n_total <- d[, sum(freq)] 143 | if (all.equal(n_total, round(n_total)) != TRUE) { 144 | stop(paste0( 145 | "bootstrap with a total sample size that is not an integer is not allowed, ", 146 | "maybe scale your weights?" 147 | )) 148 | } 149 | 150 | if (is.null(within)) { 151 | res <- expected_compute( 152 | "mh", d, group, unit, 153 | fixed_margins, n_bootstrap, base 154 | ) 155 | } else { 156 | res <- d[, expected_compute( 157 | "mh", .SD, ..group, ..unit, 158 | ..fixed_margins, ..n_bootstrap, ..base 159 | ), 160 | by = within 161 | ] 162 | n_na <- sum(is.na(res$se)) / 2 163 | if (n_na > 0) { 164 | message(paste0("Removed ", n_na, " singleton items")) 165 | res <- res[!is.na(est)] 166 | } 167 | p <- d[, .(p = sum(freq)), by = within] 168 | res <- merge(res, p, all.x = TRUE, by = within) 169 | res[, p := p / sum(p), by = .(stat)] 170 | res <- res[, .(est = sum(est * p), se = sqrt(sum(p * se^2))), by = .(stat)] 171 | } 172 | res 173 | } 174 | 175 | #' Calculates expected values when true segregation is zero 176 | #' 177 | #' When sample sizes are small, one group has a small proportion, or 178 | #' when there are many units, segregation indices are typically upwardly 179 | #' biased, even when true segregation is zero. This function simulates 180 | #' tables with zero segregation, given the marginals of the dataset, 181 | #' and calculates segregation. If the expected values are large, 182 | #' the interpretation of index scores might have to be adjusted. 183 | #' 184 | #' @param data A data frame. 185 | #' @param group A categorical variable or a vector of variables 186 | #' contained in \code{data}. Defines the first dimension 187 | #' over which segregation is computed. 188 | #' @param unit A categorical variable or a vector of variables 189 | #' contained in \code{data}. Defines the second dimension 190 | #' over which segregation is computed. 191 | #' @param weight Numeric. (Default \code{NULL}) 192 | #' @param fixed_margins Should the margins be fixed or simulated? (Default \code{TRUE}) 193 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 194 | #' @return A data.table with one row, corresponding to the expected value of 195 | #' the D index when true segregation is zero. 
196 | #' @examples 197 | #' # build a smaller table, with 100 students distributed across 198 | #' # 10 schools, where one racial group has 10% of the students 199 | #' small <- data.frame( 200 | #' school = c(1:10, 1:10), 201 | #' race = c(rep("r1", 10), rep("r2", 10)), 202 | #' n = c(rep(1, 10), rep(9, 10)) 203 | #' ) 204 | #' dissimilarity_expected(small, "race", "school", weight = "n") 205 | #' # with an increase in sample size (n=1000), the values improve 206 | #' small$n <- small$n * 10 207 | #' dissimilarity_expected(small, "race", "school", weight = "n") 208 | #' @import data.table 209 | #' @export 210 | dissimilarity_expected <- function(data, group, unit, weight = NULL, 211 | fixed_margins = TRUE, 212 | n_bootstrap = 100) { 213 | if (length(unique(data[[group]])) != 2) { 214 | stop("The D index only allows two distinct groups") 215 | } 216 | d <- prepare_data(data, group, unit, weight) 217 | 218 | n_total <- d[, sum(freq)] 219 | if (all.equal(n_total, round(n_total)) != TRUE) { 220 | stop(paste0( 221 | "bootstrap with a total sample size that is not an integer is not allowed, ", 222 | "maybe scale your weights?" 223 | )) 224 | } 225 | 226 | expected_compute( 227 | "d", d, group, unit, 228 | fixed_margins, n_bootstrap, exp(1) 229 | ) 230 | } 231 | 232 | #' Calculates expected local segregation scores when true segregation is zero 233 | #' 234 | #' When sample sizes are small, one group has a small proportion, or 235 | #' when there are many units, segregation indices are typically upwardly 236 | #' biased, even when true segregation is zero. This function simulates 237 | #' tables with zero segregation, given the marginals of the dataset, 238 | #' and calculates local segregation scores. If the expected values are large, 239 | #' the interpretation of index scores might have to be adjusted. 240 | #' 241 | #' @param data A data frame. 242 | #' @param group A categorical variable or a vector of variables 243 | #' contained in \code{data}. Defines the first dimension 244 | #' over which segregation is computed. 245 | #' @param unit A categorical variable or a vector of variables 246 | #' contained in \code{data}. Defines the group for which local 247 | #' segregation indices are calculated. 248 | #' @param weight Numeric. (Default \code{NULL}) 249 | #' @param fixed_margins Should the margins be fixed or simulated? (Default \code{TRUE}) 250 | #' @param n_bootstrap Number of bootstrap iterations. (Default \code{100}) 251 | #' @param base Base of the logarithm that is used in the calculation. 252 | #' Defaults to the natural logarithm. 253 | #' @return A data.table with one row for each category of \code{unit}, giving the expected 254 | #' local segregation score (\code{est}), its standard error (\code{se}), and the average marginal proportion (\code{p_mean}) when true segregation is zero.
255 | #' @examples 256 | #' \dontrun{ 257 | #' # the schools00 dataset has a large sample size, so expected segregation is close to zero 258 | #' mutual_local_expected(schools00, "race", "school", weight = "n") 259 | #' 260 | #' # but we can build a smaller table, with 100 students distributed across 261 | #' # 10 schools, where one racial group has 10% of the students 262 | #' small <- data.frame( 263 | #' school = c(1:10, 1:10), 264 | #' race = c(rep("r1", 10), rep("r2", 10)), 265 | #' n = c(rep(1, 10), rep(9, 10)) 266 | #' ) 267 | #' mutual_local_expected(small, "race", "school", weight = "n") 268 | #' # with an increase in sample size (n=1000), the values improve 269 | #' small$n <- small$n * 10 270 | #' mutual_local_expected(small, "race", "school", weight = "n") 271 | #' } 272 | #' @import data.table 273 | #' @export 274 | mutual_local_expected <- function(data, group, unit, weight = NULL, 275 | fixed_margins = TRUE, n_bootstrap = 100, base = exp(1)) { 276 | d <- prepare_data(data, group, unit, weight) 277 | n_total <- d[, sum(freq)] 278 | if (all.equal(n_total, round(n_total)) != TRUE) { 279 | stop(paste0( 280 | "bootstrap with a total sample size that is not an integer is not allowed, ", 281 | "maybe scale your weights?" 282 | )) 283 | } 284 | 285 | expected_compute( 286 | "ls", d, group, unit, 287 | fixed_margins, n_bootstrap, base 288 | ) 289 | } 290 | -------------------------------------------------------------------------------- /src/compression.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::depends(RcppProgress)]] 2 | #include <Rcpp.h> 3 | #include <progress.hpp> 4 | #include <progress_bar.hpp> 5 | using namespace Rcpp; 6 | 7 | struct CompressionResults 8 | { 9 | std::vector<int> iter; 10 | std::vector<double> M; 11 | std::vector<double> M_wgt; 12 | std::vector<int> N_units; 13 | std::vector<std::string> old_unit; 14 | std::vector<std::string> new_unit; 15 | }; 16 | 17 | std::tuple<double, double> calculate_m(std::map<std::string, std::vector<double>> &data) 18 | { 19 | // create group sums 20 | int n_groups = data.begin()->second.size(); 21 | std::vector<double> group_sums(n_groups, 0.0); 22 | for (auto &[unit, counts] : data) 23 | { 24 | for (int i = 0; i < n_groups; i++) 25 | group_sums[i] += counts[i]; 26 | } 27 | 28 | double n_total = std::accumulate(group_sums.begin(), group_sums.end(), 0.0); 29 | 30 | double m_total = 0.0; 31 | for (auto &[unit, counts] : data) 32 | { 33 | double n_unit = std::accumulate(counts.begin(), counts.end(), 0.0); 34 | for (int i = 0; i < n_groups; i++) 35 | { 36 | double obs = counts[i] / n_total; 37 | if (obs == 0) 38 | continue; 39 | double exp = (n_unit / n_total * group_sums[i] / n_total); 40 | m_total += obs * std::log(obs / exp); 41 | } 42 | } 43 | 44 | return std::make_tuple(n_total, m_total); 45 | } 46 | 47 | std::vector<std::pair<std::string, double>> calculate_ls(std::map<std::string, std::vector<double>> &data) 48 | { 49 | // create group sums 50 | int n_groups = data.begin()->second.size(); 51 | std::vector<double> group_sums(n_groups, 0.0); 52 | for (auto &[unit, counts] : data) 53 | { 54 | for (int i = 0; i < n_groups; i++) 55 | group_sums[i] += counts[i]; 56 | } 57 | 58 | // create group proportions 59 | double n_total = std::accumulate(group_sums.begin(), group_sums.end(), 0.0); 60 | std::vector<double> group_p(n_groups, 0.0); 61 | for (int i = 0; i < n_groups; i++) 62 | { 63 | group_p[i] = group_sums[i] / n_total; 64 | } 65 | 66 | // create local segregation scores for each unit 67 | std::vector<std::pair<std::string, double>> ls; 68 | for (auto &[unit, counts] : data) 69 | { 70 | double n_unit = std::accumulate(counts.begin(), counts.end(), 0.0); 71 | double ls_unit = 0.0; 72 | for (int i = 0; i < n_groups; i++) 73 | { 74 | double p_group_given_unit = counts[i] / n_unit; 75 | if (p_group_given_unit == 0) 76 | continue; 77 | ls_unit += p_group_given_unit * std::log(p_group_given_unit / group_p[i]); 78 | } 79 | ls.push_back({unit, ls_unit}); 80 | } 81 | 82 | std::sort(ls.begin(), ls.end(), [](auto &left, auto &right) 83 | { return left.second < right.second; }); 84 | 85 | return ls; 86 | } 87 | 88 | double calculate_reduction(double n, std::vector<double> &unit1, std::vector<double> &unit2) 89 | { 90 | // create group sums 91 | const int n_groups = unit1.size(); 92 | double n_total = 0.0; 93 | std::vector<double> group_sums(n_groups, 0.0); 94 | for (int i = 0; i < n_groups; i++) 95 | { 96 | group_sums[i] += unit1[i]; 97 | group_sums[i] += unit2[i]; 98 | n_total += unit1[i]; 99 | n_total += unit2[i]; 100 | } 101 | 102 | // create unit sums 103 | const double n_unit1 = std::accumulate(unit1.begin(), unit1.end(), 0.0); 104 | const double n_unit2 = std::accumulate(unit2.begin(), unit2.end(), 0.0); 105 | 106 | // calculate M 107 | double m_total = 0.0; 108 | for (int i = 0; i < n_groups; i++) 109 | { 110 | // unit 1 111 | double obs1 = unit1[i] / n_total; 112 | if (obs1 != 0) 113 | { 114 | double exp1 = (n_unit1 / n_total * group_sums[i] / n_total); 115 | m_total += obs1 * std::log(obs1 / exp1); 116 | } 117 | // unit 2 118 | double obs2 = unit2[i] / n_total; 119 | if (obs2 != 0) 120 | { 121 | double exp2 = (n_unit2 / n_total * group_sums[i] / n_total); 122 | m_total += obs2 * std::log(obs2 / exp2); 123 | } 124 | } 125 | 126 | return n_total / n * m_total; 127 | } 128 | 129 | typedef std::pair<std::string, std::string> t_neighbor; 130 | 131 | t_neighbor neighbor_make_pair(std::string a, std::string b) 132 | { 133 | if (a < b) 134 | return t_neighbor(a, b); 135 | else 136 | return t_neighbor(b, a); 137 | } 138 | 139 | // [[Rcpp::export]] 140 | List compress_compute_cpp( 141 | std::string neighbors_option, 142 | StringMatrix m_neighbors, 143 | int n_neighbors, 144 | NumericMatrix m_data, 145 | std::vector<std::string> unit_names, 146 | int max_iter) 147 | { 148 | // prepare main data structure: map, where the key is the unit name 149 | // and the values are the ordered group counts 150 | std::map<std::string, std::vector<double>> data; 151 | for (int i = 0; i < m_data.nrow(); i++) 152 | { 153 | auto unit = unit_names[i]; 154 | data[unit] = {}; 155 | for (int j = 0; j < m_data.ncol(); j++) 156 | data[unit].push_back(m_data(i, j)); 157 | } 158 | 159 | int n_groups = m_data.ncol(); 160 | // compute total M index 161 | double n_total, m_total; 162 | std::tie(n_total, m_total) = calculate_m(data); 163 | 164 | // prepare neighbors data structure (list of pairs, where order is unimportant) 165 | std::map<t_neighbor, double> neighbors; 166 | if (neighbors_option == "all") 167 | { 168 | for (int row = 0; row < unit_names.size(); row++) 169 | { 170 | for (int col = row + 1; col < unit_names.size(); col++) 171 | { 172 | neighbors[neighbor_make_pair(unit_names[row], unit_names[col])] = 0; 173 | } 174 | } 175 | } 176 | else if (neighbors_option == "local") 177 | { 178 | 179 | auto ls = calculate_ls(data); 180 | for (int i = 0; i < ls.size(); i++) 181 | { 182 | for (int j = std::max(i - n_neighbors, 0); 183 | j < std::min(i + n_neighbors + 1, static_cast<int>(ls.size()) - 1); 184 | j++) 185 | { 186 | if (i != j) 187 | neighbors[neighbor_make_pair(ls[i].first, ls[j].first)] = 0; 188 | } 189 | } 190 | } 191 | else if (neighbors_option == "df") 192 | { 193 | for (int i = 0; i < m_neighbors.nrow(); i++) 194 | { 195 | std::string unit1 = Rcpp::as<std::string>(m_neighbors(i, 0)); 196 | std::string unit2 = Rcpp::as<std::string>(m_neighbors(i, 1)); 197 | if (unit1 != unit2) 198 | neighbors[neighbor_make_pair(unit1, unit2)] = 0; 199 | } 200 | } 201 | 202 | // calculate reduction for each neighbor pair 203 | // (we don't do this in the previous step because otherwise we 204 | // might do a lot of duplicate calculations) 205 | for (const auto &[key, reduction] : neighbors) 206 | { 207 | neighbors[neighbor_make_pair(key.first, key.second)] = calculate_reduction(n_total, data[key.first], data[key.second]); 208 | } 209 | 210 | // determine maximum number of iterations 211 | if (max_iter == -1) 212 | { 213 | max_iter = std::min(static_cast<int>(neighbors.size()), static_cast<int>(unit_names.size()) - 1); 214 | } 215 | 216 | CompressionResults results; 217 | results.iter.reserve(max_iter); 218 | results.M_wgt.reserve(max_iter); 219 | results.M.reserve(max_iter); 220 | results.N_units.reserve(max_iter); 221 | results.old_unit.reserve(max_iter); 222 | results.new_unit.reserve(max_iter); 223 | 224 | int counter = 0; 225 | double m_current = m_total; 226 | int n_units_current = m_data.nrow(); 227 | 228 | Progress p(n_units_current, true); 229 | while (neighbors.size() > 0) 230 | { 231 | if (Progress::check_abort()) 232 | throw Rcpp::exception("user interruption"); 233 | 234 | // find smallest reduction 235 | double min_reduction = 10000; 236 | t_neighbor min_key; 237 | for (const auto &[key, reduction] : neighbors) 238 | { 239 | if (reduction < min_reduction) 240 | { 241 | min_reduction = reduction; 242 | min_key = key; 243 | if (reduction == 0) 244 | break; 245 | } 246 | } 247 | 248 | const std::string unit_keep = min_key.first; 249 | const std::string unit_delete = min_key.second; 250 | // add counts of 'delete' to 'keep', delete unit 251 | for (int i = 0; i < n_groups; i++) 252 | data[unit_keep][i] += data[unit_delete][i]; 253 | data.erase(unit_delete); 254 | 255 | // update neighbors 256 | neighbors.erase(min_key); 257 | 258 | std::vector<t_neighbor> delete_neighbors; 259 | std::map<t_neighbor, double> new_neighbors; 260 | for (const auto &[key, reduction] : neighbors) 261 | { 262 | // update pairs if deleted unit is involved 263 | if (key.first == unit_delete || key.second == unit_delete) 264 | { 265 | // this is a pair some_unit - deleted_unit -- replace deleted_unit with unit_keep 266 | delete_neighbors.push_back(key); 267 | std::string some_unit = (key.first == unit_delete) ? key.second : key.first; 268 | new_neighbors[neighbor_make_pair(unit_keep, some_unit)] = 269 | calculate_reduction(n_total, data[unit_keep], data[some_unit]); 270 | } 271 | // recalculate reduction if kept unit is involved 272 | else if (key.first == unit_keep || key.second == unit_keep) 273 | { 274 | new_neighbors[key] = calculate_reduction(n_total, data[key.first], data[key.second]); 275 | } 276 | } 277 | 278 | // delete neighbors that involve the old unit 279 | for (int i = 0; i < delete_neighbors.size(); i++) 280 | neighbors.erase(delete_neighbors[i]); 281 | 282 | // update and add new neighbors 283 | for (const auto &[key, reduction] : new_neighbors) 284 | neighbors[key] = reduction; 285 | 286 | // update results 287 | m_current -= min_reduction; 288 | n_units_current -= 1; 289 | counter += 1; 290 | results.iter.push_back(counter); 291 | results.M_wgt.push_back(min_reduction); 292 | if (n_units_current == 1) 293 | { 294 | // ensure that this is displayed as a true 0 295 | results.M.push_back(0); 296 | } 297 | else 298 | { 299 | results.M.push_back(m_current); 300 | } 301 | results.N_units.push_back(n_units_current); 302 | results.old_unit.push_back(unit_delete); 303 | results.new_unit.push_back(unit_keep); 304 | 305 | if (counter == max_iter) 306 | break; 307 | 308 | p.increment(); 309 | } 310 | 311 | return List::create( 312 | _["iter"] = results.iter, 313 | _["M_wgt"] = results.M_wgt, 314 | _["M"] = results.M, 315 | _["N_units"] = results.N_units, 316 | _["old_unit"] = results.old_unit, 317 | _["new_unit"] = results.new_unit); 318 | } 319 | 320 | int find_in_sets(std::string needle, std::vector<std::set<std::string>> haystack) 321 | { 322 | for (int i = 0; i < haystack.size(); i++) 323 | { 324 | const bool is_in = haystack[i].find(needle) != haystack[i].end(); 325 | if (is_in == true) 326 | return i; 327 | } 328 | return -1; 329 | } 330 | 331 | // [[Rcpp::export]] 332 | List get_crosswalk_cpp(std::vector<std::string> old_unit, std::vector<std::string> new_unit) 333 | { 334 | std::vector<std::set<std::string>> bags; 335 | 336 | for (int i = 0; i < old_unit.size(); i++) 337 | { 338 | int old_unit_bag = find_in_sets(old_unit[i], bags); 339 | int new_unit_bag = find_in_sets(new_unit[i], bags); 340 | 341 | if (old_unit_bag == -1 && new_unit_bag == -1) 342 | { 343 | // neither unit in bags - add new bag 344 | bags.push_back({old_unit[i], new_unit[i]}); 345 | } 346 | else if (old_unit_bag != -1 && new_unit_bag == -1) 347 | { 348 | // old_unit already exists - add new_unit to same bag 349 | bags[old_unit_bag].insert(new_unit[i]); 350 | } 351 | else if (old_unit_bag == -1 && new_unit_bag != -1) 352 | { 353 | // new_unit already exists - add old_unit to same bag 354 | bags[new_unit_bag].insert(old_unit[i]); 355 | } 356 | else if (old_unit_bag != -1 && new_unit_bag != -1) 357 | { 358 | // both units exist in different bags, merge the two 359 | bags[old_unit_bag].insert(bags[new_unit_bag].begin(), bags[new_unit_bag].end()); 360 | bags.erase(bags.begin() + new_unit_bag); 361 | } 362 | } 363 | 364 | // convert to List 365 | List l(bags.size()); 366 | for (int i = 0; i < bags.size(); i++) 367 | { 368 | std::vector<std::string> bag(bags[i].size()); 369 | int index = 0; 370 | for (auto el : bags[i]) 371 | { 372 | bag[index] = el; 373 | index++; 374 | } 375 | l[i] = bag; 376 | } 377 | return l; 378 | } --------------------------------------------------------------------------------