├── vignettes
    ├── .gitignore
    └── cytofin.Rmd
├── LICENSE
├── .Rbuildignore
├── inst
    ├── images
    │   ├── image1.png
    │   ├── image2.png
    │   └── image3.png
    └── extdata
    │   ├── test_metadata_raw.csv
    │   └── test_panel.csv
├── man
    ├── figures
    │   ├── README-unnamed-chunk-11-1.png
    │   ├── README-unnamed-chunk-11-10.png
    │   ├── README-unnamed-chunk-11-2.png
    │   ├── README-unnamed-chunk-11-3.png
    │   ├── README-unnamed-chunk-11-4.png
    │   ├── README-unnamed-chunk-11-5.png
    │   ├── README-unnamed-chunk-11-6.png
    │   ├── README-unnamed-chunk-11-7.png
    │   ├── README-unnamed-chunk-11-8.png
    │   ├── README-unnamed-chunk-11-9.png
    │   ├── README-unnamed-chunk-12-1.png
    │   ├── README-unnamed-chunk-14-1.png
    │   ├── README-unnamed-chunk-15-1.png
    │   ├── README-unnamed-chunk-16-1.png
    │   └── README-unnamed-chunk-17-1.png
    ├── homogenize_flowFrame.Rd
    ├── get_extension.Rd
    ├── cytofin-package.Rd
    ├── rev_asinh.Rd
    ├── cytofin_read_metadata.Rd
    ├── cytofin_read_panel_info.Rd
    ├── cytofin_generate_panel_template.Rd
    ├── cytofin_generate_metadata_template.Rd
    ├── cytofin_homogenize.Rd
    ├── cytofin_prep_anchors.Rd
    ├── cytofin_make_plots.Rd
    ├── cytofin_normalize.Rd
    └── cytofin_normalize_nrs.Rd
├── R
    ├── cytofin-package.R
    ├── file_templates.R
    ├── cytofin_homogenize.R
    ├── utils.R
    ├── cytofin_prep_anchors.R
    ├── cytofin_normalize.R
    ├── cytofin_make_plots.R
    └── cytofin_normalize_nrs.R
├── NAMESPACE
├── LICENSE.md
├── DESCRIPTION
├── README.Rmd
└── README.md


/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: cytofin authors
3 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^LICENSE\.md$
5 | 


--------------------------------------------------------------------------------
/inst/images/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/inst/images/image1.png


--------------------------------------------------------------------------------
/inst/images/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/inst/images/image2.png


--------------------------------------------------------------------------------
/inst/images/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/inst/images/image3.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-10.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-2.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-3.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-4.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-5.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-6.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-7.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-8.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-9.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-12-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-14-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-17-1.png


--------------------------------------------------------------------------------
/R/cytofin-package.R:
--------------------------------------------------------------------------------
 1 | #' @keywords internal
 2 | "_PACKAGE"  # sentinel documented by roxygen2 to build the package-level help page (man/cytofin-package.Rd)
 3 | 
 4 | # The following block is used by usethis to automatically manage
 5 | # roxygen namespace tags. Modify with care!
 6 | ## usethis namespace: start
 7 | 
 8 | ## usethis namespace: end
 9 | NULL
10 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(cytofin_generate_metadata_template)
 4 | export(cytofin_generate_panel_template)
 5 | export(cytofin_homogenize)
 6 | export(cytofin_make_plots)
 7 | export(cytofin_normalize)
 8 | export(cytofin_normalize_nrs)
 9 | export(cytofin_prep_anchors)
10 | 


--------------------------------------------------------------------------------
/man/homogenize_flowFrame.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{homogenize_flowFrame}
 4 | \alias{homogenize_flowFrame}
 5 | \title{Alter a flowFrame to only include data from channels in a reference panel}
 6 | \usage{
 7 | homogenize_flowFrame(fcs_raw, ref_panel)
 8 | }
 9 | \arguments{
10 | \item{fcs_raw}{A flowFrame containing unprocessed CyTOF data}
11 | 
12 | \item{ref_panel}{A data.frame representing the reference panel data for a
13 | cytofin analysis.}
14 | }
15 | \value{
16 | a homogenized flowFrame
17 | }
18 | \description{
19 | Alter a flowFrame to only include data from channels in a reference panel
20 | }
21 | 


--------------------------------------------------------------------------------
/man/get_extension.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{get_extension}
 4 | \alias{get_extension}
 5 | \title{Find the extension for a file}
 6 | \usage{
 7 | get_extension(filename)
 8 | }
 9 | \arguments{
10 | \item{filename}{A string representing the name of a file in its local directory}
11 | }
12 | \value{
13 | The file extension of \code{filename}
14 | }
15 | \description{
16 | Find the extension for a file
17 | }
18 | \examples{
19 | \dontrun{
20 | # example file name
21 | my_filename <- "my_file.txt"
22 | 
23 | # find and print the extension
24 | my_extension <- cytofin:::get_extension(my_filename)
25 | print(my_extension)
26 | }
27 | }
28 | 


--------------------------------------------------------------------------------
/man/cytofin-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cytofin-package.R
 3 | \docType{package}
 4 | \name{cytofin-package}
 5 | \alias{cytofin}
 6 | \alias{cytofin-package}
 7 | \title{cytofin: Integrate CyTOF Datasets From Heterogeneous Sources}
 8 | \description{
 9 | Integrate multiple CyTOF datasets collected from independent
10 |   sources (e.g. labs or institutions). Cytofin performs CyTOF panel alignment
11 |   across datasets ("homogenization") as well as batch correction using generalized
12 |   anchors identified on each CyTOF plate ("normalization").
13 | }
14 | \author{
15 | \strong{Maintainer}: Ben Lo \email{bennylo@stanford.edu}
16 | 
17 | Authors:
18 | \itemize{
19 |   \item Timothy Keyes \email{tkeyes@stanford.edu} (\href{https://orcid.org/0000-0003-0423-9679}{ORCID})
20 | }
21 | 
22 | Other contributors:
23 | \itemize{
24 |   \item Kara Davis \email{kardavis@stanford.edu} [research team head, owner]
25 | }
26 | 
27 | }
28 | \keyword{internal}
29 | 


--------------------------------------------------------------------------------
/man/rev_asinh.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{rev_asinh}
 4 | \alias{rev_asinh}
 5 | \title{Reverses arcsinh transformation with cofactor \code{scale_factor} and a shift of \code{shift_factor}.}
 6 | \usage{
 7 | rev_asinh(x, shift_factor, scale_factor)
 8 | }
 9 | \arguments{
10 | \item{x}{A numeric vector.}
11 | 
12 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
13 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
14 | \code{new_x <- asinh(a + b * x)}.}
15 | 
16 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
17 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
18 | \code{new_x <- asinh(a + b * x)}.}
19 | }
20 | \value{
21 | A numeric vector after undergoing reverse
22 | arcsinh transformation
23 | }
24 | \description{
25 | Reverses arcsinh transformation with cofactor \code{scale_factor} and a shift of \code{shift_factor}.
26 | }
27 | 


--------------------------------------------------------------------------------
/inst/extdata/test_metadata_raw.csv:
--------------------------------------------------------------------------------
 1 | filename,cohort,plate_number,patient_id,condition,is_anchor,validation
 2 | ALL05v2_Plate2_UPN94 das.fcs,ALL05v2,plate2,UPN94,Das,0,homogenized_ALL05v2_plate2_UPN94 das.fcs
 3 | ALL08_Plate8_UPN26 basal.fcs,ALL08,plate8,UPN26,Basal,0,homogenized_ALL08_plate8_UPN26 basal.fcs
 4 | CRLF2_Plate1_UPN53 das + TSLP.fcs,CRLF2,plate1,UPN53,das_TSLP,0,homogenized_CRLF2_plate1_UPN53 das + TSLP.fcs
 5 | ALL05v2_Plate2_healthy basal1.fcs,ALL05v2,plate2,Healthy,Basal,1,homogenized_ALL05v2_plate2_healthy basal1.fcs
 6 | ALL08_Plate8_Healthy03 basal.fcs,ALL08,plate8,Healthy03,Basal,1,homogenized_ALL08_plate9_Healthy03 basal.fcs
 7 | CRLF2_Plate1_Healthy 04 BCR.fcs,CRLF2,plate1,Healthy04,BCR,1,homogenized_CRLF2_plate1_Healthy 04 BCR.fcs
 8 | MS_Plate5_SU978 Basal.fcs,MajSak,plate5,SU978,Basal,0,homogenized_MajSak_plate5_SU978 Basal.fcs
 9 | MS_Plate5_Healthy BM.fcs,MajSak,plate5,Healthy,BM,1,homogenized_MajSak_plate5_Healthy BM.fcs
10 | SJ_Plate2_TB010950_Basal.fcs,StJude,plate2,TB010950,Basal,0,homogenized_StJude_plate2_TB010950_Basal.fcs
11 | SJ_Plate2_Healthy_BM.fcs,StJude,plate2,Healthy,BM,1,homogenized_StJude_plate2_Healthy_BM.fcs


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2021 cytofin authors
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/man/cytofin_read_metadata.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{cytofin_read_metadata}
 4 | \alias{cytofin_read_metadata}
 5 | \title{Read in a cytofin metadata file}
 6 | \usage{
 7 | cytofin_read_metadata(metadata_path)
 8 | }
 9 | \arguments{
10 | \item{metadata_path}{A filepath leading to an .xlsx or .csv file
11 | containing a table of CyTOF file (.fcs file) names. Columns should include
12 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
13 | and \code{validation}. TO DO: Change the names of these columns to more descriptive
14 | names and make sure that they are all actually needed.
15 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
16 | }
17 | \value{
18 | A data.frame containing the metadata information in the
19 | file stored at \code{metadata_path}.
20 | }
21 | \description{
22 | This function reads a cytofin metadata file from a connection
23 | that points to a .csv or a .xlsx file
24 | }
25 | \examples{
26 | \dontrun{
27 | my_path <- file.path("~", "foo", "bar", "metadata.csv")
28 | my_metadata <- cytofin:::cytofin_read_metadata(my_path)
29 | }
30 | }
31 | 


--------------------------------------------------------------------------------
/man/cytofin_read_panel_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{cytofin_read_panel_info}
 4 | \alias{cytofin_read_panel_info}
 5 | \title{Read in cytofin reference panel information}
 6 | \usage{
 7 | cytofin_read_panel_info(panel_path)
 8 | }
 9 | \arguments{
10 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
11 | a table of standardized antigen panel information. Columns should include
12 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern}, \code{lineage}, \code{functional},
13 | and \code{general}. TO DO: Change the names of these columns to more descriptive
14 | names and make sure that they are all actually needed.
15 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
16 | }
17 | \value{
18 | A data.frame containing the reference panel information in the
19 | file stored at \code{panel_path}.
20 | }
21 | \description{
22 | This function reads cytofin reference panel information from a connection
23 | that points to a .csv or a .xlsx file
24 | }
25 | \examples{
26 | \dontrun{
27 | my_path <- file.path("~", "foo", "bar", "panel.csv")
28 | my_panel <- cytofin:::cytofin_read_panel_info(my_path)
29 | }
30 | }
31 | 


--------------------------------------------------------------------------------
/man/cytofin_generate_panel_template.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/file_templates.R
 3 | \name{cytofin_generate_panel_template}
 4 | \alias{cytofin_generate_panel_template}
 5 | \title{Generate a template for a cytofin reference panel file}
 6 | \usage{
 7 | cytofin_generate_panel_template(
 8 |   file_name = "template_panel_info.csv",
 9 |   template_path = getwd()
10 | )
11 | }
12 | \arguments{
13 | \item{file_name}{A string representing the name of the .csv file to be
14 | saved in the directory specified by \code{template_path}. Defaults to
15 | "template_panel_info.csv"}
16 | 
17 | \item{template_path}{File path or connection where the template file should be
18 | written. Defaults to the current working directory}
19 | }
20 | \description{
21 | \code{cytofin_generate_panel_template} creates a template reference panel .csv file
22 | (with the correct columns and dummy example data) in a specified location.
23 | }
24 | \examples{
25 | 
26 | # specify the path where you'd like to store the template file
27 | my_name <- "panel_template.csv"
28 | my_path <- file.path("~", "Desktop", "template_folder")
29 | 
30 | # generate the template file, which then can be edited manually 
31 | cytofin_generate_panel_template(
32 |    file_name = my_name, 
33 |    template_path = my_path
34 | )
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/man/cytofin_generate_metadata_template.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/file_templates.R
 3 | \name{cytofin_generate_metadata_template}
 4 | \alias{cytofin_generate_metadata_template}
 5 | \title{Generate a template for a cytofin metadata file}
 6 | \usage{
 7 | cytofin_generate_metadata_template(
 8 |   file_name = "template_metadata.csv",
 9 |   template_path = getwd()
10 | )
11 | }
12 | \arguments{
13 | \item{file_name}{A string representing the name of the .csv file to be
14 | saved in the directory specified by \code{template_path}. Defaults to
15 | "template_metadata.csv"}
16 | 
17 | \item{template_path}{A file path or connection where the template file should be
18 | written. Defaults to the current working directory}
19 | }
20 | \description{
21 | \code{cytofin_generate_metadata_template} creates a template metadata .csv file
22 | (with the correct columns and dummy example data) in a specified location.
23 | }
24 | \examples{
25 | # specify the path where you'd like to store the template file
26 | my_name <- "metadata_template.csv"
27 | my_path <- file.path("~", "Desktop", "template_folder")
28 |     
29 | 
30 | # generate the template file, which then can be edited manually 
31 | cytofin_generate_metadata_template(
32 |    file_name = my_name, 
33 |    template_path = my_path
34 | )
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: cytofin
 2 | Title: Integrate CyTOF Datasets From Heterogeneous Sources
 3 | Version: 0.0.0.9000
 4 | Authors@R: 
 5 |     c(person(given = "Ben",
 6 |            family = "Lo",
 7 |            role = c("aut", "cre"),
 8 |            email = "bennylo@stanford.edu"),
 9 |       person(given = "Timothy", 
10 |              family = "Keyes", 
11 |              role = "aut", 
12 |              email = "tkeyes@stanford.edu", 
13 |              comment = c(ORCID = "0000-0003-0423-9679")), 
14 |       person(given = "Kara", 
15 |              family = "Davis", 
16 |              role = c("rth", "own"),
17 |              email = "kardavis@stanford.edu"))
18 | Description: Integrate multiple CyTOF datasets collected from independent
19 |   sources (e.g. labs or institutions). Cytofin performs CyTOF panel alignment
20 |   across datasets ("homogenization") as well as batch correction using generalized
21 |   anchors identified on each CyTOF plate ("normalization").
22 | License: MIT + file LICENSE
23 | Encoding: UTF-8
24 | LazyData: true
25 | Roxygen: list(markdown = TRUE)
26 | RoxygenNote: 7.1.1
27 | Imports:
28 | 	flowCore,
29 | 	readxl,
30 | 	stringr,
31 | 	tidyr,
32 | 	reshape2,
33 | 	ggplot2, 
34 | 	readr, 
35 | 	dplyr
36 | Suggests: 
37 |     rmarkdown,
38 |     knitr,
39 |     testthat (>= 3.0.0)
40 | Config/testthat/edition: 3
41 | VignetteBuilder: knitr
42 | 


--------------------------------------------------------------------------------
/inst/extdata/test_panel.csv:
--------------------------------------------------------------------------------
 1 | metal_name,antigen_name,antigen_pattern,lineage,functional,general
 2 | Time,Time,[Tt]ime,0,0,1
 3 | Event_length,Event_length,ength,0,0,1
 4 | (Pd102)Di,BC1,BC1,0,0,1
 5 | (Pd104)Di,BC2,BC2,0,0,1
 6 | (Pd105)Di,BC3,BC3,0,0,1
 7 | (Pd106)Di,BC4,BC4,0,0,1
 8 | (Pd108)Di,BC5,BC5,0,0,1
 9 | (Pd110)Di,BC6,BC6,0,0,1
10 | (In113)Di,CD235_CD61,CD235,1,0,0
11 | (In115)Di,CD45,CD45,1,0,0
12 | (La139)Di,cPARP,PARP,0,1,0
13 | (Pr141)Di,pPLCg1_2,pPLCg1_2,0,1,0
14 | (Nd142)Di,CD19,CD19,1,0,0
15 | (Nd143)Di,CD22,CD22,1,0,0
16 | (Nd144)Di,p4EBP1,p4EBP1,0,1,0
17 | (Nd145)Di,tIkaros,tIkaros,1,0,0
18 | (Nd146)Di,CD79b,CD79b,1,0,0
19 | (Sm147)Di,CD20,CD20,1,0,0
20 | (Nd148)Di,CD34,CD34,1,0,0
21 | (Sm149)Di,CD179a,CD179a,1,0,0
22 | (Nd150)Di,pSTAT5,pSTAT5,0,1,0
23 | (Sm152)Di,Ki67,Ki67,0,1,0
24 | (Eu153)Di,IgMi,IgMi,1,0,0
25 | (Sm154)Di,Kappa_lambda,appa,0,1,0
26 | (Gd156)Di,CD10,CD10,1,0,0
27 | (Gd158)Di,CD179b,CD179b,1,0,0
28 | (Gd160)Di,CD24,CD24,1,0,0
29 | (Dy161)Di,TSLPr,TSLPr,0,1,0
30 | (Dy162)Di,CD127,CD127,1,0,0
31 | (Dy163)Di,RAG1,RAG1,1,0,0
32 | (Dy164)Di,TdT,Td,1,0,0
33 | (Ho165)Di,Pax5,Pax5,1,0,0
34 | (Er166)Di,pSyk,pSyk,0,1,0
35 | (Er167)Di,CD43,CD43,1,0,0
36 | (Er168)Di,CD38,CD38,1,0,0
37 | (Er170)Di,CD3,CD3^,1,0,0
38 | (Yb171)Di,CD33,FITC|CD33,0,1,0
39 | (Yb172)Di,pS6,pS6,0,1,0
40 | (Yb173)Di,pErk,pErk,0,1,0
41 | (Yb174)Di,HLADR,HLADR,1,0,0
42 | (Lu175)Di,IgMs,IgMs,1,0,0
43 | (Yb176)Di,pCreb,pCreb,0,1,0
44 | (Ir191)Di,DNA1,DNA1,0,1,0
45 | (Ir193)Di,DNA2,DNA2,0,1,0


--------------------------------------------------------------------------------
/man/cytofin_homogenize.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cytofin_homogenize.R
 3 | \name{cytofin_homogenize}
 4 | \alias{cytofin_homogenize}
 5 | \title{Homogenize CyTOF channel names using a consensus antigen panel}
 6 | \usage{
 7 | cytofin_homogenize(
 8 |   metadata_path,
 9 |   panel_path,
10 |   input_data_path,
11 |   output_data_path,
12 |   prefix = "homogenized_",
13 |   verbose = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{metadata_path}{A file path leading to an .xlsx or .csv file
18 | containing a table of CyTOF file (.fcs file) names in the first column (\code{filename})
19 | and additional information about each .fcs file in subsequent columns.
20 | Columns should include \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id},
21 | \code{condition}, \code{is_anchor}, and \code{validation}.}
22 | 
23 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
24 | a table of standardized antigen panel information. Columns should include
25 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern},
26 | \code{lineage}, \code{functional}, and \code{general}.}
27 | 
28 | \item{input_data_path}{A folder directory containing the input .fcs files
29 | to be homogenized.}
30 | 
31 | \item{output_data_path}{A folder directory to which the output (i.e.
32 | homogenized) .fcs files should be written.}
33 | 
34 | \item{prefix}{A string prepended to the name of each input file to create the
35 | name of the corresponding output file (post-homogenization). Defaults to
36 | "homogenized_" (e.g. an input file named "file1.fcs" will correspond to
37 | the output file "homogenized_file1.fcs" saved in \code{output_data_path}).}
38 | 
39 | \item{verbose}{A boolean value indicating whether progress messages should be
40 | printed to the console during homogenization. Defaults to FALSE.}
41 | }
42 | \value{
43 | \code{cytofin_homogenize} doesn't return anything. Instead, it has the
44 | side-effect of saving homogenized files (in .fcs format) to the directory
45 | specified with \code{output_data_path}. Each of the saved files will contain
46 | homogenized, user-defined channels according to details specified in the
47 | file at \code{panel_path}.
48 | }
49 | \description{
50 | This function homogenizes CyTOF data (.fcs files) from heterogeneous sources
51 | according to the standard panel in a .csv file located at \code{panel_path}.
52 | }
53 | 


--------------------------------------------------------------------------------
/man/cytofin_prep_anchors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cytofin_prep_anchors.R
 3 | \name{cytofin_prep_anchors}
 4 | \alias{cytofin_prep_anchors}
 5 | \title{Prepare CyTOF controls for batch normalization across plates}
 6 | \usage{
 7 | cytofin_prep_anchors(
 8 |   metadata_path,
 9 |   panel_path,
10 |   input_data_path,
11 |   input_prefix = "homogenized_",
12 |   output_path = "none",
13 |   shift_factor = 0,
14 |   scale_factor = 0.2
15 | )
16 | }
17 | \arguments{
18 | \item{metadata_path}{A file path leading to an .xlsx or .csv file
19 | containing a table of CyTOF file (.fcs file) names. Columns should include
20 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
21 | and \code{validation}.
22 | 
23 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
24 | 
25 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
26 | a table of standardized antigen panel information. Columns should include
27 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern}, \code{lineage}, \code{functional},
28 | and \code{general}.
29 | 
30 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
31 | 
32 | \item{input_data_path}{A folder directory containing the input CyTOF files
33 | to be prepped for normalization. These files should already be homogenized,
34 | and in most cases this will be the directory to which the output
35 | .fcs files from \code{cytofin_homogenize} were written.}
36 | 
37 | \item{input_prefix}{The string that was prepended to the name of the input files
38 | of \code{cytofin_homogenize} to create their corresponding output file names.
39 | Defaults to "homogenized_".}
40 | 
41 | \item{output_path}{A file path specifying where to save the output .rds
42 | file containing the statistics calculated from this step and the concatenated
43 | .FCS files containing all cells from the generalized anchor samples. Defaults
44 | to "none", in which case no files are saved.}
45 | 
46 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
47 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
48 | 
49 | \code{new_x <- asinh(a + b*x)}.
50 | 
51 | Defaults to 0.}
52 | 
53 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
54 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
55 | 
56 | \code{new_x <- asinh(a + b*x)}.
57 | 
58 | Defaults to 0.2.}
59 | }
60 | \value{
61 | a \code{list()} of summary statistics with the following elements:
62 | \itemize{
63 | \item \strong{universal_var}:  a named numeric vector in which each entry corresponds to the
64 | universal variance of an antigen channel in the homogenized dataset
65 | \item \strong{universal_mean}:  a named numeric vector in which each entry corresponds to the
66 | universal mean of an antigen channel in the homogenized dataset
67 | \item \strong{bulk_var}:  The mean of all the channel-specific universal variances
68 | in \code{universal_var} (a scalar value)
69 | \item \strong{bulk_mean}:  The mean of all the channel-specific universal means
70 | in \code{universal_mean} (a scalar value)
71 | }
72 | }
73 | \description{
74 | This function calculates reference statistics needed for CytofIn batch normalization.
75 | Specifically, it calculates the universal mean and universal variance vectors
76 | of the generalized anchors identified in the metadata file at \code{metadata_path};
77 | in addition, it calculates the non-channel-specific bulk mean and bulk variance
78 | of the generalized anchors.
79 | }
80 | 


--------------------------------------------------------------------------------
/R/file_templates.R:
--------------------------------------------------------------------------------
  1 | #' Generate a template for a cytofin metadata file
  2 | #'
  3 | #' `cytofin_generate_metadata_template` creates a template metadata .csv file
  4 | #' (with the correct columns and dummy example data) in a specified location.
  5 | #'
  6 | #' @param file_name A string representing the name of the .csv file to be
  7 | #' saved in the directory specified by `template_path`. Defaults to
  8 | #' "template_metadata.csv"
  9 | #'
 10 | #' @param template_path A string giving the directory in which the template
 11 | #' file should be written; it is created (recursively) if it does not exist.
 12 | #' Defaults to the current working directory
 13 | #'
 14 | #' @return The value of `readr::write_csv()`, i.e. the template data.frame,
 15 | #' invisibly. Called for its side effect of writing the template file.
 16 | #'
 17 | #' @export
 18 | #'
 19 | #' @examples
 20 | #' # write the template into a scratch directory
 21 | #' my_name <- "metadata_template.csv"
 22 | #' my_path <- file.path(tempdir(), "template_folder")
 23 | #'
 24 | #' # generate the template file, which then can be edited manually
 25 | #' cytofin_generate_metadata_template(
 26 | #'    file_name = my_name,
 27 | #'    template_path = my_path
 28 | #' )
 29 | #'
 30 | cytofin_generate_metadata_template <-
 31 |   function(
 32 |     file_name = "template_metadata.csv",
 33 |     template_path = getwd()
 34 |   ) {
 35 |     # create template_path if needed
 36 |     if (!dir.exists(template_path)) {
 37 |       dir.create(template_path, showWarnings = FALSE, recursive = TRUE)
 38 |     }
 39 | 
 40 |     # dummy .fcs file names shared by the `filename` and `validation` columns
 41 |     example_files <- paste0("file_", 1:4, ".fcs")
 42 | 
 43 |     # create output data.frame
 44 |     output_frame <-
 45 |       data.frame(
 46 |         filename = example_files,
 47 |         cohort = c("cohort_1", "cohort_1", "cohort_2", "cohort_2"),
 48 |         plate_number = c("plate_1", "plate_1", "plate_2", "plate_2"),
 49 |         patient_id = c("patient_1", "patient_2", "patient_a", "patient_b"),
 50 |         condition = c("basal", "basal", "stimulation_1", "stimulation_2"),
 51 |         is_anchor = c(0, 1, 0, 1),
 52 |         validation = paste0("validation_", example_files)
 53 |       )
 54 | 
 55 |     # write the template; returns `output_frame` invisibly
 56 |     readr::write_csv(
 57 |       x = output_frame,
 58 |       file = file.path(template_path, file_name)
 59 |     )
 60 |   }
 61 | 
 62 | 
 63 | #' Generate a template for a cytofin reference panel file
 64 | #' 
 65 | #' `cytofin_generate_panel_template` creates a template reference panel .csv file 
 66 | #' (with the correct columns and dummy example data) in a specified location. 
 67 | #'
 68 | #' @param file_name A string representing the name of the .csv file to be 
 69 | #' saved in the directory specified by `template_path`. Defaults to 
 70 | #' "template_panel_info.csv"
 71 | #'
 72 | #' @param template_path File path or connection where the template file should be 
 73 | #' written. Defaults to the current working directory
 74 | #'
 75 | #' 
 76 | #' @export
 77 | #'
 78 | #' @examples
 79 | #' 
 80 | #' # specify the path where you'd like to store the template file
 81 | #' my_name <- "panel_template.csv"
 82 | #' my_path <- file.path("~", "Desktop", "template_folder")
 83 | #' 
 84 | #' # generate the template file, which then can be edited manually 
 85 | #' cytofin_generate_panel_template(
 86 | #'    file_name = my_name, 
 87 | #'    template_path = my_path
 88 | #' )
 89 | #' 
 90 | cytofin_generate_panel_template <- 
 91 |   function(
 92 |     file_name = "template_panel_info.csv", 
 93 |     template_path = getwd()
 94 |   ) { 
 95 |     
 96 |     # create template_path if needed
 97 |     if(!dir.exists(template_path)) { 
 98 |       dir.create(template_path, showWarnings = FALSE, recursive = TRUE)
 99 |     }
100 |     
101 |     #create output data.frame
102 |     output_frame <- 
103 |       data.frame(
104 |         metal_name = c("Time", "Event_length", "(Pd102)Di", "(Pd104)Di"), 
105 |         antigen_name = c("Time", "Event_length", "marker_name_1", "marker_name_2"),
106 |         antigen_pattern = c("", "", "", ""),
107 |         lineage = c(0, 0, 1, 1), 
108 |         functional = c(0, 0, 0, 1), 
109 |         general = c(0, 1, 1, 1)
110 |       )
111 |     
112 |     readr::write_csv(
113 |       x = output_frame, 
114 |       file = file.path(template_path, file_name)
115 |     )
116 |     
117 |   }
118 | 


--------------------------------------------------------------------------------
/man/cytofin_make_plots.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cytofin_make_plots.R
 3 | \name{cytofin_make_plots}
 4 | \alias{cytofin_make_plots}
 5 | \title{Make diagnostic plots to evaluate CytofIn batch normalization}
 6 | \usage{
 7 | cytofin_make_plots(
 8 |   normalization_result,
 9 |   which_rows = 1:nrow(normalization_result),
10 |   val_path = "none"
11 | )
12 | }
13 | \arguments{
14 | \item{normalization_result}{An output data.frame produced by the \code{cytofin_normalize} or
15 | \code{cytofin_normalize_nrs} function.
16 | 
17 | The following columns should be present: \code{filename},
18 | \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor}, \code{validation},
19 | \code{universal_var}, \code{anchor_mean}, \code{anchor_var}, \code{mean_b4norm}, \code{var_b4norm},
20 | \code{mean_norm}, \code{var_norm}, \code{mean_ctr_norm}, \code{var_ctr_norm}.}
21 | 
22 | \item{which_rows}{A numeric vector indicating which rows of \code{normalization_result}
23 | (i.e. which .fcs files in the combined dataset) should be used for plotting. Defaults
24 | to 1:nrow(normalization_result), which will make all possible plots.}
25 | 
26 | \item{val_path}{The folder directory containing validation (i.e. bead-normalized)
27 | .fcs files corresponding to the input .fcs files in the metadata table. (Optional).}
28 | }
29 | \value{
30 | 8 diagnostic plots are made for each input .fcs file that was batch
31 | normalized (i.e. each .fcs file represented as a row in \code{normalization_result}).
32 | From left-to-right (and top-to-bottom), these plots represent the following:
33 | \enumerate{
34 | \item The entry in the universal mean vector corresponding to each antigen in the
35 | consensus antigen panel. X-axis: antigen index in the universal mean vector.
36 | Y-axis: Arcsinh-transformed entry in the universal mean vector corresponding
37 | to each antigen.
38 | \item The mean (across all cells) antigen expression vector for the anchor
39 | associated with each input .fcs file both before and after normalization.
40 | X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
41 | anchor .fcs file.
42 | \item The mean (across all cells) antigen expression vector for each input
43 | .fcs file both before and after normalization.
44 | X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
45 | input .fcs file.
46 | \item The mean (across all cells) antigen expression vector for each "validation"
47 | (i.e. bead-normalized) .fcs file both before and after bead-normalization.
48 | This plot can be used to compare CytofIn batch normalization with gold-
49 | standard approaches. If \code{val_path} is "none", this plot will be identical to
50 | plot 3 (see above).
51 | X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
52 | validation .fcs file.
53 | \item The entry in the universal standard deviation vector corresponding to each antigen in the
54 | consensus antigen panel. X-axis: antigen index in the universal standard deviation vector.
55 | Y-axis: Arcsinh-transformed entry in the universal standard deviation vector corresponding
56 | to each antigen.
57 | \item The standard deviation (across all cells) antigen expression vector for the anchor
58 | associated with each input .fcs file both before and after normalization.
59 | X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all
60 | antigen expression values in the anchor .fcs file.
61 | \item The standard deviation (across all cells) antigen expression vector for each input
62 | .fcs file both before and after normalization.
63 | X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all
64 | antigen expression values in the input .fcs file.
65 | \item The standard deviation (across all cells) antigen expression vector for each "validation"
66 | (i.e. bead-normalized) .fcs file both before and after bead-normalization.
67 | This plot can be used to compare CytofIn batch normalization with gold-
68 | standard approaches. If \code{val_path} is "none", this plot will be identical to
69 | plot 7 (see above).
70 | X-axis: antigen index (as in plot 1). Y-axis: Standard deviation of all
71 | antigen expression values in the validation .fcs file.
72 | }
73 | }
74 | \description{
75 | When given the output data structure from \code{cytofin_normalize} or \code{cytofin_normalize_nrs},
76 | this function draws mean and variance plots for all normalized .fcs files and their
77 | associated anchors.
78 | }
79 | 


--------------------------------------------------------------------------------
/man/cytofin_normalize.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/cytofin_normalize.R
  3 | \name{cytofin_normalize}
  4 | \alias{cytofin_normalize}
  5 | \title{Batch normalize CyTOF plates from heterogeneous sources using external anchors}
  6 | \usage{
  7 | cytofin_normalize(
  8 |   metadata_path,
  9 |   panel_path,
 10 |   anchor_statistics,
 11 |   input_data_path,
 12 |   output_data_path,
 13 |   mode = c("meanshift", "meanshift_bulk", "variance", "z_score", "beadlike"),
 14 |   input_prefix = "homogenized_",
 15 |   output_prefix = "normalized_",
 16 |   shift_factor = 0,
 17 |   scale_factor = 0.2
 18 | )
 19 | }
 20 | \arguments{
 21 | \item{metadata_path}{A filepath leading to an .xlsx or .csv file
 22 | containing a table of CyTOF file (.fcs file) names. Columns should include
 23 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
 24 | and \code{validation}.
 25 | 
 26 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
 27 | 
 28 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
 29 | a table of standardized antigen panel information. Columns should include
 30 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern},
 31 | \code{lineage}, \code{functional}, and \code{general}.
 32 | 
 33 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
 34 | 
 35 | \item{anchor_statistics}{a list produced by the \code{cytofin_prep_anchors}
 36 | function or the file path to an .rds object containing anchor reference statistics.}
 37 | 
 38 | \item{input_data_path}{A folder directory containing the input CyTOF files
 39 | to be normalized. In most cases, this will be the directory to which the output
 40 | .fcs files from \code{cytofin_homogenize} were written.}
 41 | 
 42 | \item{output_data_path}{A folder directory to which the output (i.e.
 43 | batch normalized/batch corrected) .fcs files should be written.}
 44 | 
 45 | \item{mode}{A string indicating which transformation function should be used
 46 | for batch normalization ("meanshift", "meanshift_bulk", "variance", "z_score",
 47 | or "beadlike").}
 48 | 
 49 | \item{input_prefix}{The string that was appended to the name of the input files
 50 | of \code{cytofin_homogenize} to create their corresponding output file names.
 51 | Defaults to "homogenized_".}
 52 | 
 53 | \item{output_prefix}{A string to be appended to the name of each input file
 54 | to create the name of the corresponding output file (post-normalization).
 55 | Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to
 56 | the output file "normalized_file1.fcs" saved in \code{output_data_path}).}
 57 | 
 58 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
 59 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
 60 | 
 61 | \code{new_x <- asinh(a + b * x)}.
 62 | 
 63 | Defaults to 0.}
 64 | 
 65 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
 66 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
 67 | 
 68 | \code{new_x <- asinh(a + b * x)}.
 69 | 
 70 | Defaults to 0.2.}
 71 | }
 72 | \value{
 73 | Batch-normalized .fcs files are saved in the directory specified by
 74 | \code{output_data_path}.
 75 | 
 76 | In addition, a data.frame containing information about
 77 | each input .fcs file (that can be used for plotting with \code{cytofin_make_plots})
 78 | is returned with the following columns:
 79 | \itemize{
 80 | \item All of the columns in the input metadata table (located at \code{metadata_path})
 81 | \item \strong{universal_mean}: the universal mean vector to which all files are adjusted
 82 | (will be identical for all input .fcs files)
 83 | \item \strong{universal_var}: the universal variance vector to which all files are adjusted
 84 | (will be identical for all input .fcs files)
 85 | \item \strong{anchor_mean}: the mean (across all cells) vector for the anchor file associated
 86 | with each input .fcs file (i.e. the anchor located on the same plate as the
 87 | input .fcs file) before batch normalization.
 88 | \item \strong{anchor_var}: the variance (across all cells) vector for the anchor file associated
 89 | with each input .fcs file (i.e. the anchor located on the same plate as the
 90 | input .fcs file)
 91 | \item \strong{mean_b4norm}: the mean (across all cells) vector of the input .fcs file
 92 | before batch normalization.
 93 | \item \strong{var_b4norm}: the variance (across all cells) vector of the input .fcs file
 94 | before batch normalization.
 95 | \item \strong{mean_norm}: the mean (across all cells) vector of the input .fcs file
 96 | after batch normalization.
 97 | \item \strong{var_norm}: the variance (across all cells) vector of the input .fcs file
 98 | after batch normalization.
 99 | \item \strong{anchor_mean_norm}: the mean (across all cells) vector for the anchor file associated
100 | with each input .fcs file (i.e. the anchor located on the same plate as the
101 | input .fcs file) after batch normalization.
102 | \item \strong{anchor_var_norm}: the variance (across all cells) vector for the anchor file associated
103 | with each input .fcs file (i.e. the anchor located on the same plate as the
104 | input .fcs file) after batch normalization.
105 | }
106 | }
107 | \description{
108 | This function batch normalizes CyTOF data from multiple plates (from one or more
109 | experimental cohorts) using external (i.e. "generalized") anchors.
110 | }
111 | 


--------------------------------------------------------------------------------
/man/cytofin_normalize_nrs.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/cytofin_normalize_nrs.R
  3 | \name{cytofin_normalize_nrs}
  4 | \alias{cytofin_normalize_nrs}
  5 | \title{Batch normalize CyTOF plates from heterogeneous sources using stable channels}
  6 | \usage{
  7 | cytofin_normalize_nrs(
  8 |   metadata_path,
  9 |   panel_path,
 10 |   input_data_path,
 11 |   output_data_path,
 12 |   input_prefix = "homogenized_",
 13 |   output_prefix = "normalized_",
 14 |   shift_factor = 0,
 15 |   scale_factor = 0.2,
 16 |   nchannels = 3,
 17 |   make_plot = FALSE
 18 | )
 19 | }
 20 | \arguments{
 21 | \item{metadata_path}{A filepath leading to an .xlsx or .csv file
 22 | containing a table of CyTOF file (.fcs file) names. Columns should include
 23 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
 24 | and \code{validation}.
 25 | 
 26 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
 27 | 
 28 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
 29 | a table of standardized antigen panel information. Columns should include
 30 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern},
 31 | \code{lineage}, \code{functional}, and \code{general}.
 32 | 
 33 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
 34 | 
 35 | \item{input_data_path}{A folder directory containing the input CyTOF files
 36 | to be normalized. In most cases, this will be the directory to which the output
 37 | .fcs files from \code{cytofin_homogenize} were written.}
 38 | 
 39 | \item{output_data_path}{A folder directory to which the output (i.e.
 40 | batch normalized/batch corrected) .fcs files should be written.}
 41 | 
 42 | \item{input_prefix}{The string that was appended to the name of the input files
 43 | of \code{cytofin_homogenize} to create their corresponding output file names.
 44 | Defaults to "homogenized_".}
 45 | 
 46 | \item{output_prefix}{A string to be appended to the name of each input file
 47 | to create the name of the corresponding output file (post-normalization).
 48 | Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to
 49 | the output file "normalized_file1.fcs" saved in \code{output_data_path}).}
 50 | 
 51 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
 52 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
 53 | 
 54 | \code{new_x <- asinh(a + b * x)}.
 55 | 
 56 | Defaults to 0.}
 57 | 
 58 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
 59 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
 60 | 
 61 | \code{new_x <- asinh(a + b * x)}.
 62 | 
 63 | Defaults to 0.2.}
 64 | 
 65 | \item{nchannels}{An integer representing the number of most stable channels to
 66 | use during batch normalization. Defaults to 3.}
 67 | 
 68 | \item{make_plot}{A boolean value indicating if a plot depicting the non-
 69 | redundancy scores of each marker in each .fcs file being batch normalized
 70 | should be plotted as a side-effect of the function call. Defaults to FALSE.}
 71 | }
 72 | \value{
 73 | Batch-normalized .fcs files are saved in the directory specified by
 74 | \code{output_data_path}.
 75 | 
 76 | In addition, a data.frame containing information about
 77 | each input .fcs file (that can be used for plotting with \code{cytofin_make_plots})
 78 | is returned with the following columns:
 79 | \itemize{
 80 | \item All of the columns in the input metadata table (located at \code{metadata_path})
 81 | \item \strong{universal_mean}: the universal mean vector to which all files are adjusted
 82 | (will be identical for all input .fcs files)
 83 | \item \strong{universal_var}: the universal variance vector to which all files are adjusted
 84 | (will be identical for all input .fcs files)
 85 | \item \strong{anchor_mean}: the mean (across all cells) vector for the anchor file associated
 86 | with each input .fcs file (i.e. the anchor located on the same plate as the
 87 | input .fcs file)
 88 | \item \strong{anchor_var}: the variance (across all cells) vector for the anchor file associated
 89 | with each input .fcs file (i.e. the anchor located on the same plate as the
 90 | input .fcs file)
 91 | \item \strong{mean_b4norm}: the mean (across all cells) vector of the input .fcs file
 92 | before batch normalization.
 93 | \item \strong{var_b4norm}: the variance (across all cells) vector of the input .fcs file
 94 | before batch normalization.
 95 | \item \strong{mean_norm}: the mean (across all cells) vector of the input .fcs file
 96 | after batch normalization.
 97 | \item \strong{var_norm}: the variance (across all cells) vector of the input .fcs file
 98 | after batch normalization.
 99 | \item \strong{anchor_mean_norm}: the mean (across all cells) vector for the anchor file associated
100 | with each input .fcs file (i.e. the anchor located on the same plate as the
101 | input .fcs file) after batch normalization.
102 | \item \strong{anchor_var_norm}: the variance (across all cells) vector for the anchor file associated
103 | with each input .fcs file (i.e. the anchor located on the same plate as the
104 | input .fcs file) after batch normalization.
105 | }
106 | }
107 | \description{
108 | This function batch normalizes CyTOF data from multiple plates (from one or more
109 | experimental cohorts) by computing the non-redundancy score (NRS) for each
110 | channel in the dataset, then using the most redundant (i.e. the "most stable")
111 | channels as a reference for batch normalization.
112 | }
113 | 


--------------------------------------------------------------------------------
/R/cytofin_homogenize.R:
--------------------------------------------------------------------------------
  1 | #' Homogenize CyTOF channels names using a consensus antigen panel
  2 | #'
  3 | #' This function homogenizes CyTOF data (.fcs files) from heterogeneous sources 
  4 | #' according to the standard panel in a .csv file located at `panel_path.`
  5 | #'
  6 | #' @param metadata_path A file path leading to an .xlsx or .csv file 
  7 | #' containing a table of CyTOF file (.fcs file) names in the first column (`filename`)
  8 | #' and additional information about each .fcs file in subsequent columns. 
  9 | #' Columns should include `filename`, `cohort`, `plate_number`, `patient_id`, 
 10 | #' `condition`, `is_anchor`, and `validation`. 
 11 | #' 
 12 | #' @param panel_path A file path leading to an .xlsx or .csv file containing 
 13 | #' a table of standardized antigen panel information. Columns should include 
 14 | #' `metal_name`, `antigen_name`, `antigen_pattern`, 
 15 | #' `lineage`, `functional`, and `general`. 
 16 | #' 
 17 | #' @param input_data_path A folder directory containing the input .fcs files
 18 | #' to be homogenized.
 19 | #' 
 20 | #' @param output_data_path A folder directory to which the output (i.e. 
 21 | #' homogenized) .fcs files should be written.
 22 | #' 
 23 | #' @param prefix A string appended to the name of each input file to create the 
 24 | #' name of the corresponding output file (post-homogenization). Defaults to 
 25 | #' "homogenized_" (e.g. an input file named "file1.fcs" will correspond to 
 26 | #' the output file "homogenized_file1.fcs" saved in `output_data_path`). 
 27 | #' 
 28 | #' @param verbose A boolean value indicating whether progress message should be 
 29 | #' printed to the console during homogenization. Defaults to FALSE.
 30 | #' 
 31 | #' @return `cytofin_homogenize` doesn't return anything. Instead, it has the 
 32 | #' side-effect of saving homogenized files (in .fcs format) to the directory 
 33 | #' specified with `output_data_path`. Each of the saved files will contain 
 34 | #' homogenized, user-defined channels according to details specified in the 
 35 | #' file at `panel_path.`
 36 | #' 
 37 | #' @export
 38 | #' 
 39 | cytofin_homogenize <- 
 40 |   function(
 41 |     metadata_path, 
 42 |     panel_path, 
 43 |     input_data_path, 
 44 |     output_data_path, 
 45 |     prefix = "homogenized_",
 46 |     verbose = FALSE
 47 |   ) {
 48 |     
 49 |     # create output directory for homogenized .fcs files
 50 |     dir.create(output_data_path, showWarnings = FALSE, recursive = TRUE)
 51 |     
 52 |     # read metadata table
 53 |     md <- cytofin_read_metadata(metadata_path)
 54 |     
 55 |     # read reference panel information
 56 |     ref_panel <- cytofin_read_panel_info(panel_path)
 57 |     
 58 |     # for all files in the input directory
 59 |     for (file in md$filename) {
 60 |       # read in FCS file
 61 |       sink(file = "/dev/null")
 62 |       fcs_raw <- 
 63 |         flowCore::read.FCS(
 64 |           filename = file.path(input_data_path, file), 
 65 |           transformation = FALSE, 
 66 |           truncate_max_range = FALSE
 67 |         )
 68 |       sink()
 69 |       if(verbose) {
 70 |         cat("filename:", file, "\n")
 71 |       }
 72 |       
 73 |       # parse panel in FCS files
 74 |       data_panel_antigens <- 
 75 |         flowCore::pData(flowCore::parameters(fcs_raw))$desc
 76 |       
 77 |       data_panel_metals <- 
 78 |         flowCore::pData(flowCore::parameters(fcs_raw))$name
 79 |       
 80 |       # for each channel in the reference panel
 81 |       for (i in 1:length(ref_panel$antigen_name)) {
 82 |         tryCatch(
 83 |           {
 84 |             # extract the antigen name in the reference and its corresponding regex
 85 |             ref_antigen <- ref_panel$antigen_name[[i]]
 86 |             ref_antigen_regex <- ref_panel$antigen_pattern[[i]]
 87 |             
 88 |             # Find the index of the data antigen corresponding to the reference antigen
 89 |             data_antigen_index <- 
 90 |               stringr::str_detect(
 91 |                 string = tidyr::replace_na(data_panel_antigens,''), 
 92 |                 pattern = ref_antigen_regex
 93 |               )
 94 |             # store the name of the data antigen for reporting
 95 |             data_antigen <- data_panel_antigens[data_antigen_index]
 96 |             
 97 |             # if there was a match with the reference antigen's regex
 98 |             if (max(data_antigen_index) == 1) {
 99 |               # rename the data antigen in the flowFrame using the reference antigen name
100 |               flowCore::pData(flowCore::parameters(fcs_raw))$desc[data_antigen_index] <- 
101 |                 ref_antigen
102 |               # otherwise
103 |             }
104 |             
105 |             # report what was matched if verbose
106 |             if(verbose) {
107 |               cat(
108 |                 "matched data antigen: ",
109 |                 data_antigen,
110 |                 "\nwith the reference antigen: ",
111 |                 ref_antigen,
112 |                 "\nusing the regex: ",
113 |                 ref_antigen_regex,
114 |                 "\n"
115 |               )
116 |             }
117 |           }, 
118 |           # if an error is encountered, print some information
119 |           error = 
120 |             function(e) {
121 |               txt <- 
122 |                 paste(
123 |                   md$filename, "item", i , 
124 |                   "data_antigen", data_antigen, "ref_antigen",
125 |                   ref_antigen, "ref_antigen_pattern", ref_antigen_pattern,
126 |                   as.character(e)
127 |                 )
128 |               cat(txt,"\n")
129 |             }
130 |         )
131 |       }
132 |       
133 |       # finalize the fcs file to write as output
134 |       fcs <- homogenize_flowFrame(fcs_raw, ref_panel)
135 |       
136 |       # write output fcs file to the specified directory
137 |       filename <- file.path(output_data_path, paste0(prefix, file))
138 |       flowCore::write.FCS(fcs, filename)
139 |       
140 |     }
141 |     
142 |   }
143 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | #' Find the extension for a file
  2 | #'
  3 | #' @param filename A string representing the name of a file in its local directory
  4 | #'
  5 | #' @return The the file extension of `filename` 
  6 | #'
  7 | #' @examples
  8 | #' \dontrun{
  9 | #' # example file name
 10 | #' my_filename <- "my_file.txt"
 11 | #' 
 12 | #' # find and print the extension
 13 | #' my_extension <- getExtension(my_filename)
 14 | #' print(my_extension)
 15 | #' }
 16 | get_extension <- function(filename) {
 17 |   ex <- strsplit(basename(filename), split="\\.")[[1]]
 18 |   return(ex[[-1]])
 19 | }
 20 | 
 21 | 
 22 | #' Alter a flowFrame to only include data from channels in a reference panel
 23 | #'
 24 | #' @param fcs_raw A flowFrame containing unprocessed CyTOF data
 25 | #' @param ref_panel A data.frame representing the reference panel data for a 
 26 | #' cytofin analysis.
 27 | #'
 28 | #' @return a homogenized flowFrame
 29 | #'
 30 | homogenize_flowFrame <- function(fcs_raw, ref_panel) {
 31 |   
 32 |   #extract some needed values from the raw fcs data and the reference panel
 33 |   ref_markers <- ref_panel$antigen_name
 34 |   ref_metals <- ref_panel$metal_name
 35 |   
 36 |   panel_fcs <- flowCore::pData(flowCore::parameters(fcs_raw))
 37 |   panel_markers <- panel_fcs$desc
 38 |   panel_metals <- as.character(panel_fcs$name)
 39 |   panel_rownames <- row.names(panel_fcs)
 40 |   
 41 |   # create new flowFrame to be modified
 42 |   fcs <- fcs_raw
 43 |   
 44 |   # only keep the markers/metals that are present in the reference marker list
 45 |   panel_markers_to_keep <- intersect(panel_markers, ref_markers)
 46 |   panel_metals_to_keep <- panel_metals[panel_markers %in% panel_markers_to_keep]
 47 |   
 48 |   # create dictionary to look up which metals from the reference panel 
 49 |   # correspond to shared antigens with the FCS file's panel (which may be on 
 50 |   # different metals)
 51 |   names(ref_metals) <- ref_markers
 52 |   
 53 |   # perform lookup to "rename" metals in the FCS file's panel to the standard
 54 |   # metal name in the reference
 55 |   new_panel_metals <- as.character(ref_metals[panel_markers_to_keep])
 56 |   
 57 |   # remove columns not present in ref_panel from final fcs file
 58 |   expr <- flowCore::exprs(fcs)
 59 |   new_expr <- expr[, panel_metals_to_keep] 
 60 |   
 61 |   # rename metals using the looked-up values
 62 |   colnames(new_expr) <- new_panel_metals
 63 |   
 64 |   # sort columns into the order in the reference panel
 65 |   final_expr <- new_expr[ , ref_metals]
 66 |   flowCore::exprs(fcs) <- final_expr
 67 |   
 68 |   # return result 
 69 |   return(fcs)
 70 |   
 71 | }
 72 | 
 73 | 
 74 | #' Read in a cytofin metadata file
 75 | #' 
 76 | #' This function reads a cytofin metadata file from a connection 
 77 | #' that points to a .csv or a .xlsx file
 78 | #'
 79 | #' @param metadata_path A filepath leading to an .xlsx or .csv file 
 80 | #' containing a table of CyTOF file (.fcs file) names. Columns should include
 81 | #' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `population`, 
 82 | #' and `validation`. TO DO: Change the names of these columns to more descriptive
 83 | #' names and make sure that they are all actually needed. 
 84 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
 85 | #'
 86 | #' @return A data.frame containing the metadata information in the 
 87 | #' file stored at `metadata_path`. 
 88 | #'
 89 | #' @examples
 90 | #' \dontrun{
 91 | #' my_path <- file.path("~", "foo", "bar", "metadata.csv")
 92 | #' my_metadata <- cytofin:::cytofin_read_metadata(my_path)
 93 | #' }
 94 | cytofin_read_metadata <- function(metadata_path) {
 95 |   
 96 |   if (get_extension(metadata_path) == "xlsx") {
 97 |     md <- readxl::read_excel(metadata_path)
 98 |   } else if (get_extension(metadata_path) == "csv") {
 99 |     md <- read.csv(metadata_path)
100 |   } else { 
101 |     # throw error if the wrong kind of file is given
102 |     stop("metadata_path must point to an .xlsx or .csv file")
103 |   }
104 |   
105 |   # trim whitespace from all strings in metadata
106 |   md <- data.frame(lapply(md, trimws), stringsAsFactors = FALSE)
107 |   
108 |   return(md)
109 | }
110 | 
111 | 
#' Read in cytofin reference panel information
#' 
#' This function reads a cytofin reference panel table from a .csv or an 
#' .xlsx file.
#'
#' @param panel_path A file path leading to an .xlsx or .csv file containing 
#' a table of standardized antigen panel information. Columns should include 
#' `metal_name`, `antigen_name`, `antigen_pattern`, `lineage`, `functional`, 
#' and `general`. 
#' See the vignette for details: \code{vignette("cytofin", package = "cytofin")}
#'
#' @return A data.frame containing the reference panel information in the 
#' file stored at `panel_path`. `trimws()` is applied to every column, so 
#' surrounding whitespace is removed from all entries. 
#'
#' @examples
#' \dontrun{
#' my_path <- file.path("~", "foo", "bar", "panel.csv")
#' my_panel <- cytofin:::cytofin_read_panel_info(my_path)
#' }
cytofin_read_panel_info <- function(panel_path) {
  
  # decide how to read the file based on its extension
  extension <- get_extension(panel_path)
  
  # throw error if the wrong kind of file is given
  if (!(extension == "xlsx" || extension == "csv")) {
    stop("panel_path must point to an .xlsx or .csv file")
  }
  
  raw_panel <- 
    if (extension == "xlsx") {
      readxl::read_excel(panel_path)
    } else {
      read.csv(panel_path)
    }
  
  # trim whitespace from every column of the reference panel
  trimmed_columns <- lapply(raw_panel, trimws)
  
  return(data.frame(trimmed_columns, stringsAsFactors = FALSE))
}
148 | 
149 | 
#' Reverse an arcsinh transformation
#'
#' Inverts the transformation `new_x <- asinh(a + b * x)`, where `a` is 
#' `shift_factor` and `b` is `scale_factor`.
#'
#' @param x A numeric vector of arcsinh-transformed values.
#' 
#' @param shift_factor The scalar value `a` in the following equation used to 
#' transform CyTOF raw data ion counts using the hyperbolic arcsinh function:  
#'    `new_x <- asinh(a + b * x)`.
#' 
#' @param scale_factor The scalar value `b` in the following equation used to 
#' transform CyTOF raw data ion counts using the hyperbolic arcsinh function:  
#'    `new_x <- asinh(a + b * x)`.
#'
#' @return A numeric vector containing the values of `x` after the arcsinh 
#' transformation has been reversed, i.e. `(sinh(x) - a) / b`.
#' 
#'
rev_asinh <- function(x, shift_factor, scale_factor) {
  
  # undo asinh() first, then remove the shift, then remove the scaling
  unshifted <- sinh(x) - shift_factor
  unshifted / scale_factor
  
}
172 | 


--------------------------------------------------------------------------------
/R/cytofin_prep_anchors.R:
--------------------------------------------------------------------------------
  1 | #' Prepare CyTOF controls for batch normalization across plates
  2 | #'
  3 | #' This function calculates reference statistics needed for CytofIn batch normalization. 
  4 | #' Specifically, it calculates the universal mean and universal variance vectors
  5 | #' of the generalized anchors identified in the metadata file at `metadata_path`; 
  6 | #' in addition, it calculates the non-channel-specific bulk mean and bulk variance
  7 | #' of the generalized anchors.
  8 | #'
  9 | #' @param metadata_path A file path leading to an .xlsx or .csv file 
 10 | #' containing a table of CyTOF file (.fcs file) names. Columns should include
 11 | #' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `population`, 
 12 | #' and `validation`.
 13 | #' 
 14 | #' See the vignette for details: \code{vignette("help", package = "cytofin")} 
 15 | #' 
 16 | #' @param panel_path A file path leading to an .xlsx or .csv file containing 
 17 | #' a table of standardized antigen panel information. Columns should include 
 18 | #' `metal_name`, `antigen_name`, `antigen_pattern`, `lineage`, `functional`, 
 19 | #' and `general`.
 20 | #'  
 21 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
 22 | #' 
 23 | #' @param input_data_path A folder directory containing the input CyTOF files
 24 | #' to be prepped for normalization. These files should already be homogenized,  
 25 | #' and in most cases this will be the directory to which the output
 26 | #' .fcs files from `cytofin_homogenize` were written.
 27 | #' 
 28 | #' @param input_prefix The string that was appended to the name of the input files 
 29 | #' of `cytofin_homogenize` to create their corresponding output file names. 
 30 | #' Defaults to "homogenized_".
 31 | #' 
 32 | #' @param output_path A file path specifying where to save the output .rds
 33 | #' file containing the statistics calculated from this step and the concatenated 
 34 | #' .FCS files containing all cells from the generalized anchor samples. Defaults
 35 | #' to "none", in which case no files are saved.
 36 | #'
 37 | #' @param shift_factor The scalar value `a` in the following equation used to 
 38 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:  
 39 | #'    
 40 | #'    `new_x <- asinh(a + b*x)`.
 41 | #' 
 42 | #' Defaults to 0. 
 43 | #' 
 44 | #' @param scale_factor The scalar value `b` in the following equation used to 
 45 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:  
 46 | #'    
 47 | #'    `new_x <- asinh(a + b*x)`.
 48 | #' 
 49 | #' Defaults to 0.2.
 50 | #' 
 51 | #' @return a `list()` of summary statistics with the following elements:
 52 | #' * __universal_var__:  a named numeric vector in which each entry corresponds to the 
 53 | #' universal variance of an antigen channel in the homogenized dataset
 54 | #' * __universal_mean__:  a named numeric vector in which each entry corresponds to the 
 55 | #' universal mean of an antigen channel in the homogenized dataset
 56 | #' * __bulk_var__:  The mean of all the channel-specific universal variances 
 57 | #' in `universal_var` (a scalar value)
 58 | #' * __bulk_mean__:  The mean of all the channel-specific universal means 
 59 | #' in `universal_mean` (a scalar value)
 60 | #' 
 61 | #' 
 62 | #' @export
 63 | #' 
 64 | 
 65 | cytofin_prep_anchors <- function(
 66 |   metadata_path, 
 67 |   panel_path, 
 68 |   input_data_path, 
 69 |   input_prefix = "homogenized_",
 70 |   output_path = "none", 
 71 |   shift_factor = 0, 
 72 |   scale_factor = 0.2
 73 | ) {
 74 |   
 75 |   # create output directory if needed
 76 |   dir.create(output_path, showWarnings = FALSE, recursive = TRUE)
 77 |   
 78 |   # read metadata table and select only the anchor samples
 79 |   md_control <- 
 80 |     dplyr::filter(cytofin_read_metadata(metadata_path), is_anchor == 1)
 81 |   
 82 |   # read reference panel information
 83 |   ref_panel <- cytofin_read_panel_info(panel_path)
 84 | 
 85 |   # extract character vectors of the lineage markers' metals and 
 86 |   # the functional markers' metals
 87 |   lineage_markers <- ref_panel$metal_name[ref_panel$lineage == 1]
 88 |   functional_markers <- ref_panel$metal_name[ref_panel$functional == 1]
 89 |   all_markers <- c(lineage_markers, functional_markers)
 90 |   
 91 |   # read in the input data as a flowSet
 92 |   fcs_control <- 
 93 |     flowCore::read.flowSet(
 94 |       file.path(input_data_path, paste0(input_prefix, md_control$filename)), 
 95 |       transformation = FALSE, 
 96 |       truncate_max_range = FALSE
 97 |     )
 98 |   
 99 |   # arcsinh-transform all data
100 |   asinh_transform <- flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
101 |   col_names <- flowCore::colnames(fcs_control)
102 |   expr_untransformed <- flowCore::fsApply(fcs_control, flowCore::exprs)
103 |   transform_list <- flowCore::transformList(from = col_names, tfun = asinh_transform)
104 |   fcs_asinh <- flowCore::transform(fcs_control, transform_list)
105 |   expr <- flowCore::fsApply(fcs_asinh, flowCore::exprs)
106 |   
107 |   # calculate universal mean and variance
108 |   universal_mean <- apply(expr, 2, mean)
109 |   universal_var <- apply(expr, 2, var)
110 |   
111 |   # calculate the mean and variance of all the channel-specific universal means
112 |   # and variances, respectively
113 |   bulk_var <- mean(universal_var[all_markers])
114 |   bulk_mean <- mean(universal_mean[all_markers])
115 |   
116 |   # collate all reference statistics into a list
117 |   result <-
118 |     list(
119 |       universal_var = universal_var, 
120 |       universal_mean = universal_mean,
121 |       bulk_var = bulk_var, 
122 |       bulk_mean = bulk_mean
123 |     )
124 |   
125 |   # if the user wants to store intermediate files
126 |   if (output_path != "none") {
127 |     
128 |     # save universal mean and variance information
129 |     readr::write_rds(
130 |       x = result, 
131 |       file = file.path(output_path, "anchor_statistics.rds")
132 |     )
133 |     
134 |     # write concatenated control file (asinh-transformed)
135 |     gc()
136 |     filename <- file.path(output_path, "concatenated_control.fcs")
137 |     ff <- flowCore::flowFrame(expr)
138 |     data_panel_name <- flowCore::pData(flowCore::parameters(fcs_control[[1]]))$desc
139 |     flowCore::pData(flowCore::parameters(ff))$desc <- data_panel_name  
140 |     flowCore::write.FCS(ff, filename)
141 |     
142 |     # write concatenated control file (untransformed)
143 |     gc()
144 |     filename <- file.path(output_path, "concatenated_control_untransformed.fcs")
145 |     ff <- flowCore::flowFrame(expr_untransformed)
146 |     data_panel_name <- flowCore::pData(flowCore::parameters(fcs_control[[1]]))$desc
147 |     flowCore::pData(flowCore::parameters(ff))$desc <- data_panel_name  
148 |     flowCore::write.FCS(ff, filename)
149 |   }
150 |     
151 |     return(result)
152 |   }
153 |   
154 | 


--------------------------------------------------------------------------------
/R/cytofin_normalize.R:
--------------------------------------------------------------------------------
  1 | #' Batch normalize CyTOF plates from heterogeneous sources using external anchors
  2 | #'
  3 | #' This function batch normalizes CyTOF data from multiple plates (from one or more 
  4 | #' experimental cohorts) using external (i.e. "generalized") anchors.
  5 | #'
  6 | #' @param metadata_path A filepath leading to an .xlsx or .csv file 
  7 | #' containing a table of CyTOF file (.fcs file) names. Columns should include
  8 | #' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `is_anchor`, 
  9 | #' and `validation`. 
 10 | #' 
 11 | #' See the vignette for details: \code{vignette("help", package = "cytofin")} 
 12 | #' 
 13 | #' @param panel_path A file path leading to an .xlsx or .csv file containing 
 14 | #' a table of standardized antigen panel information. Columns should include 
 15 | #' `metal_name`, `antigen_name`, `antigen_pattern`, 
 16 | #' `lineage`, `functional`, and `general`. 
 17 | #' 
 18 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
 19 | #' 
 20 | #' @param anchor_statistics a list produced by the `cytofin_prep_anchors`
 21 | #' function or the file path to an .rds object containing anchor reference statistics.
 22 | #' 
 23 | #' @param input_data_path A folder directory containing the input CyTOF files
 24 | #' to be normalized. In most cases, this will be the directory to which the output
 25 | #' .fcs files from `cytofin_homogenize` were written.
 26 | #' 
 27 | #' @param output_data_path A folder directory to which the output (i.e. 
 28 | #' batch normalized/batch corrected) .fcs files should be written.
 29 | #' 
 30 | #' @param mode A string indicating which transformation function should be used 
 31 | #' for batch normalization ("meanshift", "meanshift_bulk", "variance", "z_score", 
 32 | #' or "beadlike").
 33 | #' 
 34 | #' @param input_prefix The string that was appended to the name of the input files 
 35 | #' of `cytofin_homogenize` to create their corresponding output file names. 
 36 | #' Defaults to "homogenized_".
 37 | #' 
 38 | #' @param output_prefix A string to be appended to the name of each input file 
 39 | #' to create the name of the corresponding output file (post-homogenization). 
 40 | #' Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to 
 41 | #' the output file "normalized_file1.fcs" saved in `output_data_path`).
 42 | #' 
 43 | #' @param shift_factor The scalar value `a` in the following equation used to 
 44 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:  
 45 | #'    
 46 | #'    `new_x <- asinh(a + b * x)`.
 47 | #' 
 48 | #' Defaults to 0. 
 49 | #' 
 50 | #' @param scale_factor The scalar value `b` in the following equation used to 
 51 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:  
 52 | #'    
 53 | #'    `new_x <- asinh(a + b * x)`.
 54 | #' 
 55 | #' Defaults to 0.2. 
 56 | #'
 57 | #' @return Batch-normalized .fcs files are saved in the directory specified by 
 58 | #' `output_data_path`. 
 59 | #' 
 60 | #' In addition, a data.frame containing information about 
 61 | #' each input .fcs file (that can be used for plotting with `cytofin_make_plots`)
 62 | #' is returned with the following columns: 
 63 | #'    * All of the columns in the input metadata table (located at `metadata_path`)
 64 | #'    * __universal_mean__: the universal mean vector to which all files are adjusted 
 65 | #'    (will be identical for all input .fcs files)
 66 | #'    * __universal_var__: the universal mean vector to which all files are adjusted
 67 | #'    (will be identical for all input .fcs files)
 68 | #'    * __anchor_mean__: the mean (across all cells) vector for the anchor file associated
 69 | #'    with each input .fcs file (i.e. the anchor located on the same plate as the 
 70 | #'    input .fcs file) before batch normalization.
 71 | #'    * __anchor_var__: the variance (across all cells) vector for the anchor file associated
 72 | #'    with each input .fcs file (i.e. the anchor located on the same plate as the 
 73 | #'    input .fcs file)
 74 | #'    * __mean_b4norm__: the mean (across all cells) vector of the input .fcs file 
 75 | #'    before batch normalization. 
 76 | #'    * __var_b4norm__: the variance (across all cells) vector of the input .fcs file 
 77 | #'    before batch normalization. 
 78 | #'    * __mean_norm__: the mean (across all cells) vector of the input .fcs file 
 79 | #'    after batch normalization. 
 80 | #'    * __var_norm__: the variance (across all cells) vector of the input .fcs file 
 81 | #'    after batch normalization.
 82 | #'    * __anchor_mean_norm__: the mean (across all cells) vector for the anchor file associated
 83 | #'    with each input .fcs file (i.e. the anchor located on the same plate as the 
 84 | #'    input .fcs file) after batch normalization.
 85 | #'    * __anchor_var_norm__: the variance (across all cells) vector for the anchor file associated
 86 | #'    with each input .fcs file (i.e. the anchor located on the same plate as the 
 87 | #'    input .fcs file) after batch normalization.
 88 | #' 
 89 | #' @export
 90 | #' 
 91 | cytofin_normalize <-
 92 |   function(
 93 |     metadata_path,
 94 |     panel_path,
 95 |     anchor_statistics, 
 96 |     input_data_path,
 97 |     output_data_path,
 98 |     mode = c("meanshift", "meanshift_bulk", "variance", "z_score", "beadlike"),
 99 |     input_prefix = "homogenized_", 
100 |     output_prefix = "normalized_", 
101 |     shift_factor = 0, 
102 |     scale_factor = 0.2
103 |   ) {
104 |     
105 |     # create output directory
106 |     dir.create(output_data_path, showWarnings = FALSE, recursive = TRUE)
107 |     
108 |     #read metadata table
109 |     md <- cytofin_read_metadata(metadata_path)
110 |     
111 |     # separate metadata for anchor samples
112 |     md_control <- dplyr::filter(md, is_anchor == 1)
113 | 
114 |     # if anchor_statistics is a file path
115 |     if (is.character(anchor_statistics)) { 
116 |       anchor_statistics_list <- readr::read_rds(anchor_statistics)
117 |       # else if anchor_statistics is a list 
118 |     } else if (is.list(anchor_statistics)) { 
119 |       anchor_statistics_list <- anchor_statistics
120 |     } else { 
121 |       stop("anchor_statistics must be either a character vector (file path) or a list")
122 |     }
123 |     
124 |     # extract needed values from anchor_statistics_list
125 |     universal_var <- anchor_statistics_list$universal_var
126 |     universal_mean <- anchor_statistics_list$universal_mean
127 |     bulk_var <- anchor_statistics_list$bulk_var
128 |     bulk_mean <- anchor_statistics_list$bulk_mean
129 |     
130 |     # read in standardized panel
131 |     ref_panel <- cytofin_read_panel_info(panel_path = panel_path)
132 |     
133 |     # compile list of all markers to keep during analysis
134 |     lineage_markers <- 
135 |       as.character(ref_panel$metal_name[ref_panel$lineage == 1])
136 |     
137 |     functional_markers <- 
138 |       as.character(ref_panel$metal_name[ref_panel$functional == 1])
139 |     
140 |     all_markers <- c(lineage_markers, functional_markers)
141 |     
142 |     # create transformation functions
143 |     norm_1 <- function(x) {
144 |       y <- universal_mean[all_markers]
145 |       z <- x
146 |       m <- match(names(y), names(x))
147 |       z[m] <- z[m] - anchor_mean[m] + universal_mean[m]
148 |       return(z)
149 |     } #meanshift
150 |     
151 |     norm_2 <- function(x) {
152 |       y <- universal_mean[all_markers]
153 |       z <- x
154 |       m <- match(names(y), names(x))
155 |       z[m] <- z[m] - mean(anchor_mean[m]) + mean(universal_mean[m])
156 |       return(z)
157 |     } #meanshift bulk
158 |     
159 |     norm_3 <- function(x) {
160 |       y <- universal_mean[all_markers]
161 |       z <- x
162 |       m <- match(names(y), names(x))
163 |       z[m] <- 
164 |         (z[m] - anchor_mean[m] + universal_mean[m]) * sqrt(universal_var[m])/sqrt(anchor_var[m])
165 |       return(z)
166 |     } #variance
167 |     
168 |     norm_4 <- function(x) {
169 |       y <- universal_mean[all_markers]
170 |       z <- x
171 |       m <- match(names(y), names(x))
172 |       z[m] <- 
173 |         (z[m] - anchor_mean[m]) * sqrt(universal_var[m])/sqrt(anchor_var[m]) + universal_mean[m]
174 |       return(z)
175 |     } #z-score
176 |     
177 |     norm_5 <- function(x) {
178 |       y <- universal_mean[all_markers]
179 |       z <- x
180 |       m <- match(names(y), names(x))
181 |       z[m] <- z[m] * lm(universal_mean[m] ~ anchor_mean[m])$coefficient[[2]]
182 |       return(z)
183 |     } #beadlike
184 | 
185 |     # find the user-specified normalization function
186 |     if (mode == "meanshift") {
187 |       norm <- norm_1 
188 |     } else if (mode == "meanshift_bulk") {
189 |       norm <- norm_2
190 |     } else if (mode == "variance") {
191 |       norm <- norm_3
192 |     } else if (mode == "z_score") {
193 |       norm <- norm_4
194 |     } else if (mode == "beadlike") {
195 |       norm <- norm_5
196 |     }
197 |     
198 |     # create final data structure 
199 |     result <- 
200 |       dplyr::mutate(
201 |         md, 
202 |         universal_mean = list(0), 
203 |         universal_var = list(0), 
204 |         anchor_mean = list(0), 
205 |         anchor_var = list(0), 
206 |         mean_b4norm = list(0), 
207 |         var_b4norm = list(0), 
208 |         mean_norm = list(0), 
209 |         var_norm = list(0), 
210 |         anchor_mean_norm = list(0), 
211 |         anchor_var_norm = list(0)
212 |       )
213 |     
214 |     # for each file being batch-normalized...
215 |     for (i in 1:length(md$filename)) {
216 |       # calculate adjustment parameters from control plate
217 |       
218 |       # find the anchor file corresponding to the same plate and cohort 
219 |       # as the file being batch normalized
220 |       filename_anchor <- 
221 |         md_control$filename[
222 |           which(
223 |             (md_control$plate_number == md$plate_number[i]) & 
224 |               (md_control$cohort == md$cohort[i])
225 |           )
226 |         ]
227 |       
228 |       # read in the anchor file
229 |       fcs <- 
230 |         flowCore::read.FCS(
231 |           file.path(input_data_path, paste0(input_prefix, filename_anchor)), 
232 |           transformation = FALSE, 
233 |           truncate_max_range = FALSE
234 |         )
235 |       
236 |       # arcsinh transform all columns of the anchor file
237 |       asinh_transform <- 
238 |         flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
239 |       col_names <- flowCore::colnames(fcs)
240 |       tlist <- flowCore::transformList(from = col_names, tfun = asinh_transform)
241 |       fcs_asinh <- flowCore::transform(fcs, tlist)
242 |     
243 |       # find the mean and variance vector of the anchor file
244 |       anchor_expr <- flowCore::exprs(fcs_asinh)
245 |       anchor_mean <- apply(anchor_expr, 2, mean)
246 |       anchor_var <- apply(anchor_expr, 2, var)
247 |       
248 |       # find the bulk mean and bulk variance of the anchor file
249 |       anchor_bulk_mean <- mean(anchor_mean)
250 |       anchor_bulk_var <- mean(anchor_var)
251 |       
252 |       # normalize the anchor file
253 |       anchor_expr_norm <- 
254 |         t(apply(anchor_expr, 1, norm))
255 |       
256 |       anchor_mean_norm <- apply(anchor_expr_norm, 2, mean)
257 |       anchor_var_norm <- apply(anchor_expr_norm, 2, var)
258 |       
259 |       # normalize the target file
260 |       
261 |       ## read in target file  
262 |       filename <- md$filename[i]
263 |       fcs <- 
264 |         flowCore::read.FCS(
265 |           file.path(input_data_path, paste0(input_prefix, filename)), 
266 |           transformation = FALSE, 
267 |           truncate_max_range = FALSE
268 |         )
269 |       
270 |       # arcsinh-transform all columns of the target file
271 |       col_names <- flowCore::colnames(fcs)
272 |       tlist <- flowCore::transformList(from = col_names, tfun = asinh_transform)
273 |       fcs_asinh <- flowCore::transform(fcs, tlist)
274 |       
275 |       # extract target file's expression matrix before normalization
276 |       expr_b4norm <- flowCore::exprs(fcs_asinh)
277 |       
278 |       # find the target file's un-normalized mean and variance vectors
279 |       mean_b4norm <- apply(expr_b4norm, 2, mean)
280 |       var_b4norm <- apply(expr_b4norm, 2, var)
281 |       
282 |       ## normalize the target file
283 |       expr_norm <- 
284 |         t(apply(expr_b4norm, 1, norm))
285 |       
286 |       # find the mean and variance vectors of the normalized target file
287 |       mean_norm <- apply(expr_norm, 2, mean)
288 |       var_norm <- apply(expr_norm, 2, var)
289 |       
290 |       # create flowFrame to be written as the output .fcs file for this sample
291 |       fcs_norm <- flowCore::flowFrame(expr_norm)
292 |       
293 |       # normalization completed, reverse asinh transformation for final output
294 |       my_rev_asinh <- 
295 |         function(x) {
296 |           rev_asinh(x, shift_factor = shift_factor, scale_factor = scale_factor)
297 |         }
298 |       tlist2 <- flowCore::transformList(from = col_names, tfun = my_rev_asinh)
299 |       fcs_asinh_rev <- flowCore::transform(fcs_norm, tlist2)
300 |       
301 |       # prepare and write out final .fcs file
302 |       flowCore::pData(flowCore::parameters(fcs_asinh_rev))$desc <- 
303 |         flowCore::pData(flowCore::parameters(fcs_asinh))$desc
304 |       fcs_name <- file.path(output_data_path, paste0(output_prefix, filename))
305 |       flowCore::write.FCS(x = fcs_asinh_rev, filename = fcs_name)
306 |       
307 |       # update final data structure 
308 |       result$universal_mean[[i]] <- universal_mean 
309 |       result$universal_var[[i]] <- universal_var
310 |       result$anchor_mean[[i]] <- anchor_mean
311 |       result$anchor_var[[i]] <- anchor_var
312 |       result$mean_b4norm[[i]] <- mean_b4norm
313 |       result$var_b4norm[[i]] <- var_b4norm
314 |       result$mean_norm[[i]] <- mean_norm
315 |       result$var_norm[[i]] <- var_norm
316 |       result$anchor_mean_norm[[i]] <- anchor_mean_norm
317 |       result$anchor_var_norm[[i]] <- anchor_var_norm
318 |     }
319 |     
320 |     # add marker list and arcsinh transformation parameters to the final data structure
321 |     attr(result, which = "shift_factor") <- shift_factor
322 |     attr(result, which = "scale_factor") <- scale_factor
323 |     attr(result, which = "all_markers") <- all_markers
324 |     
325 |     # return result
326 |     return(result)
327 |     
328 |   }
329 | 


--------------------------------------------------------------------------------
/R/cytofin_make_plots.R:
--------------------------------------------------------------------------------
  1 | #' Make diagnostic plots to evaluate CytofIn batch normalization
  2 | #' 
  3 | #' When given the output data structure from `cytofin_normalize` or `cytofin_normalize_nrs`, 
  4 | #' this function plots mean and variance plots for all normalized .fcs files and their 
  5 | #' associated anchors.
  6 | #'
  7 | #' @param normalization_result An output data.frame produced by the `cytofin_normalize` or
  8 | #' `cytofin_normalize_nrs` function. 
  9 | #' 
 10 | #' The following columns should be present: `filename`,
 11 | #' `cohort`, `plate_number`, `patient_id`, `condition`, `is_anchor`, `validation`,
 12 | #'  `universal_var`, `anchor_mean`, `anchor_var`, `mean_b4norm`, `var_b4norm`,
 13 | #'  `mean_norm`, `var_norm`, `mean_ctr_norm`, `var_ctr_norm`.
 14 | #'  
 15 | #' @param which_rows A numeric vector indicating which rows of `normalization_result`
 16 | #'  (i.e. which .fcs files in the combined dataset) should be used for plotting. Defaults 
 17 | #'  to 1:nrow(normalization_result), which will make all possible plots.
 18 | #'
 19 | #' @param val_path The folder directory containing validation (i.e. bead-normalized)
 20 | #' .fcs files corresponding to the input .fcs files in the metadata table. (Optional).
 21 | #'
 22 | #' @return 8 diagnostic plots are made for each input .fcs file that was batch 
 23 | #' normalized (i.e. each .fcs file represented as a row in `normalization_result`). 
 24 | #' From left-to-right (and top-to-bottom), these plots represent the following: 
 25 | #'    1) The entry in the universal mean vector corresponding to each antigen in the 
 26 | #'    consensus antigen panel. X-axis: antigen index in the universal mean vector. 
 27 | #'    Y-axis: Arcsinh-transformed entry in the universal mean vector corresponding 
 28 | #'    to each antigen. 
 29 | #'    2) The mean (across all cells) antigen expression vector for the anchor 
 30 | #'    associated with each input .fcs file both before and after normalization. 
 31 | #'    X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the 
 32 | #'    anchor .fcs file.
 33 | #'    3) The mean (across all cells) antigen expression vector for each input 
 34 | #'    .fcs file both before and after normalization. 
 35 | #'    X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the 
 36 | #'    input .fcs file.
 37 | #'    4) The mean (across all cells) antigen expression vector for each "validation"
 38 | #'    (i.e. bead-normalized) .fcs file both before and after bead-normalization. 
 39 | #'    This plot can be used to compare CytofIn batch normalization with gold-
 40 | #'    standard approaches. If `val_path` is "none", this plot will be identical to 
 41 | #'    plot 3 (see above).  
 42 | #'    X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the 
 43 | #'    validation .fcs file.
 44 | #'    5) The entry in the universal standard deviation vector corresponding to each antigen in the 
 45 | #'    consensus antigen panel. X-axis: antigen index in the universal standard deviation vector. 
 46 | #'    Y-axis: Arcsinh-transformed entry in the universal standard deviation vector corresponding 
 47 | #'    to each antigen.
 48 | #'    6) The standard deviation (across all cells) antigen expression vector for the anchor 
 49 | #'    associated with each input .fcs file both before and after normalization. 
 50 | #'    X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all 
 51 | #'    antigen expression values in the anchor .fcs file.
 52 | #'    7) The standard deviation (across all cells) antigen expression vector for each input 
 53 | #'    .fcs file both before and after normalization. 
 54 | #'    X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all 
 55 | #'    antigen expression values in the input .fcs file.
 56 | #'    8) The standard deviation (across all cells) antigen expression vector for each "validation"
 57 | #'    (i.e. bead-normalized) .fcs file both before and after bead-normalization. 
 58 | #'    This plot can be used to compare CytofIn batch normalization with gold-
 59 | #'    standard approaches. If `val_path` is "none", this plot will be identical to 
 60 | #'    plot 3 (see above).  
 61 | #'    X-axis: antigen index (as in plot 1). Y-axis: Standard deviation of all 
 62 | #'    antigen expression values in the validation .fcs file.
 63 | #'
 64 | #' @export
 65 | #'
 66 | cytofin_make_plots <- 
 67 |   function(
 68 |     normalization_result, 
 69 |     which_rows = 1:nrow(normalization_result),
 70 |     val_path = "none"
 71 |   ) {
 72 |     
 73 |     # extract needed values from the normalization_result attributes
 74 |     all_markers <- attr(normalization_result, which = "all_markers")
 75 |     shift_factor <- attr(normalization_result, which = "shift_factor")
 76 |     scale_factor <- attr(normalization_result, which = "scale_factor")
 77 |     
 78 |     # filter out rows that we aren't interested in plotting
 79 |     normalization_result <- normalization_result[which_rows,]
 80 |     
 81 |     # for all rows in the normalization result
 82 |     for (i in 1:nrow(normalization_result)) {
 83 |       
 84 |       # extract needed values for the current file
 85 |       filename <- normalization_result$filename[[i]]
 86 |       plate_number <- normalization_result$plate_number[[i]]
 87 |       cohort <- normalization_result$cohort[[i]]
 88 |       universal_mean <- normalization_result$universal_mean[[i]]
 89 |       universal_var <- normalization_result$universal_var[[i]]
 90 |       anchor_mean <- normalization_result$anchor_mean[[i]]
 91 |       anchor_var <- normalization_result$anchor_var[[i]]
 92 |       mean_b4norm <- normalization_result$mean_b4norm[[i]]
 93 |       var_b4norm <- normalization_result$var_b4norm[[i]]
 94 |       mean_norm <- normalization_result$mean_norm[[i]]
 95 |       var_norm <- normalization_result$var_norm[[i]]
 96 |       anchor_mean_norm <- normalization_result$anchor_mean_norm[[i]]
 97 |       anchor_var_norm <- normalization_result$anchor_var_norm[[i]]
 98 |       
 99 |       # find name of the anchor that corresponds to each file
100 |       md_control <- dplyr::filter(normalization_result, is_anchor == 1)
101 |       
102 |       filename_anchor <-
103 |         md_control$filename[
104 |           which(
105 |             (md_control$plate_number == plate_number) &
106 |               (md_control$cohort == cohort)
107 |           )
108 |         ]
109 |       
110 |       # read in validation .fcs file
111 |       if (val_path != "none") {
112 |         filename_val <- normalization_result$validation[i]
113 |         fcs <-
114 |           flowCore::read.flowSet(
115 |             file.path(val_path, filename_val),
116 |             transformation = FALSE,
117 |             truncate_max_range = FALSE
118 |           )
119 |         
120 |         # arcsinh-transform validation .fcs file
121 |         asinh_transform <-
122 |           flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
123 |         col_names <- flowCore::colnames(fcs)
124 |         tlist <- flowCore::transformList(from = col_names, tfun = asinh_transform)
125 |         fcs_asinh <- flowCore::transform(fcs, tlist)
126 |         expr_val <- flowCore::fsApply(fcs_asinh, flowCore::exprs)
127 |         
128 |         # find the mean and variance vector from the validation file
129 |         mean_val <- apply(expr_val, 2, mean)
130 |         var_val <- apply(expr_val, 2, var)
131 |         
132 |         # find the bulk mean and variance from the validation file
133 |         mean_val_mean <- mean(mean_val)
134 |         var_val_mean <- mean(var_val)
135 |       }
136 |       
137 |       # make visualizations
138 |       par(mfrow = c(2, 4))
139 |       len <- length(universal_mean[all_markers])
140 |       
141 |       # expression (mean)
142 |       # plot 1
143 |       plot(
144 |         universal_mean[all_markers],
145 |         col = "red",
146 |         xlab = "antigen",
147 |         ylab = "universal expression (mean)",
148 |         xlim = c(0, len),
149 |         ylim = c(-5, 10),
150 |         main = "overall",
151 |         cex.main = 1
152 |       )
153 |       
154 |       legend(1, 10, legend = c("universal"), col = c("red"), lty = 1:2, cex = 0.8)
155 |       
156 |       # plot 2
157 |       plot(
158 |         anchor_mean[all_markers],
159 |         col = "cyan",
160 |         xlab = "antigen",
161 |         ylab = "control expression (mean)",
162 |         xlim = c(0, len),
163 |         ylim = c(-5, 10),
164 |         main = filename_anchor,
165 |         cex.main = 0.8
166 |       )
167 |       
168 |       par(new = TRUE)
169 |       plot(
170 |         anchor_mean_norm[all_markers],
171 |         col = "blue",
172 |         xlab = "antigen",
173 |         ylab = "control expression (mean)",
174 |         xlim = c(0, len),
175 |         ylim = c(-5, 10),
176 |         cex.main = 0.8
177 |       )
178 |       legend(1, 10, legend = c("normalized", "original"), col = c("blue", "cyan"), lty = 1:2, cex = 0.8)
179 |       
180 |       # plot 3
181 |       plot(
182 |         mean_b4norm[all_markers],
183 |         col = "green",
184 |         xlab = "antigen",
185 |         ylab = "sample expression (mean)",
186 |         xlim = c(0, len),
187 |         ylim = c(-5, 10),
188 |         main = filename,
189 |         cex.main = 0.8
190 |       )
191 |       par(new = TRUE)
192 |       plot(
193 |         mean_norm[all_markers],
194 |         col = "darkgreen",
195 |         xlab = "antigen",
196 |         ylab = "sample expression (mean)",
197 |         xlim = c(0, len),
198 |         ylim = c(-5, 10),
199 |         cex.main = 0.8
200 |       )
201 |       legend(1, 10, legend = c("normalized", "original"), col = c("darkgreen", "green"), lty = 1:2, cex = 0.8)
202 |       
203 |       # plot 4
204 |       if (val_path != "none") {
205 |         plot(
206 |           mean_b4norm[all_markers],
207 |           col = "green",
208 |           xlab = "antigen",
209 |           ylab = "overlay expression (mean)",
210 |           xlim = c(0, len),
211 |           ylim = c(-5, 10)
212 |         )
213 |         
214 |         par(new = TRUE)
215 |         plot(
216 |           mean_norm[all_markers], 
217 |           col = "darkgreen", 
218 |           xlab = "antigen", 
219 |           ylab = "overlay expression (mean)", 
220 |           xlim = c(0, len), 
221 |           ylim = c(-5, 10)
222 |         )
223 |         
224 |         par(new = TRUE)
225 |         plot(
226 |           mean_val[all_markers], 
227 |           col = "purple", 
228 |           xlab = "antigen", 
229 |           ylab = "overlay expression (mean)", 
230 |           xlim = c(0, len), 
231 |           ylim = c(-5, 10)
232 |         )
233 |         par(new = TRUE)
234 |         legend(1, 10, legend = c("original", "normalized", "validation"), col = c("green", "darkgreen", "purple"), lty = 1:2, cex = 0.8)
235 |         
236 |       } else {
237 |         
238 |         plot(
239 |           mean_b4norm[all_markers], 
240 |           col = "green", 
241 |           xlab = "antigen", 
242 |           ylab = "overlay expression (mean)", 
243 |           xlim = c(0, len), 
244 |           ylim = c(-5, 10)
245 |         )
246 |         par(new = TRUE)
247 |         plot(mean_norm[all_markers], col = "darkgreen", xlab = "antigen", ylab = "overlay expression (mean)", xlim = c(0, len), ylim = c(-5, 10))
248 |         par(new = TRUE)
249 |         legend(1, 10, legend = c("normalized", "original"), col = c("darkgreen", "green"), lty = 1:2, cex = 0.8)
250 |       }
251 |       
252 |       # expression (std)
253 |       # plot 5
254 |       plot(
255 |         sqrt(universal_var[all_markers]),
256 |         col = "red",
257 |         xlab = "antigen",
258 |         ylab = "universal expression (std)",
259 |         xlim = c(0, len),
260 |         ylim = c(-5, 10),
261 |         main = "overall",
262 |         cex.main = 1
263 |       )
264 |       legend(1, 10, legend = c("universal"), col = c("red"), lty = 1:2, cex = 0.8)
265 |       
266 |       # plot 6
267 |       plot(sqrt(anchor_var[all_markers]), col = "cyan", xlab = "antigen", ylab = "control expression (std)", xlim = c(0, len), ylim = c(-5, 10), main = filename_anchor, cex.main = 0.8)
268 |       par(new = TRUE)
269 |       plot(sqrt(anchor_var_norm[all_markers]), col = "blue", xlab = "antigen", ylab = "control expression (std)", xlim = c(0, len), ylim = c(-5, 10), cex.main = 0.8)
270 |       legend(1, 10, legend = c("normalized", "original"), col = c("blue", "cyan"), lty = 1:2, cex = 0.8)
271 |       
272 |       # plot 7
273 |       plot(
274 |         sqrt(var_b4norm[all_markers]), 
275 |         col = "green", 
276 |         xlab = "antigen", 
277 |         ylab = "sample expression (std)", 
278 |         xlim = c(0, len), 
279 |         ylim = c(-5, 10), 
280 |         main = filename, 
281 |         cex.main = 0.8
282 |       )
283 |       
284 |       par(new = TRUE)
285 |       plot(
286 |         sqrt(var_norm[all_markers]), 
287 |         col = "darkgreen", 
288 |         xlab = "antigen", 
289 |         ylab = "sample expression (std)", 
290 |         xlim = c(0, len), 
291 |         ylim = c(-5, 10), 
292 |         cex.main = 0.8
293 |       )
294 |       
295 |       legend(1, 10, legend = c("normalized", "original"), col = c("darkgreen", "green"), lty = 1:2, cex = 0.8)
296 |       
297 |       # plot 8
298 |       if (val_path != "none") {
299 |         plot(
300 |           sqrt(var_b4norm[all_markers]), 
301 |           col = "green", 
302 |           xlab = "antigen", 
303 |           ylab = "overlay expression (std)",
304 |           xlim = c(0, len), 
305 |           ylim = c(-5, 10)
306 |         )
307 |         
308 |         par(new = TRUE)
309 |         plot(
310 |           sqrt(var_norm[all_markers]), 
311 |           col = "darkgreen", 
312 |           xlab = "antigen", 
313 |           ylab = "overlay expression (std)", 
314 |           xlim = c(0, len), 
315 |           ylim = c(-5, 10)
316 |         )
317 |         
318 |         par(new = TRUE)
319 |         plot(
320 |           var_val[all_markers], 
321 |           col = "purple", 
322 |           xlab = "antigen", 
323 |           ylab = "overlay expression (std)", 
324 |           xlim = c(0, len), 
325 |           ylim = c(-5, 10)
326 |         )
327 |         
328 |         par(new = TRUE)
329 |         legend(1, 10, legend = c("original", "normalized", "validation"), col = c("green", "darkgreen", "purple"), lty = 1:2, cex = 0.8)
330 |         
331 |       } else {
332 |         plot(
333 |           sqrt(var_b4norm[all_markers]), 
334 |           col = "green", 
335 |           xlab = "antigen", 
336 |           ylab = "overlay expression (std)", 
337 |           xlim = c(0, len), 
338 |           ylim = c(-5, 10)
339 |         )
340 |         
341 |         par(new = TRUE)
342 |         plot(
343 |           sqrt(var_norm[all_markers]), 
344 |           col = "darkgreen", 
345 |           xlab = "antigen", 
346 |           ylab = "overlay expression (std)", 
347 |           xlim = c(0, len),
348 |           ylim = c(-5, 10)
349 |         )
350 |         
351 |         par(new = TRUE)
352 |         legend(
353 |           1, 
354 |           10, 
355 |           legend = c("original", "normalized"), 
356 |           col = c("green", "darkgreen"), 
357 |           lty = 1:2, cex = 0.8
358 |         )
359 |       }
360 |     }
361 |   }
362 | 


--------------------------------------------------------------------------------
/R/cytofin_normalize_nrs.R:
--------------------------------------------------------------------------------
  1 | #' Batch normalize CyTOF plates from heterogeneous sources using stable channels
  2 | #'
  3 | #' This function batch normalizes CyTOF data from multiple plates (from one or more 
  4 | #' experimental cohorts) by computing the non-redundancy score (NRS) for each 
  5 | #' channel in the dataset, then using the most redundant (i.e. the "most stable") 
  6 | #' channels as a reference for batch normalization.
  7 | #' 
  8 | #' @param metadata_path A filepath leading to an .xlsx or .csv file 
  9 | #' containing a table of CyTOF file (.fcs file) names. Columns should include
 10 | #' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `is_anchor`, 
 11 | #' and `validation`.
 12 | #' 
 13 | #' See the vignette for details: \code{vignette("cytofin", package = "cytofin")}
 14 | #' 
 15 | #' @param panel_path A file path leading to an .xlsx or .csv file containing 
 16 | #' a table of standardized antigen panel information. Columns should include 
 17 | #' `metal_name`, `antigen_name`, `antigen_pattern`, 
 18 | #' `lineage`, `functional`, and `general`. 
 19 | #' 
 20 | #' See the vignette for details: \code{vignette("cytofin", package = "cytofin")}
 21 | #' 
 22 | #' @param input_data_path A folder directory containing the input CyTOF files
 23 | #' to be normalized. In most cases, this will be the directory to which the output
 24 | #' .fcs files from `cytofin_homogenize` were written.
 25 | #' 
 26 | #' @param output_data_path A folder directory to which the output (i.e. 
 27 | #' batch normalized/batch corrected) .fcs files should be written.
 28 | #' 
 29 | #' @param input_prefix The string that was prepended to the name of the input files 
 30 | #' of `cytofin_homogenize` to create their corresponding output file names. 
 31 | #' Defaults to "homogenized_".
 32 | #' 
 33 | #' @param output_prefix A string to be prepended to the name of each input file 
 34 | #' to create the name of the corresponding output file (post-normalization). 
 35 | #' Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to 
 36 | #' the output file "normalized_file1.fcs" saved in `output_data_path`).
 37 | #' 
 38 | #' @param shift_factor The scalar value `a` in the following equation used to 
 39 | #' transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
 40 | #'   
 41 | #'    `new_x <- asinh(a + b * x)`.
 42 | #'    
 43 | #' Defaults to 0. 
 44 | #' 
 45 | #' @param scale_factor The scalar value `b` in the following equation used to 
 46 | #' transform CyTOF raw data ion counts using the hyperbolic arcsinh function: 
 47 | #'  
 48 | #'    `new_x <- asinh(a + b * x)`.
 49 | #'    
 50 | #' Defaults to 0.2. 
 51 | #' 
 52 | #' @param nchannels An integer representing the number of most stable channels to
 53 | #' use during batch normalization. Defaults to 3.
 54 | #' 
 55 | #' @param make_plot A boolean value indicating if a plot depicting the non-
 56 | #' redundancy scores of each marker in each .fcs file being batch normalized
 57 | #' should be plotted as a side-effect of the function call. Defaults to FALSE.
 58 | #'
 59 | #' @return Batch-normalized .fcs files are saved in the directory specified by 
 60 | #' `output_data_path`. 
 61 | #' 
 62 | #' In addition, a data.frame containing information about 
 63 | #' each input .fcs file (that can be used for plotting with `cytofin_make_plots`)
 64 | #' is returned with the following columns: 
 65 | #'    * All of the columns in the input metadata table (located at `metadata_path`)
 66 | #'    * __universal_mean__: the universal mean vector to which all files are adjusted 
 67 | #'    (will be identical for all input .fcs files)
 68 | #'    * __universal_var__: the universal variance vector computed across all input files
 69 | #'    (will be identical for all input .fcs files)
 70 | #'    * __anchor_mean__: the mean (across all cells) vector of the `nchannels` most
 71 | #'    stable channels of each input .fcs file before batch normalization (this 
 72 | #'    function uses each file's own stable channels in place of an anchor file)
 73 | #'    * __anchor_var__: the variance (across all cells) vector of the `nchannels` most
 74 | #'    stable channels of each input .fcs file before batch normalization (this 
 75 | #'    function uses each file's own stable channels in place of an anchor file)
 76 | #'    * __mean_b4norm__: the mean (across all cells) vector of the input .fcs file 
 77 | #'    before batch normalization. 
 78 | #'    * __var_b4norm__: the variance (across all cells) vector of the input .fcs file 
 79 | #'    before batch normalization.
 80 | #'    * __mean_norm__: the mean (across all cells) vector of the input .fcs file 
 81 | #'    after batch normalization.
 82 | #'    * __var_norm__: the variance (across all cells) vector of the input .fcs file 
 83 | #'    after batch normalization.
 84 | #'    * __anchor_mean_norm__: the mean (across all cells) vector of the `nchannels` most
 85 | #'    stable channels of each input .fcs file after batch normalization (this 
 86 | #'    function uses each file's own stable channels in place of an anchor file)
 87 | #'    * __anchor_var_norm__: the variance (across all cells) vector of the `nchannels` most
 88 | #'    stable channels of each input .fcs file after batch normalization (this 
 89 | #'    function uses each file's own stable channels in place of an anchor file)
 90 | #'    
 91 | #' @export
 92 | #' 
 93 | cytofin_normalize_nrs <-
 94 |   function(
 95 |     metadata_path,
 96 |     panel_path,
 97 |     input_data_path,
 98 |     output_data_path,
 99 |     input_prefix = "homogenized_", 
100 |     output_prefix = "normalized_", 
101 |     shift_factor = 0, 
102 |     scale_factor = 0.2,
103 |     nchannels = 3, 
104 |     make_plot = FALSE
105 |   ) {
106 |     
107 |     # create output directory
108 |     dir.create(output_data_path, showWarnings = FALSE, recursive = TRUE)
109 |     
110 |     # read metadata table
111 |     md <- cytofin_read_metadata(metadata_path)
112 |     
113 |     # read in standardized panel
114 |     ref_panel <- cytofin_read_panel_info(panel_path = panel_path)
115 |     
116 |     # compile list of all markers to keep during analysis
117 |     lineage_markers <- 
118 |       as.character(ref_panel$metal_name[ref_panel$lineage == 1])
119 |     
120 |     functional_markers <- 
121 |       as.character(ref_panel$metal_name[ref_panel$functional == 1])
122 |     
123 |     all_markers <- c(lineage_markers, functional_markers)
124 |     
125 |     # fail early (before any .fcs file is read) when the requested number of
126 |     # stable channels cannot be satisfied; otherwise the marker selection below
127 |     # would contain NAs and fail later with an opaque subsetting error
128 |     if (
129 |       !is.numeric(nchannels) || length(nchannels) != 1 || is.na(nchannels) ||
130 |       nchannels < 1 || nchannels > length(all_markers)
131 |     ) {
132 |       stop(
133 |         "`nchannels` must be a single number between 1 and the number of ",
134 |         "lineage/functional markers in the panel (", length(all_markers), ").",
135 |         call. = FALSE
136 |       )
137 |     }
138 |     
139 |     # create final data structure; the list-columns are filled in per file below
140 |     result <- 
141 |       dplyr::mutate(
142 |         md, 
143 |         universal_mean = list(0), 
144 |         universal_var = list(0), 
145 |         anchor_mean = list(0), 
146 |         anchor_var = list(0), 
147 |         mean_b4norm = list(0), 
148 |         var_b4norm = list(0), 
149 |         mean_norm = list(0), 
150 |         var_norm = list(0), 
151 |         anchor_mean_norm = list(0), 
152 |         anchor_var_norm = list(0)
153 |       )
154 |     
155 |     # non-redundancy score (NRS) of every channel of one file: the absolute PCA
156 |     # loadings of each channel on the first `ncomp` principal components,
157 |     # weighted by the variance of each component and summed across components
158 |     NRS <- function(x, ncomp = 3) {
159 |       pr <- stats::prcomp(x, center = TRUE, scale. = FALSE)
160 |       # never request more components than the PCA produced
161 |       comps <- seq_len(min(ncomp, length(pr$sdev)))
162 |       loadings <- abs(pr$rotation[, comps, drop = FALSE])
163 |       score <- rowSums(sweep(loadings, 2, pr$sdev[comps]^2, "*"))
164 |       return(score)
165 |     }
166 |     
167 |     # read in all .fcs files to be normalized
168 |     fcs_set <- 
169 |       flowCore::read.flowSet(
170 |         file.path(input_data_path, paste0(input_prefix, md$filename)), 
171 |         transformation = FALSE, 
172 |         truncate_max_range = FALSE
173 |       )
174 |     
175 |     # arcsinh transform all channels of the input .fcs files
176 |     asinh_transform <- 
177 |       flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
178 |     tlist_set <- 
179 |       flowCore::transformList(
180 |         from = flowCore::colnames(fcs_set), 
181 |         tfun = asinh_transform
182 |       )
183 |     fcs_set_asinh <- flowCore::transform(fcs_set, tlist_set)
184 |     
185 |     # find the mean and variance vector of all cells in the dataset
186 |     expr <- flowCore::fsApply(fcs_set_asinh, flowCore::exprs)
187 |     universal_mean <- apply(expr, 2, mean)
188 |     universal_var <- apply(expr, 2, stats::var)
189 |     
190 |     # calculate non-redundancy scores for each antigen in each .fcs file
191 |     nrs_sample <-
192 |       flowCore::fsApply(fcs_set_asinh[, all_markers], NRS, use.exprs = TRUE)
193 |     
194 |     # label the scores with antigen names, then average them across samples
195 |     colnames(nrs_sample) <-
196 |       as.character(
197 |         ref_panel$antigen_name[match(colnames(nrs_sample), ref_panel$metal_name)]
198 |       )
199 |     nrs <- colMeans(nrs_sample, na.rm = TRUE)
200 |     
201 |     if (make_plot) {
202 |       # antigens ordered from highest to lowest mean NRS
203 |       markers_ord <- names(sort(nrs, decreasing = TRUE))
204 |       
205 |       # make data.frame for plotting; check.names = FALSE keeps antigen names
206 |       # that are not syntactic R names (e.g. "HLA-DR") unchanged, so they still
207 |       # match the factor levels in `markers_ord`
208 |       nrs_df <- data.frame(nrs_sample, check.names = FALSE)
209 |       nrs_df$sample_id <- rownames(nrs_df)
210 |       
211 |       ggdf <-
212 |         reshape2::melt(
213 |           nrs_df, 
214 |           id.vars = "sample_id", 
215 |           value.name = "nrs", 
216 |           variable.name = "antigen"
217 |         )
218 |       
219 |       ggdf$antigen <- factor(ggdf$antigen, levels = markers_ord)
220 |       
221 |       # make plot
222 |       p <- 
223 |         ggplot2::ggplot(ggdf, ggplot2::aes(x = antigen, y = nrs)) +
224 |         ggplot2::geom_point(
225 |           ggplot2::aes(color = sample_id),
226 |           alpha = 0.9,
227 |           position = ggplot2::position_jitter(width = 0.3, height = 0)
228 |         ) +
229 |         ggplot2::geom_boxplot(outlier.color = NA, fill = NA) +
230 |         ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, fill = "white") +
231 |         ggplot2::theme_bw() +
232 |         ggplot2::theme(
233 |           axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust = 1)
234 |         )
235 |       
236 |       print(p)
237 |     }
238 |     
239 |     # select the nchannels antigens with the lowest NRS for calibration
240 |     stable_antigens <- names(sort(nrs, decreasing = FALSE))[1:nchannels]
241 |     
242 |     # find the metal names corresponding to the chosen antigens
243 |     selected_markers <- 
244 |       as.character(
245 |         ref_panel$metal_name[match(stable_antigens, ref_panel$antigen_name)]
246 |       )
247 |     
248 |     # level that the stable channels are shifted to (identical for every file)
249 |     universal_mean_selected <- mean(universal_mean[selected_markers])
250 |     
251 |     # inverse of the arcsinh transformation, applied once normalization is done
252 |     my_rev_asinh <- 
253 |       function(x) {
254 |         rev_asinh(x, shift_factor = shift_factor, scale_factor = scale_factor)
255 |       }
256 |     
257 |     for (i in seq_along(md$filename)) {
258 |       # read in input .fcs file; each file is read and normalized exactly once
259 |       filename <- md$filename[i]
260 |       fcs <- 
261 |         flowCore::read.FCS(
262 |           file.path(input_data_path, paste0(input_prefix, filename)), 
263 |           transformation = FALSE, 
264 |           truncate_max_range = FALSE
265 |         )
266 |       
267 |       # asinh-transform input .fcs file
268 |       col_names <- flowCore::colnames(fcs)
269 |       tlist <- flowCore::transformList(from = col_names, tfun = asinh_transform)
270 |       fcs_asinh <- flowCore::transform(fcs, tlist)
271 |       expr_b4norm <- flowCore::exprs(fcs_asinh)
272 |       
273 |       # find the mean and variance vectors before batch correction
274 |       mean_b4norm <- apply(expr_b4norm, 2, mean)
275 |       var_b4norm <- apply(expr_b4norm, 2, stats::var)
276 |       
277 |       # statistics of the selected (most stable) channels of this same file; they
278 |       # are stored in the `anchor_*` columns of the result
279 |       mean_ctr <- mean_b4norm[selected_markers]
280 |       var_ctr <- var_b4norm[selected_markers]
281 |       
282 |       # batch normalize: shift every lineage/functional channel by one scalar so
283 |       # that the mean of the stable channels lands on their universal mean
284 |       shift_cols <- match(all_markers, colnames(expr_b4norm))
285 |       expr_norm <- expr_b4norm
286 |       expr_norm[, shift_cols] <- 
287 |         expr_norm[, shift_cols] - mean(mean_ctr) + universal_mean_selected
288 |       
289 |       # calculate mean and variance vectors for the normalized input .fcs file
290 |       mean_norm <- apply(expr_norm, 2, mean)
291 |       var_norm <- apply(expr_norm, 2, stats::var)
292 |       mean_ctr_norm <- mean_norm[selected_markers]
293 |       var_ctr_norm <- var_norm[selected_markers]
294 |       
295 |       # create output flowFrame 
296 |       fcs_norm <- flowCore::flowFrame(expr_norm)
297 |       
298 |       # normalization completed, reverse transformation
299 |       tlist2 <- flowCore::transformList(from = col_names, tfun = my_rev_asinh)
300 |       fcs_asinh_rev <- flowCore::transform(fcs_norm, tlist2)
301 |       
302 |       # carry the channel descriptions over to the output file
303 |       flowCore::pData(flowCore::parameters(fcs_asinh_rev))$desc <- 
304 |         flowCore::pData(flowCore::parameters(fcs_asinh))$desc
305 |       
306 |       # write out output .fcs file
307 |       fcs_name <- file.path(output_data_path, paste0(output_prefix, filename))
308 |       flowCore::write.FCS(fcs_asinh_rev, fcs_name)
309 |       
310 |       # update final data structure
311 |       result$universal_mean[[i]] <- universal_mean 
312 |       result$universal_var[[i]] <- universal_var
313 |       result$anchor_mean[[i]] <- mean_ctr
314 |       result$anchor_var[[i]] <- var_ctr
315 |       result$mean_b4norm[[i]] <- mean_b4norm
316 |       result$var_b4norm[[i]] <- var_b4norm
317 |       result$mean_norm[[i]] <- mean_norm
318 |       result$var_norm[[i]] <- var_norm
319 |       result$anchor_mean_norm[[i]] <- mean_ctr_norm
320 |       result$anchor_var_norm[[i]] <- var_ctr_norm
321 |     }
322 |     
323 |     # add marker list and arcsinh transformation parameters to the final data structure
324 |     attr(result, which = "shift_factor") <- shift_factor
325 |     attr(result, which = "scale_factor") <- scale_factor
326 |     attr(result, which = "all_markers") <- all_markers
327 |     
328 |     # return result
329 |     return(result)
330 |   }
331 | 


--------------------------------------------------------------------------------
/vignettes/cytofin.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "CytofIn Tutorial"
  3 | author: "Timothy Keyes"
  4 | output: rmarkdown::html_vignette
  5 | vignette: >
  6 |   %\VignetteIndexEntry{CytofIn Tutorial}
  7 |   %\VignetteEngine{knitr::rmarkdown}
  8 |   \usepackage[utf8]{inputenc}
  9 | 
 10 | ---
 11 | 
 12 | 
 13 | ```{r, include = FALSE}
 14 | knitr::opts_chunk$set(
 15 |   collapse = TRUE,
 16 |   comment = "#>", 
 17 |   dpi = 150, 
 18 |   out.width = "100%"
 19 | )
 20 | ```
 21 | 
 22 | 
 23 | CytofIn (**CyTOF** **In**tegration) is an R package for homogenizing and
 24 | normalizing heterogeneous [mass cytometry
 25 | (CyTOF)](https://pubmed.ncbi.nlm.nih.gov/21551058/) data from diverse
 26 | data sources. Specifically, `CytofIn` provides functions that perform the
 27 | following tasks:
 28 | 
 29 | -   **Dataset homogenization** - CyTOF datasets that were collected
 30 |     separately may differ in which markers were included in their
 31 |     antibody panels; in addition, they may use different naming
 32 |     conventions for their panels' shared markers. Thus, data mining
 33 |     across multiple CyTOF datasets requires **homogenization,** the
 34 |     process of aligning each dataset's antibody panels so that they can
 35 |     be analyzed together. In `CytofIn`, data homogenization (i.e. panel
 36 |     alignment) is performed with the `cytofin_homogenize` function that
 37 |     leverages user-provided panel information to combine datasets.
 38 | -   **Dataset normalization** - Combined analysis of multiple CyTOF
 39 |     datasets is likely to be confounded by dataset-to-dataset batch
 40 |     effects due to differences in instrumentation and experimental
 41 |     protocols between groups. To normalize multiple CyTOF datasets with
 42 |     respect to these batch effects, `CytofIn` provides 3 functions:
 43 |     `cytofin_prep_anchors`, `cytofin_normalize`, and
 44 |     `cytofin_normalize_nrs`.
 45 | -   **Visualization** - After batch normalization, the means and
 46 |     standard deviations for each of the input .fcs files (as well as
 47 |     their associated anchors) can be visualized using the
 48 |     `cytofin_make_plots` function.
 49 | 
 50 | The general CytofIn workflow unfolds in 3 steps. First, users align the
 51 | panels of the CyTOF datasets being integrated using
 52 | `cytofin_homogenize()`. Second, users generate reference statistics from
 53 | "generalized anchors" identified on each CyTOF plate (see below) using
 54 | `cytofin_prep_anchors()`. Finally, users normalize/batch
 55 | correct the datasets relative to one another using their choice of
 56 | `cytofin_normalize()` or `cytofin_normalize_nrs()`, each of which
 57 | performs the normalization procedure differently (see below).
 58 | 
 59 | # Installation
 60 | 
 61 | To install CytofIn, run the following code:
 62 | 
 63 | ```{r, eval = FALSE}
 64 | library(devtools)
 65 | install_github("bennyyclo/Cytofin")
 66 | ```
 67 | 
 68 | To attach the CytofIn package to your current R session, run the
 69 | following line:
 70 | 
 71 | ```{r}
 72 | library(cytofin)
 73 | ```
 74 | 
 75 | # Data for this vignette
 76 | 
 77 | ## Establishing a root directory
 78 | 
 79 | For the sake of this vignette, we will work within a single folder,
 80 | where we will store the input data, the output data, and all
 81 | intermediate files from the CytofIn pipeline. We will default to using
 82 | the current working directory, but feel free to modify the following
 83 | line of code to change which path you want to use.
 84 | 
 85 | ```{r}
 86 | # change this path to wherever you want this vignette to find and store
 87 | # its input and output files
 88 | base_path <- getwd()
 89 | ```
 90 | 
 91 | ```{r, include = FALSE}
 92 | base_path <- file.path("~", "Desktop", "cytofin_tests")
 93 | ```
 94 | 
 95 | ## Downloading the data
 96 | 
 97 | Now that we've identified the root directory we'll use for this
 98 | vignette, we will create two folders in which we will store the raw
 99 | input data and the validation (bead-normalized) data used in this
100 | vignette:
101 | 
102 | ```{r}
103 | dir.create(file.path(base_path, "raw_data"), showWarnings = FALSE)
104 | dir.create(file.path(base_path, "validation_data"), showWarnings = FALSE)
105 | ```
106 | 
107 | To fill each of these folders with the .fcs files we're analyzing in
108 | this vignette, please download the raw input files
109 | [here](https://flowrepository.org/id/FR-FCM-Z427) and the validation
110 | files [here](https://flowrepository.org/id/FR-FCM-Z42C) on
111 | [FlowRepository](https://flowrepository.org/). Once the files are
112 | downloaded, unzip them. Finally, move all of the unzipped .fcs files from each repository into the `raw_data` and
113 | `validation_data` folders that we just created, respectively.
114 | 
115 | # Usage
116 | 
117 | ## CyTOF data homogenization (cytofin_homogenize)
118 | 
119 | Here, the term "homogenization" refers to the process of aligning the
120 | antigen panels of multiple CyTOF experiments by (1) removing all
121 | channels that are not shared across all cohorts and (2) standardizing
122 | the antigen names used to refer to each channel so that existing
123 | analysis tools (like the `flowCore` and `tidyverse` packages) can be
124 | applied in later analytical steps. In CytofIn, dataset homogenization is
125 | performed using the `cytofin_homogenize()` function.
126 | 
127 | The `cytofin_homogenize()` function takes several arguments. The first
128 | of these is `metadata_path`, a string that specifies the file path to a
129 | .csv or .xlsx metadata file containing information about each of the
130 | .fcs files being analyzed. Specifically, the metadata file will have one
131 | row for each .fcs file being analyzed and must contain the following
132 | columns (all of which will be converted to character vectors):
133 | 
134 | -   **filename -** Required. The name of the .fcs file within its local
135 |     directory.
136 | -   **cohort -** Required. The name of the cohort (i.e. experimental
137 |     source) of each .fcs file.
138 | -   **plate_number -** Required. The name of the CyTOF plate (e.g.
139 |     "plate1", "plate2", etc.) on which the sample corresponding to each
140 |     .fcs file was analyzed during data acquisition.
141 | -   **patient_id -** Optional. The name of the patient to whom each .fcs
142 |     file corresponds.
143 | -   **condition -** Optional. The stimulation condition corresponding to
144 |     each .fcs file (i.e. "basal", "IL-3", etc.).
145 | -   **is_anchor -** Required. A numeric column indicating whether or not
146 |     each sample should be used as an "anchor" for the batch correction
147 |     procedure (1 if yes; 0 if no). Exactly one anchor should be
148 |     identified for each CyTOF plate being analyzed.
149 | -   **validation -** Optional. The name of the
150 |     [bead-normalized](https://pubmed.ncbi.nlm.nih.gov/23512433/) .fcs
151 |     file corresponding to each input file listed in the `filename`
152 |     column (per gold-standard batch normalization procedure in CyTOF
153 |     batch correction). Most users will ignore this column because
154 |     bead-normalized data will not be available, but it can be used to
155 |     validate the results of the CytofIn batch normalization algorithms
156 |     if bead-normalized data are available.
157 | 
158 | Importantly, only the fields marked as "required" are needed for
159 | `cytofin_homogenize()` to work; "NA" can be recorded for any/all
160 | optional columns that don't apply to the experimental design of the
161 | files being analyzed (for example, if no stimulation conditions were
162 | used in the studies being integrated, enter "NA" for each element of the
163 | `condition` column). Alternatively, these columns can be omitted from
164 | the metadata table entirely. The following image provides a visual summary of the metadata table used throughout the `CytofIn` pipeline. 
165 | 
166 | ![](../inst/images/image2.png)
167 | 
168 | For the user's convenience, the `cytofin_generate_metadata_template`
169 | function is provided to generate an example metadata .csv file filled
170 | with dummy example data in a location specified by the user:
171 | 
172 | ```{r, eval = FALSE}
173 | # specify the path where you'd like to store the template file
174 | my_path <- file.path(base_path, "template_folder")
175 | 
176 | # generate the template file, which then can be edited manually 
177 | cytofin_generate_metadata_template(template_path = my_path)
178 | ```
179 | 
180 | The second argument for `cytofin_homogenize` is `panel_path`, a string
181 | that specifies the file path to a .csv or .xlsx file containing
182 | information about the panel(s) of each of the .fcs files being analyzed.
183 | Each row represents a channel (i.e. a protein measurement) to be
184 | included in the final, homogenized panel. This file must contain the
185 | following columns:
186 | 
187 | -   **metal_name -** A character vector representing the name of the
188 |     metal isotope measured by each channel.
189 | -   **antigen_name -** A character vector representing the name of the
190 |     antigen associated with a given metal isotope in the consensus panel
191 |     (the final antigen name to assign to a given channel during
192 |     homogenization).
193 | -   **antigen_pattern -** A regular expression used to match antigen
194 |     names that may differ slightly across different .fcs files. For
195 |     example, the regular expression "(C\|c)(D\|d)45" will detect all of
196 |     the following channel names: "cd45", "CD45", "Cd45", or "cD45".
197 | -   **lineage -** A numeric vector representing whether or not a marker
198 |     is a lineage marker (1 if yes; 0 otherwise).
199 | -   **functional -** A numeric vector representing whether or not a
200 |     marker is a functional marker (1 if yes; 0 otherwise).
201 | -   **general -** A numeric vector representing whether or not a marker
202 |     is a "general" (i.e. neither a lineage nor a functional) marker (1
203 |     if yes; 0 otherwise).
204 | 
205 | The layout of this antigen table (and how it's used during .fcs file homogenization) is displayed in the picture below.
206 | 
207 | ![](../inst/images/image1.png)
208 | 
209 | As with `cytofin_generate_metadata_template`, the `cytofin_generate_panel_template` function is provided to
210 | generate an example panel .csv file filled with dummy example data:
211 | 
212 | ```{r, eval = FALSE}
213 | # generate the template file, which then can be edited manually 
214 | cytofin_generate_panel_template(template_path = my_path)
215 | ```
216 | 
217 | For many users, the most difficult part of filling out the consensus
218 | panel information table will be designing the regular expressions for
219 | the `antigen_pattern` column. However, in most cases the required
220 | regular expressions will be quite simple; for a primer on regular
221 | expressions (and their use in the
222 | [`stringr`](https://stringr.tidyverse.org/) package) written by
223 | [RStudio](https://www.rstudio.com/about/), install the `stringr` package
224 | and read the following vignette:
225 | 
226 | ```{r, eval = FALSE}
227 | vignette(topic = "regular-expressions", package = "stringr")
228 | ```
229 | 
230 | The next two arguments for `cytofin_homogenize` are `input_data_path`
231 | and `output_data_path`, two strings that indicate which directory input
232 | .fcs files should be read from and which directory homogenized .fcs
233 | files should be written to, respectively. Lastly, the final two
234 | arguments are optional: `prefix` allows the user to specify the prefix
235 | appended to each input .fcs file name to get the name of the
236 | corresponding output (i.e. homogenized) .fcs file name, and `verbose` is
237 | a boolean value (default = FALSE) specifying if chatty print statements
238 | should be made while the homogenization is performed.
239 | 
240 | Using these arguments, `cytofin_homogenize` can homogenize a set of
241 | CyTOF files with distinct antigen naming conventions. Specifically, the
242 | program performs a regular expression search to match the synonymous
243 | term in the panel and correct the antigen name with standardized names
244 | in the panel.
245 | 
246 | Example function call:
247 | 
248 | ```{r, warning = FALSE}
249 | # define input paths 
250 | metadata_path <- 
251 |   system.file(
252 |     file.path("extdata", "test_metadata_raw.csv"), 
253 |     package = "cytofin"
254 |   )
255 | 
256 | panel_path <- 
257 |   system.file(
258 |     file.path("extdata", "test_panel.csv"), 
259 |     package = "cytofin"
260 |   )
261 | 
262 | input_data_path <- 
263 |   file.path(base_path, "raw_data")
264 | 
265 | validation_data_path <- 
266 |   file.path(base_path, "validation_data")
267 | 
268 | # define output path
269 | # --Change this line to wherever you want the output files saved!--
270 | output_data_path <- file.path(base_path, "homogenization_output")
271 | 
272 | # call homogenization function
273 | cytofin_homogenize(
274 |   metadata_path = metadata_path, 
275 |   panel_path = panel_path, 
276 |   input_data_path = input_data_path, 
277 |   output_data_path = output_data_path
278 | )
279 | ```
280 | 
281 | This function call will save homogenized .fcs files to the directory
282 | located at `output_data_path`. These files will be different from the
283 | input .fcs files in the `input_data_path` directory in that they will
284 | only contain channels whose antigen names match the `antigen_pattern`
285 | column of the reference panel located at `panel_path`. All other
286 | channels will be removed, and the names of the channels with matches in
287 | `antigen_pattern` will be standardized to the names given in the
288 | `antigen_name` column of the reference panel.
289 | 
290 | The input files for this homogenization run were as follows:
291 | 
292 | ```{r}
293 | list.files(input_data_path, pattern = ".fcs$")
294 | ```
295 | 
296 | ...and the corresponding output files saved in the `output_data_path`
297 | directory are now as follows:
298 | 
299 | ```{r}
300 | list.files(output_data_path, pattern = ".fcs$")
301 | ```
302 | 
303 | ## CyTOF batch normalization
304 | 
305 | After dataset homogenization, **batch correction** (or **batch
306 | normalization**) can be performed across datasets.
307 | 
308 | In short, `CytofIn` performs batch normalization through the use of
309 | user-identified **generalized anchors** - which are non-identical references assumed to have low variability across batches - that can be used to estimate batch effects from samples collated from heterogeneous sources. To batch normalize using healthy control samples (one per plate) as generalized anchors (which
310 | is ideal when such samples are available), use `cytofin_normalize`. To
311 | batch normalize using the antigen channels with the lowest variability across samples as generalized anchors (which is ideal when healthy samples are unavailable on all plates being analyzed), use `cytofin_normalize_nrs`.
312 | 
313 | The use of both of these functions is detailed below.
314 | 
315 | ### Batch normalization using external anchors (cytofin_normalize)
316 | 
317 | #### Overview 
318 | 
319 | The `cytofin_normalize` function uses user-identified external anchors on each
320 | CyTOF plate being integrated to correct batch effects on a
321 | plate-to-plate basis. One sample on each CyTOF barcoding plate should be
322 | chosen as that plate's external anchor. In general, external anchors
323 | should be chosen based on which samples are the most biologically
324 | similar to one another from plate to plate. For example, if healthy,
325 | non-stimulated samples are included on each CyTOF plate being
326 | integrated, the only expected variability between these samples other
327 | than batch effects would be person-to-person variability. Thus, these
328 | samples are likely to be biologically similar to one another and are
329 | suitable to be chosen as external anchors. Alternatively, if a single
330 | patient or cell line was included on every CyTOF plate being integrated,
331 | the samples corresponding to that patient or cell line on each plate
332 | would also be suitable as external anchor choices.
333 | 
334 | Once users have identified 1 external anchor per plate for `CytofIn`
335 | data integration, users must mark its row in the metadata table with a
336 | "1" in the `is_anchor` column (all other samples should be marked with
337 | "0"). `CytofIn` then uses these anchors to define a **universal mean**
338 | and **universal variance** that represent the central tendency and
339 | dispersion, respectively, of the target distribution to which all
340 | samples will be batch corrected. This correction will be performed with
341 | the user's choice from one of five batch correction functions.
342 | 
343 | In short, `CytofIn`'s batch normalization procedure using external
344 | anchors has two steps:
345 | 
346 | 1.  Preparation of external anchors  
347 | 2.  Application of a transformation function that performs the batch
348 |     correction (of which `CytofIn` provides 5 options)
349 | 
350 | We detail function calls for each of these steps below.
351 | 
352 | #### Step 1 - Anchor preparation
353 | 
354 | The `cytofin_prep_anchors` function concatenates the identified anchor
355 | files and then calculates summary statistics that are used for batch
356 | correction in later steps of the pipeline. First, `CytofIn` calculates
357 | the mean and standard deviation of each channel in the homogenized
358 | dataset across all cells from samples identified as external anchors.
359 | These values represent the overall central tendency and dispersion,
360 | respectively, of each channel among the anchor samples on each CyTOF
361 | plate; thus, we call them the **universal means** and **universal
362 | variances** of the `CytofIn` integration. Accordingly, the universal
363 | mean and universal variance vectors will each have *g* elements, where
364 | *g* is the number of channels in the consensus antigen panel in the
365 | panel information table. The universal mean and universal variance
366 | vectors are used in the `meanshift`, `variance`, `z-score`, and
367 | `beadlike` methods of batch correction (see below).
368 | 
369 | In addition, the mean of all of the elements of the universal mean
370 | vector (i.e. the mean of all channel means) and the mean of all of the
371 | elements of the universal variance vector (i.e. the mean of all channel
372 | variances) are calculated. These values represent the central tendency
373 | and dispersion of antigen measurements in general among the healthy
374 | control samples on each CyTOF plate and are thus no longer
375 | channel-specific. Thus, we call them the *bulk mean* and *bulk
376 | variance*, and they are used in the `meanshift_bulk` batch correction
377 | method implemented in `cytofin_normalize`.
378 | 
379 | To calculate these values, we use the `cytofin_prep_anchors` function.
380 | `cytofin_prep_anchors` returns the universal mean vector, universal
381 | variance vector, bulk mean, and bulk variance as a `list()`. In
382 | addition, users are given an option to save these statistics as an .rds
383 | file in a specified directory in order to avoid performing redundant
384 | calculations in future analyses.
385 | 
386 | Specifically, `cytofin_prep_anchors` takes 4 main arguments:
387 | 
388 | -   `metadata_path`: A connection leading to an .xlsx or .csv file
389 |     containing a metadata table with information about each file to be
390 |     analyzed. This file should be identical to that used for
391 |     `cytofin_homogenize`.
392 | -   `panel_path`: A connection leading to an .xlsx or .csv file
393 |     containing information about the standardized antigen panel in the
394 |     homogenized dataset. This file should be identical to that used for
395 |     `cytofin_homogenize`.
396 | -   `input_data_path`: A connection to a directory containing the input
397 |     .FCS files from which to draw summary statistics
398 | -   `output_path`: A connection to a directory where the output .rds and
399 |     .FCS files will be saved. The default is "none", in which case no
400 |     output files will be stored (and the only effect of the function
401 |     will be to return the calculated statistics as a `list()`).
402 | 
403 | In addition, `cytofin_prep_anchors` also takes 2 optional arguments
404 | relating to the conventional arcsinh transformation performed on the raw
405 | ion counts of the input data. These optional arguments are as follows:
406 | 
407 | -   `shift_factor`: The scalar value `a` in the following equation used
408 |     to transform CyTOF raw data ion counts using the hyperbolic arcsinh
409 |     function: `new_x <- asinh(a + b * x)`. Defaults to 0.
410 | 
411 | -   `scale_factor`: The scalar value `b` in the following equation used
412 |     to transform CyTOF raw data ion counts using the hyperbolic arcsinh
413 |     function: `new_x <- asinh(a + b * x)`. Defaults to 0.2.
414 | 
415 | Finally, here is an example function call of `cytofin_prep_anchors`:
416 | 
417 | ```{r}
418 | input_data_path <- file.path(base_path, "homogenization_output")
419 | output_path <- file.path(base_path, "anchor_prep_output")
420 | 
421 | anchor_statistics <- 
422 |   cytofin_prep_anchors(
423 |     metadata_path = metadata_path, 
424 |     panel_path = panel_path, 
425 |     input_data_path = input_data_path, 
426 |     output_path = output_path
427 |   )
428 | 
429 | print(anchor_statistics)
430 | ```
431 | 
432 | As shown above, the returned value is a list with 4 items in it: the
433 | universal variance vector (`universal_var`), the universal mean vector
434 | (`universal_mean`), the bulk variance (`bulk_var`) and the bulk mean
435 | (`bulk_mean`). Note that the elements of `universal_var` and
436 | `universal_mean` are named with their corresponding metal names (not
437 | antigen names), as this interfaces a bit more conveniently with the
438 | `flowCore` functions that `CytofIn` uses under-the-hood.
439 | 
440 | Importantly, you only need to use `cytofin_prep_anchors` if you plan to
441 | batch normalize your .fcs files using external anchors identified on
442 | each plate (using `cytofin_normalize`). If you plan to batch normalize
443 | your .fcs files using non-redundancy scores from each sample's most
444 | stable channels (using `cytofin_normalize_nrs`), you do not need to run
445 | `cytofin_prep_anchors` first.
446 | 
447 | #### Step 2 - Batch normalization
448 | 
449 | After the anchors' summary statistics are computed, batch correction
450 | using external anchors can be performed using
451 | `cytofin_normalize`. This function can perform batch correction using 5
452 | different normalization functions (which we call "modes"). Specifically, the options are called the "meanshift", "meanshift_bulk", "variance", "z-score", and "beadlike" normalization functions. Which of
453 | these is most applicable to a given analysis will differ from user to
454 | user. We recommend that users try several modes and then manually
455 | inspect/visualize the batch-corrected data in order to determine which
456 | method they prefer.
457 | 
458 | To perform batch normalization using external anchors identified on each
459 | plate, use `cytofin_normalize`. This batch normalization strategy
460 | assumes that the anchors on each plate are relatively similar to one
461 | another, and it uses this similarity to adjust the marker expression
462 | measurements on each plate based on how much each plate's anchor differs
463 | from the other anchors. The `cytofin_normalize` function takes several
464 | required arguments:
465 | 
466 | -   `metadata_path`: A connection leading to an .xlsx or .csv file
467 |     containing a metadata table with information about each file to be
468 |     analyzed. This file should be identical to that used for
469 |     `cytofin_homogenize`.
470 | -   `panel_path`: A connection leading to an .xlsx or .csv file
471 |     containing information about the standardized antigen panel in the
472 |     homogenized dataset. This file should be identical to that used for
473 |     `cytofin_homogenize`.
474 | -   `anchor_statistics`: Either a list of numeric values produced by the
475 |     `cytofin_prep_anchors` function or a connection leading to an .rds
476 |     object containing anchor statistics.
477 | -   `input_data_path`: A connection to a directory containing the input
478 |     .fcs files to be batch normalized. In most cases, this will be the
479 |     directory to which the output .FCS files from `cytofin_homogenize`
480 |     were written.
481 | -   `output_data_path`: A connection to a directory where the output
482 |     (i.e. batch normalized) .FCS files will be written.
483 | -   `mode`: A string indicating which transformation function should be
484 |     used for batch normalization ("meanshift", "meanshift_bulk",
485 |     "variance", "z-score", or "beadlike").
486 | 
487 | In addition to these required arguments, `cytofin_normalize` takes
488 | several optional arguments:
489 | 
490 | -   `input_prefix`: The string that was appended to the name of the raw
491 |     input .fcs files of `cytofin_homogenize` to create their
492 |     corresponding output file names. Defaults to "homogenized\_".
493 | 
494 | -   `output_prefix`: The string to be appended to the name of each input
495 |     .fcs file to create the name of the corresponding output file
496 |     (post-normalization). Defaults to "normalized\_".
497 | 
498 | -   `shift_factor` and `scale_factor`: The scalar values *a* and *b*,
499 |     respectively, to be used in the hyperbolic arc-sine function used to
500 |     transform CyTOF ion counts according to the following equation:
501 |     `new_x <- asinh(a + b * x)`. `shift_factor` defaults to 0 and
502 |     `scale_factor` defaults to 0.2, which are customary values used by
503 |     most scientists in the CyTOF community.
504 | 
505 | Using these arguments, a call to `cytofin_normalize` will perform the
506 | batch correction and save the output (i.e. batch normalized) .fcs files
507 | to the directory specified by `output_data_path`. An example function
508 | call is given here:
509 | 
510 | ```{r}
511 | output_data_path <- 
512 |   file.path(base_path, "normalization_results")
513 | 
514 | norm_result <- 
515 |   cytofin_normalize(
516 |     metadata_path = metadata_path, 
517 |     panel_path = panel_path, 
518 |     anchor_statistics = anchor_statistics, 
519 |     input_data_path = input_data_path, 
520 |     output_data_path = output_data_path, 
521 |     mode = "meanshift"
522 |   )
523 | ```
524 | 
525 | When this function is called, it has two effects. The first is to save
526 | the batch-normalized output .fcs files to the `output_data_path`
527 | directory. The second is to return a data.frame that stores mean and
528 | variance information about each input file (as well as its associated
529 | anchor) both before and after normalization. This data.frame can be
530 | passed directly into the `cytofin_make_plots` function to return 8
531 | diagnostic plots per sample illustrating the quality of the
532 | normalization:
533 | 
534 | ```{r, eval = FALSE}
535 | # we make only the plot for the first input .fcs file
536 | # for illustrative purposes
537 | cytofin_make_plots(
538 |   normalization_result = norm_result,
539 |   which_rows = 1,
540 |   val_path = "none"
541 | )
542 | ```
543 | 
544 | ### Batch normalization using internal anchors (cytofin_normalize_nrs)
545 | 
546 | 
547 | In the event that external anchors are not available, `CytofIn` can use
548 | "internal anchors" within each sample for batch normalization.
549 | Specifically, instead of defining a single external anchor for all the
550 | samples on a given plate like `cytofin_normalize`, the
551 | `cytofin_normalize_nrs` function identifies the most stable channels in
552 | the dataset overall and uses them as internal anchors that are used to
553 | batch normalize all other channels from sample-to-sample. A schematic diagram of how `cytofin_normalize_nrs` works is provided below: 
554 | 
555 | ![](../inst/images/image3.png)
556 | 
557 | To identify
558 | the most stable channels in the combined dataset, `CytofIn` uses a
559 | PCA-based non-redundancy score (NRS) as described before (see
560 | [here](https://pubmed.ncbi.nlm.nih.gov/26095251/)). A minimum of 3
561 | channels should be selected to establish an internal reference from
562 | which signals can be calibrated between CyTOF files.
563 | 
564 | To do this, `cytofin_normalize_nrs` takes several of the same arguments as
565 | `cytofin_normalize`, defined as above: `metadata_path`, `panel_path`,
566 | `input_data_path`, `output_data_path`, `input_prefix`, `output_prefix`,
567 | `shift_factor`, and `scale_factor`. In addition, it takes the following
568 | optional arguments:
569 | 
570 | -   `nchannels`: An integer representing the number of "most stable"
571 |     (i.e. with the lowest non-redundancy scores) channels that should be
572 |     used for batch normalization. Defaults to 3.
573 | 
574 | -   `make_plot`: A boolean value representing if, in addition to its
575 |     other effects, `cytofin_normalize_nrs` should return a plot
576 |     illustrating the distribution of non-redundancy scores for each
577 |     channel among all .fcs files being batch normalized. Defaults to
578 |     FALSE.
579 | 
580 | These arguments can be used in a function call as follows:
581 | 
582 | ```{r}
583 | # path to save the normalized .fcs files
584 | output_data_path <- 
585 |   file.path(base_path, "normalization_nrs_results")
586 | 
587 | # call function
588 | norm_result_nrs <- 
589 |   cytofin_normalize_nrs(
590 |     metadata_path = metadata_path, 
591 |     panel_path = panel_path, 
592 |     input_data_path = input_data_path, 
593 |     output_data_path = output_data_path, 
594 |     nchannels = 3, 
595 |     make_plot = FALSE
596 |   )
597 | ```
598 | 
599 | Just like `cytofin_normalize` above, `cytofin_normalize_nrs` has several
600 | effects. First, it writes batch-normalized .fcs files to
601 | `output_data_path` and makes a plot depicting sample-wise and
602 | channel-wise non-redundancy scores according to the value of
603 | `make_plot`. In addition, it returns a data.frame that can be passed
604 | into `cytofin_make_plots` to make diagnostic plots regarding the batch
605 | normalization procedure:
606 | 
607 | ```{r, eval = FALSE}
608 | # show only 1 set of plots for illustrative purposes
609 | cytofin_make_plots(
610 |   normalization_result = norm_result_nrs, 
611 |   which_rows = 7, 
612 |   val_path = validation_data_path
613 | )
614 | ```
615 | 
616 | # Additional Information
617 | 
618 | For questions about the `cytofin` R package, please email
619 | [kardavis\@stanford.edu](mailto:kardavis@stanford.edu) or open a GitHub
620 | issue [here](https://github.com/bennyyclo/Cytofin).
621 | 
622 | ```{r}
623 | # session information for rendering this README file
624 | sessionInfo()
625 | ```
626 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: 
  3 |   github_document: 
  4 |     toc: true
  5 |     toc_depth: 5
  6 | editor_options: 
  7 |   markdown: 
  8 |     wrap: 72
  9 | ---
 10 | 
 11 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 12 | 
 13 | ```{r, include = FALSE}
 14 | knitr::opts_chunk$set(
 15 |   collapse = TRUE,
 16 |   comment = "#>",
 17 |   fig.path = "man/figures/README-",
 18 |   out.width = "100%", 
 19 |   dpi = 300
 20 | )
 21 | ```
 22 | 
 23 | # cytofin
 24 | 
 25 | CytofIn (**CyTOF** **In**tegration) is an R package for homogenizing and
 26 | normalizing heterogeneous [mass cytometry
 27 | (CyTOF)](https://pubmed.ncbi.nlm.nih.gov/21551058/) data from diverse
 28 | data sources. Specifically, `CytofIn` provides functions that perform the
 29 | following tasks:
 30 | 
 31 | -   **Dataset homogenization** - CyTOF datasets that were collected
 32 |     separately may differ in which markers were included in their
 33 |     antibody panels; in addition, they may use different naming
 34 |     conventions for their panels' shared markers. Thus, data mining
 35 |     across multiple CyTOF datasets requires **homogenization,** the
 36 |     process of aligning each dataset's antibody panels so that they can
 37 |     be analyzed together. In `CytofIn`, data homogenization (i.e. panel
 38 |     alignment) is performed with the `cytofin_homogenize` function that
 39 |     leverages user-provided panel information to combine datasets.
 40 | -   **Dataset normalization** - Combined analysis of multiple CyTOF
 41 |     datasets is likely to be confounded by dataset-to-dataset batch
 42 |     effects due to differences in instrumentation and experimental
 43 |     protocols between groups. To normalize multiple CyTOF datasets with
 44 |     respect to these batch effects, `CytofIn` provides 3 functions:
 45 |     `cytofin_prep_anchors`, `cytofin_normalize`, and
 46 |     `cytofin_normalize_nrs`.
 47 | -   **Visualization** - After batch normalization, the means and
 48 |     standard deviations for each of the input .fcs files (as well as
 49 |     their associated anchors) can be visualized using the
 50 |     `cytofin_make_plots` function.
 51 | 
 52 | The general CytofIn workflow unfolds in 3 steps. First, users align the
 53 | panels of the CyTOF datasets being integrated using
 54 | `cytofin_homogenize()`. Second, users generate reference statistics from
 55 | "generalized anchors" identified on each CyTOF plate (see below) using
 56 | `cytofin_prep_anchors()`. Finally, users can then normalize/batch
 57 | correct the datasets relative to one another using their choice of
 58 | `cytofin_normalize()` or `cytofin_normalize_nrs()`, each of which
 59 | performs the normalization procedure differently (see below).
 60 | 
 61 | ## Installation
 62 | 
 63 | To install CytofIn, run the following code:
 64 | 
 65 | ```{r, eval = FALSE}
 66 | library(devtools)
 67 | install_github("bennyyclo/Cytofin")
 68 | ```
 69 | 
 70 | To attach the CytofIn package to your current R session, run the
 71 | following line:
 72 | 
 73 | ```{r}
 74 | library(cytofin)
 75 | ```
 76 | 
 77 | ## Data for this vignette
 78 | 
 79 | ### Establishing a root directory
 80 | 
 81 | For the sake of this vignette, we will work within a single folder,
 82 | where we will store the input data, the output data, and all
 83 | intermediate files from the CytofIn pipeline. We will default to using
 84 | the current working directory, but feel free to modify the following
 85 | line of code to change which path you want to use.
 86 | 
 87 | ```{r}
 88 | # change this path to wherever you want this vignette to find and store
 89 | # its input and output files
 90 | base_path <- getwd()
 91 | ```
 92 | 
 93 | ```{r, include = FALSE}
 94 | base_path <- file.path("~", "Desktop", "cytofin_tests")
 95 | ```
 96 | 
 97 | ### Downloading the data
 98 | 
 99 | Now that we've identified the root directory we'll use for this
100 | vignette, we will create two folders in which we will store the raw
101 | input data and the validation (bead-normalized) data used in this
102 | vignette:
103 | 
104 | ```{r}
105 | dir.create(file.path(base_path, "raw_data"), showWarnings = FALSE)
106 | dir.create(file.path(base_path, "validation_data"), showWarnings = FALSE)
107 | ```
108 | 
109 | To fill each of these folders with the .fcs files we're analyzing in
110 | this vignette, please download the raw input files
111 | [here](https://flowrepository.org/id/FR-FCM-Z427) and the validation
112 | files [here](https://flowrepository.org/id/FR-FCM-Z42C) on
113 | [FlowRepository](https://flowrepository.org/). Once the files are
114 | downloaded, unzip them. Finally, move all of the unzipped .fcs files from each repository into the `raw_data` and
115 | `validation_data` folders that we just created, respectively.
116 | 
117 | ## Usage
118 | 
119 | ### CyTOF data homogenization (cytofin_homogenize)
120 | 
121 | Here, the term "homogenization" refers to the process of aligning the
122 | antigen panels of multiple CyTOF experiments by (1) removing all
123 | channels that are not shared across all cohorts and (2) standardizing
124 | the antigen names used to refer to each channel so that existing
125 | analysis tools (like the `flowCore` and `tidyverse` packages) can be
126 | applied in later analytical steps. In CytofIn, dataset homogenization is
127 | performed using the `cytofin_homogenize()` function.
128 | 
129 | The `cytofin_homogenize()` function takes several arguments. The first
130 | of these is `metadata_path`, a string that specifies the file path to a
131 | .csv or .xlsx metadata file containing information about each of the
132 | .fcs files being analyzed. Specifically, the metadata file will have one
133 | row for each .fcs file being analyzed and must contain the following
134 | columns (all of which will be converted to character vectors):
135 | 
136 | -   **filename -** Required. The name of the .fcs file within its local
137 |     directory.
138 | -   **cohort -** Required. The name of the cohort (i.e. experimental
139 |     source) of each .fcs file.
140 | -   **plate_number -** Required. The name of the CyTOF plate (e.g.
141 |     "plate1", "plate2", etc.) on which the sample corresponding to each
142 |     .fcs file was analyzed during data acquisition.
143 | -   **patient_id -** Optional. The name of the patient to whom each .fcs
144 |     file corresponds.
145 | -   **condition -** Optional. The stimulation condition corresponding to
146 |     each .fcs file (i.e. "basal", "IL-3", etc.).
147 | -   **is_anchor -** Required. A numeric column indicating whether or not
148 |     each sample should be used as an "anchor" for the batch correction
149 |     procedure (1 if yes; 0 if no). Exactly one anchor should be
150 |     identified for each CyTOF plate being analyzed.
151 | -   **validation -** Optional. The name of the
152 |     [bead-normalized](https://pubmed.ncbi.nlm.nih.gov/23512433/) .fcs
153 |     file corresponding to each input file listed in the `filename`
154 |     column (per gold-standard batch normalization procedure in CyTOF
155 |     batch correction). Most users will ignore this column because
156 |     bead-normalized data will not be available, but it can be used to
157 |     validate the results of the CytofIn batch normalization algorithms
158 |     if bead-normalized data are available.
159 | 
160 | Importantly, only the fields marked as "required" are needed for
161 | `cytofin_homogenize()` to work; "NA" can be recorded for any/all
162 | optional columns that don't apply to the experimental design of the
163 | files being analyzed (for example, if no stimulation conditions were
164 | used in the studies being integrated, enter "NA" for each element of the
165 | `condition` column). Alternatively, these columns can be omitted from
166 | the metadata table entirely. The following image provides a visual summary of the metadata table used throughout the `CytofIn` pipeline. 
167 | 
168 | ![](./inst/images/image2.png)
169 | 
170 | For the user's convenience, the `cytofin_generate_metadata_template`
171 | function is provided to generate an example metadata .csv file filled
172 | with dummy example data in a location specified by the user:
173 | 
174 | ```{r, eval = FALSE}
175 | # specify the path where you'd like to store the template file
176 | my_path <- file.path(base_path, "template_folder")
177 | 
178 | # generate the template file, which then can be edited manually 
179 | cytofin_generate_metadata_template(template_path = my_path)
180 | ```
181 | 
182 | The second argument for `cytofin_homogenize` is `panel_path`, a string
183 | that specifies the file path to a .csv or .xlsx file containing
184 | information about the panel(s) of each of the .fcs files being analyzed.
185 | Each row represents a channel (i.e. a protein measurement) to be
186 | included in the final, homogenized panel. This file must contain the
187 | following columns:
188 | 
189 | -   **metal_name -** A character vector representing the name of the
190 |     metal isotope measured by each channel.
191 | -   **antigen_name -** A character vector representing the name of the
192 |     antigen associated with a given metal isotope in the consensus panel
193 |     (the final antigen name to assign to a given channel during
194 |     homogenization).
195 | -   **antigen_pattern -** A regular expression used to match antigen
196 |     names that may differ slightly across different .fcs files. For
197 |     example, the regular expression "(C\|c)(D\|d)45" will detect all of
198 |     the following channel names: "cd45", "CD45", "Cd45", or "cD45".
199 | -   **lineage -** A numeric vector representing whether or not a marker
200 |     is a lineage marker (1 if yes; 0 otherwise).
201 | -   **functional -** A numeric vector representing whether or not a
202 |     marker is a functional marker (1 if yes; 0 otherwise).
203 | -   **general -** A numeric vector representing whether or not a marker
204 |     is a "general" (i.e. neither a lineage nor a functional) marker (1
205 |     if yes; 0 otherwise).
206 | 
207 | The layout of this antigen table (and how it's used during .fcs file homogenization) is displayed in the picture below.
208 | 
209 | ![](./inst/images/image1.png)
210 | 
211 | As with `cytofin_generate_metadata_template`, the `cytofin_generate_panel_template` function is provided to
212 | generate an example panel information .csv file filled with dummy example data:
213 | 
214 | ```{r, eval = FALSE}
215 | # generate the template file, which then can be edited manually 
216 | cytofin_generate_panel_template(template_path = my_path)
217 | ```
218 | 
219 | For many users, the most difficult part of filling out the consensus
220 | panel information table will be designing the regular expressions for
221 | the `antigen_pattern` column. However, in most cases the required
222 | regular expressions will be quite simple; for a primer on regular
223 | expressions (and their use in the
224 | [`stringr`](https://stringr.tidyverse.org/) package) written by
225 | [RStudio](https://www.rstudio.com/about/), install the `stringr` package
226 | and read the following vignette:
227 | 
228 | ```{r, eval = FALSE}
229 | vignette(topic = "regular-expressions", package = "stringr")
230 | ```
231 | 
232 | The next two arguments for `cytofin_homogenize` are `input_data_path`
233 | and `output_data_path`, two strings that indicate which directory input
234 | .fcs files should be read from and which directory homogenized .fcs
235 | files should be written to, respectively. Lastly, the final two
236 | arguments are optional: `prefix` allows the user to specify the prefix
237 | prepended to each input .fcs file name to get the name of the
238 | corresponding output (i.e. homogenized) .fcs file, and `verbose` is
239 | a boolean value (default = FALSE) specifying if chatty print statements
240 | should be made while the homogenization is performed.
241 | 
242 | Using these arguments, `cytofin_homogenize` can homogenize a set of
243 | CyTOF files with distinct antigen naming conventions. Specifically, the
244 | program uses each regular expression in the panel to find the matching
245 | antigen names in each file and replaces them with the standardized
246 | names given in the panel.
247 | 
248 | Example function call:
249 | 
250 | ```{r, warning = FALSE}
251 | # define input paths 
252 | metadata_path <- 
253 |   system.file(
254 |     file.path("extdata", "test_metadata_raw.csv"), 
255 |     package = "cytofin"
256 |   )
257 | 
258 | panel_path <- 
259 |   system.file(
260 |     file.path("extdata", "test_panel.csv"), 
261 |     package = "cytofin"
262 |   )
263 | 
264 | input_data_path <- 
265 |   file.path(base_path, "raw_data")
266 | 
267 | validation_data_path <- 
268 |   file.path(base_path, "validation_data")
269 | 
270 | # define output path
271 | # --Change this line to wherever you want the output files saved!--
272 | output_data_path <- file.path(base_path, "homogenization_output")
273 | 
274 | # call homogenization function
275 | cytofin_homogenize(
276 |   metadata_path = metadata_path, 
277 |   panel_path = panel_path, 
278 |   input_data_path = input_data_path, 
279 |   output_data_path = output_data_path
280 | )
281 | ```
282 | 
283 | This function call will save homogenized .fcs files to the directory
284 | located at `output_data_path`. These files will be different from the
285 | input .fcs files in the `input_data_path` directory in that they will
286 | only contain channels whose antigen names match the `antigen_pattern`
287 | column of the reference panel located at `panel_path`. All other
288 | channels will be removed, and the names of the channels with matches in
289 | `antigen_pattern` will be standardized to the names given in the
290 | `antigen_name` column of the reference panel.
291 | 
292 | The input files for this homogenization run were as follows:
293 | 
294 | ```{r}
295 | list.files(input_data_path, pattern = ".fcs$")
296 | ```
297 | 
298 | ...and the corresponding output files saved in the `output_data_path`
299 | directory are now as follows:
300 | 
301 | ```{r}
302 | list.files(output_data_path, pattern = ".fcs$")
303 | ```
304 | 
305 | ### CyTOF batch normalization
306 | 
307 | After dataset homogenization, **batch correction** (or **batch
308 | normalization**) can be performed across datasets.
309 | 
310 | In short, `CytofIn` performs batch normalization through the use of
311 | user-identified **generalized anchors** - which are non-identical references assumed to have low variability across batches - that can be used to estimate batch effects from samples collated from heterogeneous sources. To batch normalize using healthy control samples (one per plate) as generalized anchors (which
312 | is ideal when such samples are available), use `cytofin_normalize`. To
313 | batch normalize using the antigen channels with the lowest variability across samples as generalized anchors (which is ideal when healthy samples are unavailable on all plates being analyzed), use `cytofin_normalize_nrs`.
314 | 
315 | The use of both of these functions is detailed below.
316 | 
317 | #### Batch normalization using external anchors (cytofin_normalize)
318 | 
319 | ##### Overview 
320 | 
321 | The `cytofin_normalize` function uses user-identified external anchors on each
322 | CyTOF plate being integrated to correct batch effects on a
323 | plate-to-plate basis. One sample on each CyTOF barcoding plate should be
324 | chosen as that plate's external anchor. In general, external anchors
325 | should be chosen based on which samples are the most biologically
326 | similar to one another from plate to plate. For example, if healthy,
327 | non-stimulated samples are included on each CyTOF plate being
328 | integrated, the only expected variability between these samples other
329 | than batch effects would be person-to-person variability. Thus, these
330 | samples are likely to be biologically similar to one another and are
331 | suitable to be chosen as external anchors. Alternatively, if a single
332 | patient or cell line was included on every CyTOF plate being integrated,
333 | the samples corresponding to that patient or cell line on each plate
334 | would also be suitable as external anchor choices.
335 | 
336 | Once users have identified 1 external anchor per plate for `CytofIn`
337 | data integration, users must mark its row in the metadata table with a
338 | "1" in the `is_anchor` column (all other samples should be marked with
339 | "0"). `CytofIn` then uses these anchors to define a **universal mean**
340 | and **universal variance** that represent the central tendency and
341 | dispersion, respectively, of the target distribution to which all
342 | samples will be batch corrected. This correction will be performed with
343 | the user's choice from one of five batch correction functions.
344 | 
345 | In short, `CytofIn`'s batch normalization procedure using external
346 | anchors has two steps:
347 | 
348 | 1.  Preparation of external anchors  
349 | 2.  Application of a transformation function that performs the batch
350 |     correction (of which `CytofIn` provides 5 options)
351 | 
352 | We detail function calls for each of these steps below.
353 | 
354 | ##### Step 1 - Anchor preparation
355 | 
356 | The `cytofin_prep_anchors` function concatenates the identified anchor
357 | files and then calculates summary statistics that are used for batch
358 | correction in later steps of the pipeline. First, `CytofIn` calculates
359 | the mean and standard deviation of each channel in the homogenized
360 | dataset across all cells from samples identified as external anchors.
361 | These values represent the overall central tendency and dispersion,
362 | respectively, of each channel among the anchor samples on each CyTOF
363 | plate; thus, we call them the **universal means** and **universal
364 | variances** of the `CytofIn` integration. Accordingly, the universal
365 | mean and universal variance vectors will each have *g* elements, where
366 | *g* is the number of channels in the consensus antigen panel in the
367 | panel information table. The universal mean and universal variance
368 | vectors are used in the `meanshift`, `variance`, `z-score`, and
369 | `beadlike` methods of batch correction (see below).
370 | 
371 | In addition, the mean of all of the elements of the universal mean
372 | vector (i.e. the mean of all channel means) and the mean of all of the
373 | elements of the universal variance vector (i.e. the mean of all channel
374 | variances) are calculated. These values represent the central tendency
375 | and dispersion of antigen measurements in general among the healthy
376 | control samples on each CyTOF plate and are thus no longer
377 | channel-specific. Thus, we call them the *bulk mean* and *bulk
378 | variance*, and they are used in the `meanshift_bulk` batch correction
379 | method implemented in `cytofin_normalize`.
380 | 
381 | To calculate these values, we use the `cytofin_prep_anchors` function.
382 | `cytofin_prep_anchors` returns the universal mean vector, universal
383 | variance vector, bulk mean, and bulk variance as a `list()`. In
384 | addition, users are given an option to save these statistics as an .rds
385 | file in a specified directory in order to avoid performing redundant
386 | calculations in future analyses.
387 | 
388 | Specifically, `cytofin_prep_anchors` takes the following 4 arguments (`output_path` has a default value and may be omitted):
389 | 
390 | -   `metadata_path`: A connection leading to an .xlsx or .csv file
391 |     containing a metadata table with information about each file to be
392 |     analyzed. This file should be identical to that used for
393 |     `cytofin_homogenize`.
394 | -   `panel_path`: A connection leading to an .xlsx or .csv file
395 |     containing information about the standardized antigen panel in the
396 |     homogenized dataset. This file should be identical to that used for
397 |     `cytofin_homogenize`.
398 | -   `input_data_path`: A connection to a directory containing the input
399 |     .FCS files from which to draw summary statistics
400 | -   `output_path`: A connection to a directory where the output .rds and
401 |     .FCS files will be saved. The default is "none", in which case no
402 |     output files will be stored (and the only effect of the function
403 |     will be to return the calculated statistics as a `list()`).
404 | 
405 | In addition, `cytofin_prep_anchors` also takes 2 optional arguments
406 | relating to the conventional arcsinh transformation performed on the raw
407 | ion counts of the input data. These optional arguments are as follows:
408 | 
409 | -   `shift_factor`: The scalar value `a` in the following equation used
410 |     to transform CyTOF raw data ion counts using the hyperbolic arcsinh
411 |     function: `new_x <- asinh(a + b * x)`. Defaults to 0.
412 | 
413 | -   `scale_factor`: The scalar value `b` in the following equation used
414 |     to transform CyTOF raw data ion counts using the hyperbolic arcsinh
415 |     function: `new_x <- asinh(a + b * x)`. Defaults to 0.2.
416 | 
417 | Finally, here is an example function call of `cytofin_prep_anchors`:
418 | 
419 | ```{r}
420 | input_data_path <- file.path(base_path, "homogenization_output")
421 | output_path <- file.path(base_path, "anchor_prep_output")
422 | 
423 | anchor_statistics <- 
424 |   cytofin_prep_anchors(
425 |     metadata_path = metadata_path, 
426 |     panel_path = panel_path, 
427 |     input_data_path = input_data_path, 
428 |     output_path = output_path
429 |   )
430 | 
431 | print(anchor_statistics)
432 | ```
433 | 
434 | As shown above, the returned value is a list with 4 items in it: the
435 | universal variance vector (`universal_var`), the universal mean vector
436 | (`universal_mean`), the bulk variance (`bulk_var`) and the bulk mean
437 | (`bulk_mean`). Note that the elements of `universal_var` and
438 | `universal_mean` are named with their corresponding metal names (not
439 | antigen names), as this interfaces a bit more conveniently with the
440 | `flowCore` functions that `CytofIn` uses under-the-hood.
441 | 
442 | Importantly, you only need to use `cytofin_prep_anchors` if you plan to
443 | batch normalize your .fcs files using external anchors identified on
444 | each plate (using `cytofin_normalize`). If you plan to batch normalize
445 | your .fcs files using non-redundancy scores from each sample's most
446 | stable channels (using `cytofin_normalize_nrs`), you do not need to run
447 | `cytofin_prep_anchors` first.
448 | 
449 | ##### Step 2 - Batch normalization
450 | 
451 | After the anchors' summary statistics are computed, batch correction
452 | using external anchors can be performed using
453 | `cytofin_normalize`. This function can perform batch correction using 5
454 | different normalization functions (which we call "modes"). Specifically, the options are called the "meanshift", "meanshift_bulk", "variance", "z-score", and "beadlike" normalization functions. Which of
455 | these is most applicable to a given analysis will differ from user to
456 | user. We recommend that users try several modes and then manually
457 | inspect/visualize the batch-corrected data in order to determine which
458 | method they prefer.
459 | 
460 | To perform batch normalization using external anchors identified on each
461 | plate, use `cytofin_normalize`. This batch normalization strategy
462 | assumes that the anchors on each plate are relatively similar to one
463 | another, and it uses this similarity to adjust the marker expression
464 | measurements on each plate based on how much each plate's anchor differs
465 | from the other anchors. The `cytofin_normalize` function takes several
466 | required arguments:
467 | 
468 | -   `metadata_path`: A connection leading to an .xlsx or .csv file
469 |     containing a metadata table with information about each file to be
470 |     analyzed. This file should be identical to that used for
471 |     `cytofin_homogenize`.
472 | -   `panel_path`: A connection leading to an .xlsx or .csv file
473 |     containing information about the standardized antigen panel in the
474 |     homogenized dataset. This file should be identical to that used for
475 |     `cytofin_homogenize`.
476 | -   `anchor_statistics`: Either a list of numeric values produced by the
477 |     `cytofin_prep_anchors` function or a connection leading to an .rds
478 |     object containing anchor statistics.
479 | -   `input_data_path`: A connection to a directory containing the input
480 |     .fcs files to be batch normalized. In most cases, this will be the
481 |     directory to which the output .FCS files from `cytofin_homogenize`
482 |     were written.
483 | -   `output_data_path`: A connection to a directory where the output
484 |     (i.e. batch normalized) .FCS files will be written.
485 | -   `mode`: A string indicating which transformation function should be
486 |     used for batch normalization ("meanshift", "meanshift_bulk",
487 |     "variance", "z-score", or "beadlike").
488 | 
489 | In addition to these required arguments, `cytofin_normalize` takes
490 | several optional arguments:
491 | 
492 | -   `input_prefix`: The string that was prepended to the name of the raw
493 |     input .fcs files of `cytofin_homogenize` to create their
494 |     corresponding output file names. Defaults to "homogenized\_".
495 | 
496 | -   `output_prefix`: The string to be prepended to the name of each input
497 |     .fcs file to create the name of the corresponding output file
498 |     (post-normalization). Defaults to "normalized\_".
499 | 
500 | -   `shift_factor` and `scale_factor`: The scalar values *a* and *b*,
501 |     respectively, to be used in the hyperbolic arc-sine function used to
502 |     transform CyTOF ion counts according to the following equation:
503 |     `new_x <- asinh(a + b * x)`. `shift_factor` defaults to 0 and
504 |     `scale_factor` defaults to 0.2, which are customary values used by
505 |     most scientists in the CyTOF community.
506 | 
507 | Using these arguments, a call to `cytofin_normalize` will perform the
508 | batch correction and save the output (i.e. batch normalized) .fcs files
509 | to the directory specified by `output_data_path`. An example function
510 | call is given here:
511 | 
512 | ```{r}
513 | output_data_path <- 
514 |   file.path(base_path, "normalization_results")
515 | 
516 | norm_result <- 
517 |   cytofin_normalize(
518 |     metadata_path = metadata_path, 
519 |     panel_path = panel_path, 
520 |     anchor_statistics = anchor_statistics, 
521 |     input_data_path = input_data_path, 
522 |     output_data_path = output_data_path, 
523 |     mode = "meanshift"
524 |   )
525 | ```
526 | 
527 | When this function is called, it has two effects. The first is to save
528 | the batch-normalized output .fcs files to the `output_data_path`
529 | directory. The second is to return a data.frame that stores mean and
530 | variance information about each input file (as well as its associated
531 | anchor) both before and after normalization. This data.frame can be
532 | passed directly into the `cytofin_make_plots` function to return 8
533 | diagnostic plots per sample illustrating the quality of the
534 | normalization:
535 | 
536 | ```{r}
537 | # we make only the plot for the first input .fcs file
538 | # for illustrative purposes
539 | cytofin_make_plots(
540 |   normalization_result = norm_result,
541 |   which_rows = 1,
542 |   val_path = "none"
543 | )
544 | ```
545 | 
546 | #### Batch normalization using internal anchors (cytofin_normalize_nrs)
547 | 
548 | 
549 | In the event that external anchors are not available, `CytofIn` can use
550 | "internal anchors" within each sample for batch normalization.
551 | Specifically, instead of defining a single external anchor for all the
552 | samples on a given plate like `cytofin_normalize`, the
553 | `cytofin_normalize_nrs` function identifies the most stable channels in
554 | the dataset overall and uses them as internal anchors that are used to
555 | batch normalize all other channels from sample-to-sample. A schematic diagram of how `cytofin_normalize_nrs` works is provided below: 
556 | 
557 | ![](./inst/images/image3.png)
558 | 
559 | In words, to identify
560 | the most stable channels in the combined dataset, `CytofIn` uses a
561 | PCA-based non-redundancy score (NRS) as described before (see
562 | [here](https://pubmed.ncbi.nlm.nih.gov/26095251/)). A minimum of 3
563 | channels should be selected to establish an internal reference from
564 | which signals can be calibrated between CyTOF files.
565 | 
566 | To do this, `cytofin_normalize_nrs` takes several of the same arguments as
567 | `cytofin_normalize`, defined as above: `metadata_path`, `panel_path`,
568 | `input_data_path`, `output_data_path`, `input_prefix`, `output_prefix`,
569 | `shift_factor`, and `scale_factor`. In addition, it takes the following
570 | optional arguments:
571 | 
572 | -   `nchannels`: An integer representing the number of "most stable"
573 |     (i.e. with the lowest non-redundancy scores) channels that should be
574 |     used for batch normalization. Defaults to 3.
575 | 
576 | -   `make_plot`: A boolean value representing if, in addition to its
577 |     other effects, `cytofin_normalize_nrs` should return a plot
578 |     illustrating the distribution of non-redundancy scores for each
579 |     channel among all .fcs files being batch normalized. Defaults to
580 |     FALSE.
581 | 
582 | These arguments can be used in a function call as follows:
583 | 
584 | ```{r}
585 | # path to save the normalized .fcs files
586 | output_data_path <- 
587 |   file.path(base_path, "normalization_nrs_results")
588 | 
589 | # call function
590 | norm_result_nrs <- 
591 |   cytofin_normalize_nrs(
592 |     metadata_path = metadata_path, 
593 |     panel_path = panel_path, 
594 |     input_data_path = input_data_path, 
595 |     output_data_path = output_data_path, 
596 |     nchannels = 3, 
597 |     make_plot = FALSE
598 |   )
599 | ```
600 | 
601 | Just like `cytofin_normalize` above, `cytofin_normalize_nrs` has several
602 | effects. First, it writes batch-normalized .fcs files to
603 | `output_data_path` and makes a plot depicting sample-wise and
604 | channel-wise non-redundancy scores according to the value of
605 | `make_plot`. In addition, it returns a data.frame that can be passed
606 | into `cytofin_make_plots` to make diagnostic plots regarding the batch
607 | normalization procedure:
608 | 
609 | ```{r}
610 | # show only 1 set of plots for illustrative purposes
611 | cytofin_make_plots(
612 |   normalization_result = norm_result_nrs, 
613 |   which_rows = 7, 
614 |   val_path = validation_data_path
615 | )
616 | ```
617 | 
618 | ## Additional Information
619 | 
620 | For questions about the `cytofin` R package, please email
621 | [kardavis\@stanford.edu](mailto:kardavis@stanford.edu) or open a GitHub
622 | issue [here](https://github.com/bennyyclo/Cytofin).
623 | 
624 | ```{r}
625 | # session information for rendering this README file
626 | sessionInfo()
627 | ```
628 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | -   [cytofin](#cytofin)
  3 |     -   [Installation](#installation)
  4 |     -   [Data for this vignette](#data-for-this-vignette)
  5 |         -   [Establishing a root
  6 |             directory](#establishing-a-root-directory)
  7 |         -   [Downloading the data](#downloading-the-data)
  8 |     -   [Usage](#usage)
  9 |         -   [CyTOF data homogenization
 10 |             (cytofin\_homogenize)](#cytof-data-homogenization-cytofin_homogenize)
 11 |         -   [CyTOF batch normalization](#cytof-batch-normalization)
 12 |             -   [Batch normalization using external anchors
 13 |                 (cytofin\_normalize)](#batch-normalization-using-external-anchors-cytofin_normalize)
 14 |                 -   [Overview](#overview)
 15 |                 -   [Step 1 - Anchor
 16 |                     preparation](#step-1---anchor-preparation)
 17 |                 -   [Step 2 - Batch
 18 |                     normalization](#step-2---batch-normalization)
 19 |             -   [Batch normalization using internal anchors
 20 |                 (cytofin\_normalize\_nrs)](#batch-normalization-using-internal-anchors-cytofin_normalize_nrs)
 21 |     -   [Additional Information](#additional-information)
 22 | 
 23 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 24 | 
 25 | # cytofin
 26 | 
 27 | CytofIn (**CyTOF** **In**tegration) is an R package for homogenizing and
 28 | normalizing heterogeneous [mass cytometry
 29 | (CyTOF)](https://pubmed.ncbi.nlm.nih.gov/21551058/) data from diverse
 30 | data sources. Specifically, `CytofIn` provides functions that perform
 31 | the following tasks:
 32 | 
 33 | -   **Dataset homogenization** - CyTOF datasets that were collected
 34 |     separately may differ in which markers were included in their
 35 |     antibody panels; in addition, they may use different naming
 36 |     conventions for their panels’ shared markers. Thus, data mining
 37 |     across multiple CyTOF datasets requires **homogenization,** the
 38 |     process of aligning each dataset’s antibody panels so that they can
 39 |     be analyzed together. In `CytofIn`, data homogenization (i.e. panel
 40 |     alignment) is performed with the `cytofin_homogenize` function that
 41 |     leverages user-provided panel information to combine datasets.
 42 | -   **Dataset normalization** - Combined analysis of multiple CyTOF
 43 |     datasets is likely to be confounded by dataset-to-dataset batch
 44 |     effects due to differences in instrumentation and experimental
 45 |     protocols between groups. To normalize multiple CyTOF datasets with
 46 |     respect to these batch effects, `CytofIn` provides 3 functions:
 47 |     `cytofin_prep_anchors`, `cytofin_normalize`, and
 48 |     `cytofin_normalize_nrs`.
 49 | -   **Visualization** - After batch normalization, the means and
 50 |     standard deviations for each of the input .fcs files (as well as
 51 |     their associated anchors) can be visualized using the
 52 |     `cytofin_make_plots` function.
 53 | 
 54 | The general CytofIn workflow unfolds in 3 steps. First, users align the
 55 | panels of the CyTOF datasets being integrated using
 56 | `cytofin_homogenize()`. Second, users generate reference statistics from
 57 | “generalized anchors” identified on each CyTOF plate (see below) using
 58 | `cytofin_prep_anchors()`. Finally, users can then normalize/batch
 59 | correct the datasets relative to one another using their choice of
 60 | `cytofin_normalize()` or `cytofin_normalize_nrs()`, each of which
 61 | performs the normalization procedure differently (see below).
 62 | 
 63 | ## Installation
 64 | 
 65 | To install CytofIn, run the following code:
 66 | 
 67 | ``` r
 68 | library(devtools)
 69 | install_github("bennyyclo/Cytofin")
 70 | ```
 71 | Please also ensure that the `flowCore` package is installed:
 72 | 
 73 | ``` r
 74 | if (!requireNamespace("BiocManager", quietly = TRUE))
 75 |     install.packages("BiocManager")
 76 | 
 77 | BiocManager::install("flowCore")
 78 | ```
 79 | 
 80 | To attach the CytofIn package to your current R session, run the
 81 | following line:
 82 | 
 83 | ``` r
 84 | library(cytofin)
 85 | ```
 86 | 
 87 | ## Data for this vignette
 88 | 
 89 | ### Establishing a root directory
 90 | 
 91 | For the sake of this vignette, we will work within a single folder,
 92 | where we will store the input data, the output data, and all
 93 | intermediate files from the CytofIn pipeline. We will default to using
 94 | the current working directory, but feel free to modify the following
 95 | line of code to change which path you want to use.
 96 | 
 97 | ``` r
 98 | # change this path to wherever you want this vignette to find and store
 99 | # its input and output files
100 | base_path <- getwd()
101 | ```
102 | 
103 | ### Downloading the data
104 | 
105 | Now that we’ve identified the root directory we’ll use for this
106 | vignette, we will create two folders in which we will store the raw
107 | input data and the validation (bead-normalized) data used in this
108 | vignette:
109 | 
110 | ``` r
111 | dir.create(file.path(base_path, "raw_data"), showWarnings = FALSE)
112 | dir.create(file.path(base_path, "validation_data"), showWarnings = FALSE)
113 | ```
114 | 
115 | To fill each of these folders with the .fcs files we’re analyzing in
116 | this vignette, please download the raw input files
117 | [here](https://flowrepository.org/id/FR-FCM-Z427) and the validation
118 | files [here](https://flowrepository.org/id/FR-FCM-Z42C) on
119 | [FlowRepository](https://flowrepository.org/). Once the files are
120 | downloaded, unzip them. Finally, move all of the unzipped .fcs files
121 | from each repository into the `raw_data` and `validation_data` folders
122 | that we just created, respectively.
123 | 
124 | ## Usage
125 | 
126 | ### CyTOF data homogenization (cytofin\_homogenize)
127 | 
128 | Here, the term “homogenization” refers to the process of aligning the
129 | antigen panels of multiple CyTOF experiments by (1) removing all
130 | channels that are not shared across all cohorts and (2) standardizing
131 | the antigen names used to refer to each channel so that existing
132 | analysis tools (like the `flowCore` and `tidyverse` packages) can be
133 | applied in later analytical steps. In CytofIn, dataset homogenization is
134 | performed using the `cytofin_homogenize()` function.
135 | 
136 | The `cytofin_homogenize()` function takes several arguments. The first
137 | of these is `metadata_path`, a string that specifies the file path to a
138 | .csv or .xlsx metadata file containing information about each of the
139 | .fcs files being analyzed. Specifically, the metadata file will have one
140 | row for each .fcs file being analyzed and must contain the following
141 | columns (all of which will be converted to character vectors):
142 | 
143 | -   **filename -** Required. The name of the .fcs file within its local
144 |     directory.
145 | -   **cohort -** Required. The name of the cohort (i.e. experimental
146 |     source) of each .fcs file.
147 | -   **plate\_number -** Required. The name of the CyTOF plate (e.g.
148 |     “plate1”, “plate2”, etc.) on which the sample corresponding to each
149 |     .fcs file was analyzed during data acquisition.
150 | -   **patient\_id -** Optional. The name of the patient to whom each
151 |     .fcs file corresponds.
152 | -   **condition -** Optional. The stimulation condition corresponding to
153 |     each .fcs file (i.e. “basal”, “IL-3”, etc.).
154 | -   **is\_anchor -** Required. A numeric column indicating whether or
155 |     not each sample should be used as an “anchor” for the batch
156 |     correction procedure (1 if yes; 0 if no). Exactly one anchor should
157 |     be identified for each CyTOF plate being analyzed.
158 | -   **validation -** Optional. The name of the
159 |     [bead-normalized](https://pubmed.ncbi.nlm.nih.gov/23512433/) .fcs
160 |     file corresponding to each input file listed in the `filename`
161 |     column (per gold-standard batch normalization procedure in CyTOF
162 |     batch correction). Most users will ignore this column because
163 |     bead-normalized data will not be available, but it can be used to
164 |     validate the results of the CytofIn batch normalization algorithms
165 |     if bead-normalized data are available.
166 | 
167 | Importantly, only the fields marked as “required” are needed for
168 | `cytofin_homogenize()` to work; “NA” can be recorded for any/all
169 | optional columns that don’t apply to the experimental design of the
170 | files being analyzed (for example, if no stimulation conditions were
171 | used in the studies being integrated, enter “NA” for each element of the
172 | `condition` column). Alternatively, these columns can be omitted from
173 | the metadata table entirely. The following image provides a visual
174 | summary of the metadata table used throughout the `CytofIn` pipeline.
175 | 
176 | ![](./inst/images/image2.png)
177 | 
178 | For the user’s convenience, the `cytofin_generate_metadata_template`
179 | function is provided to generate an example metadata .csv file filled
180 | with dummy example data in a location specified by the user:
181 | 
182 | ``` r
183 | # specify the path where you'd like to store the template file
184 | my_path <- file.path(base_path, "template_folder")
185 | 
186 | # generate the template file, which then can be edited manually 
187 | cytofin_generate_metadata_template(template_path = my_path)
188 | ```
189 | 
190 | The second argument for `cytofin_homogenize` is `panel_path`, a string
191 | that specifies the file path to a .csv or .xlsx file containing
192 | information about the panel(s) of each of the .fcs files being analyzed.
193 | Each row represents a channel (i.e. a protein measurement) to be
194 | included in the final, homogenized panel. This file must contain the
195 | following columns:
196 | 
197 | -   **metal\_name -** A character vector representing the name of the
198 |     metal isotope measured by each channel.
199 | -   **antigen\_name -** A character vector representing the name of the
200 |     antigen associated with a given metal isotope in the consensus panel
201 |     (the final antigen name to assign to a given channel during
202 |     homogenization).
203 | -   **antigen\_pattern -** A regular expression used to match antigen
204 |     names that may differ slightly across different .fcs files. For
205 |     example, the regular expression “(C\|c)(D\|d)45” will detect all of
206 |     the following channel names: “cd45”, “CD45”, “Cd45”, or “cD45”.
207 | -   **lineage -** A numeric vector representing whether or not a marker
208 |     is a lineage marker (1 if yes; 0 otherwise).
209 | -   **functional -** A numeric vector representing whether or not a
210 |     marker is a functional marker (1 if yes; 0 otherwise).
211 | -   **general -** A numeric vector representing whether or not a marker
212 |     is a “general” (i.e. neither a lineage nor a functional) marker (1
213 |     if yes; 0 otherwise).
214 | 
215 | The layout of this antigen table (and how it’s used during .fcs file
216 | homogenization) is displayed in the picture below.
217 | 
218 | ![](./inst/images/image1.png)
219 | 
220 | As with `cytofin_generate_metadata_template`, the
221 | `cytofin_generate_panel_template` function is provided to generate an
222 | example panel information .csv file filled with dummy example data:
223 | 
224 | ``` r
225 | # generate the template file, which then can be edited manually 
226 | cytofin_generate_panel_template(template_path = my_path)
227 | ```
228 | 
229 | For many users, the most difficult part of filling out the consensus
230 | panel information table will be designing the regular expressions for
231 | the `antigen_pattern` column. However, in most cases the required
232 | regular expressions will be quite simple; for a primer on regular
233 | expressions (and their use in the
234 | [`stringr`](https://stringr.tidyverse.org/) package) written by
235 | [RStudio](https://www.rstudio.com/about/), install the `stringr` package
236 | and read the following vignette:
237 | 
238 | ``` r
239 | vignette(topic = "regular-expressions", package = "stringr")
240 | ```
241 | 
242 | The next two arguments for `cytofin_homogenize` are `input_data_path`
243 | and `output_data_path`, two strings that indicate which directory input
244 | .fcs files should be read from and which directory homogenized .fcs
245 | files should be written to, respectively. Lastly, the final two
246 | arguments are optional: `prefix` allows the user to specify the prefix
247 | prepended to each input .fcs file name to get the name of the
248 | corresponding output (i.e. homogenized) .fcs file name, and `verbose` is
249 | a boolean value (default = FALSE) specifying if chatty print statements
250 | should be made while the homogenization is performed.
251 | 
252 | Using these arguments, `cytofin_homogenize` can homogenize a set of
253 | CyTOF files with distinct antigen naming conventions. Specifically, the
254 | function uses the regular expressions in the panel to find each antigen’s
255 | synonymous channel names and replaces them with the standardized names
256 | given in the panel.
257 | 
258 | Example function call:
259 | 
260 | ``` r
261 | # define input paths 
262 | metadata_path <- 
263 |   system.file(
264 |     file.path("extdata", "test_metadata_raw.csv"), 
265 |     package = "cytofin"
266 |   )
267 | 
268 | panel_path <- 
269 |   system.file(
270 |     file.path("extdata", "test_panel.csv"), 
271 |     package = "cytofin"
272 |   )
273 | 
274 | input_data_path <- 
275 |   file.path(base_path, "raw_data")
276 | 
277 | validation_data_path <- 
278 |   file.path(base_path, "validation_data")
279 | 
280 | # define output path
281 | # --Change this line to wherever you want the output files saved!--
282 | output_data_path <- file.path(base_path, "homogenization_output")
283 | 
284 | # call homogenization function
285 | cytofin_homogenize(
286 |   metadata_path = metadata_path, 
287 |   panel_path = panel_path, 
288 |   input_data_path = input_data_path, 
289 |   output_data_path = output_data_path
290 | )
291 | ```
292 | 
293 | This function call will save homogenized .fcs files to the directory
294 | located at `output_data_path`. These files will be different from the
295 | input .fcs files in the `input_data_path` directory in that they will
296 | only contain channels whose antigen names match the `antigen_pattern`
297 | column of the reference panel located at `panel_path`. All other
298 | channels will be removed, and the names of the channels with matches in
299 | `antigen_pattern` will be standardized to the names given in the
300 | `antigen_name` column of the reference panel.
301 | 
302 | The input files for this homogenization run were as follows:
303 | 
304 | ``` r
305 | list.files(input_data_path, pattern = ".fcs$")
306 | #>  [1] "ALL05v2_Plate2_healthy basal1.fcs" "ALL05v2_Plate2_UPN94 das.fcs"     
307 | #>  [3] "ALL08_Plate8_Healthy03 basal.fcs"  "ALL08_Plate8_UPN26 basal.fcs"     
308 | #>  [5] "CRLF2_Plate1_Healthy 04 BCR.fcs"   "CRLF2_Plate1_UPN53 das + TSLP.fcs"
309 | #>  [7] "MS_Plate5_Healthy BM.fcs"          "MS_Plate5_SU978 Basal.fcs"        
310 | #>  [9] "SJ_Plate2_Healthy_BM.fcs"          "SJ_Plate2_TB010950_Basal.fcs"
311 | ```
312 | 
313 | …and the corresponding output files saved in the `output_data_path`
314 | directory are now as follows:
315 | 
316 | ``` r
317 | list.files(output_data_path, pattern = ".fcs$")
318 | #>  [1] "homogenized_ALL05v2_Plate2_healthy basal1.fcs"
319 | #>  [2] "homogenized_ALL05v2_Plate2_UPN94 das.fcs"     
320 | #>  [3] "homogenized_ALL08_Plate8_Healthy03 basal.fcs" 
321 | #>  [4] "homogenized_ALL08_Plate8_UPN26 basal.fcs"     
322 | #>  [5] "homogenized_CRLF2_Plate1_Healthy 04 BCR.fcs"  
323 | #>  [6] "homogenized_CRLF2_Plate1_UPN53 das + TSLP.fcs"
324 | #>  [7] "homogenized_MS_Plate5_Healthy BM.fcs"         
325 | #>  [8] "homogenized_MS_Plate5_SU978 Basal.fcs"        
326 | #>  [9] "homogenized_SJ_Plate2_Healthy_BM.fcs"         
327 | #> [10] "homogenized_SJ_Plate2_TB010950_Basal.fcs"
328 | ```
329 | 
330 | ### CyTOF batch normalization
331 | 
332 | After dataset homogenization, **batch correction** (or **batch
333 | normalization**) can be performed across datasets.
334 | 
335 | In short, `CytofIn` performs batch normalization through the use of
336 | user-identified **generalized anchors** - which are non-identical
337 | references assumed to have low variability across batches - that can be
338 | used to estimate batch effects from samples collated from heterogeneous
339 | sources. To batch normalize using healthy control samples (one per
340 | plate) as generalized anchors (which is ideal when such samples are
341 | available), use `cytofin_normalize`. To batch normalize using the
342 | antigen channels with the lowest variability across samples as
343 | generalized anchors (which is ideal when healthy samples are unavailable
344 | on all plates being analyzed), use `cytofin_normalize_nrs`.
345 | 
346 | The use of both of these functions is detailed below.
347 | 
348 | #### Batch normalization using external anchors (cytofin\_normalize)
349 | 
350 | ##### Overview
351 | 
352 | The `cytofin_normalize` function uses user-identified external anchors on each
353 | CyTOF plate being integrated to correct batch effects on a
354 | plate-to-plate basis. One sample on each CyTOF barcoding plate should be
355 | chosen as that plate’s external anchor. In general, external anchors
356 | should be chosen based on which samples are the most biologically
357 | similar to one another from plate to plate. For example, if healthy,
358 | non-stimulated samples are included on each CyTOF plate being
359 | integrated, the only expected variability between these samples other
360 | than batch effects would be person-to-person variability. Thus, these
361 | samples are likely to be biologically similar to one another and are
362 | suitable to be chosen as external anchors. Alternatively, if a single
363 | patient or cell line was included on every CyTOF plate being integrated,
364 | the samples corresponding to that patient or cell line on each plate
365 | would also be suitable as external anchor choices.
366 | 
367 | Once users have identified 1 external anchor per plate for `CytofIn`
368 | data integration, users must mark its row in the metadata table with a
369 | “1” in the `is_anchor` column (all other samples should be marked with
370 | “0”). `CytofIn` then uses these anchors to define a **universal mean**
371 | and **universal variance** that represent the central tendency and
372 | dispersion, respectively, of the target distribution to which all
373 | samples will be batch corrected. This correction will be performed with
374 | the user’s choice from one of five batch correction functions.
375 | 
376 | In short, `CytofIn`’s batch normalization procedure using external
377 | anchors has two steps:
378 | 
379 | 1.  Preparation of external anchors  
380 | 2.  Application of a transformation function that performs the batch
381 |     correction (of which `CytofIn` provides 5 options)
382 | 
383 | We detail function calls for each of these steps below.
384 | 
385 | ##### Step 1 - Anchor preparation
386 | 
387 | The `cytofin_prep_anchors` function concatenates the identified anchor
388 | files and then calculates summary statistics that are used for batch
389 | correction in later steps of the pipeline. First, `CytofIn` calculates
390 | the mean and standard deviation of each channel in the homogenized
391 | dataset across all cells from samples identified as external anchors.
392 | These values represent the overall central tendency and dispersion,
393 | respectively, of each channel among the anchor samples on each CyTOF
394 | plate; thus, we call them the **universal means** and **universal
395 | variances** of the `CytofIn` integration. Accordingly, the universal
396 | mean and universal variance vectors will each have *g* elements, where
397 | *g* is the number of channels in the consensus antigen panel in the
398 | panel information table. The universal mean and universal variance
399 | vectors are used in the `meanshift`, `variance`, `z-score`, and
400 | `beadlike` methods of batch correction (see below).
401 | 
402 | In addition, the mean of all of the elements of the universal mean
403 | vector (i.e. the mean of all channel means) and the mean of all of the
404 | elements of the universal variance vector (i.e. the mean of all channel
405 | variances) are calculated. These values represent the central tendency
406 | and dispersion of antigen measurements in general among the anchor
407 | samples on each CyTOF plate and are thus no longer
408 | channel-specific. Thus, we call them the *bulk mean* and *bulk
409 | variance*, and they are used in the `meanshift_bulk` batch correction
410 | method implemented in `cytofin_normalize`.
411 | 
412 | To calculate these values, we use the `cytofin_prep_anchors` function.
413 | `cytofin_prep_anchors` returns the universal mean vector, universal
414 | variance vector, bulk mean, and bulk variance as a `list()`. In
415 | addition, users are given an option to save these statistics as an .rds
416 | file in a specified directory in order to avoid performing redundant
417 | calculations in future analyses.
418 | 
419 | Specifically, `cytofin_prep_anchors` takes the following 4 main arguments:
420 | 
421 | -   `metadata_path`: A file path leading to an .xlsx or .csv file
422 |     containing a metadata table with information about each file to be
423 |     analyzed. This file should be identical to that used for
424 |     `cytofin_homogenize`.
425 | -   `panel_path`: A file path leading to an .xlsx or .csv file
426 |     containing information about the standardized antigen panel in the
427 |     homogenized dataset. This file should be identical to that used for
428 |     `cytofin_homogenize`.
429 | -   `input_data_path`: A directory containing the input
430 |     .FCS files from which to draw summary statistics
431 | -   `output_path`: A directory where the output .rds and
432 |     .FCS files will be saved. The default is “none”, in which case no
433 |     output files will be stored (and the only effect of the function
434 |     will be to return the calculated statistics as a `list()`).
435 | 
436 | In addition, `cytofin_prep_anchors` also takes 2 optional arguments
437 | relating to the conventional arcsinh transformation performed on the raw
438 | ion counts of the input data. These optional arguments are as follows:
439 | 
440 | -   `shift_factor`: The scalar value `a` in the following equation used
441 |     to transform CyTOF raw data ion counts using the hyperbolic arcsinh
442 |     function: `new_x <- asinh(a + b * x)`. Defaults to 0.
443 | 
444 | -   `scale_factor`: The scalar value `b` in the following equation used
445 |     to transform CyTOF raw data ion counts using the hyperbolic arcsinh
446 |     function: `new_x <- asinh(a + b * x)`. Defaults to 0.2.
447 | 
448 | Finally, here is an example function call of `cytofin_prep_anchors`:
449 | 
450 | ``` r
451 | input_data_path <- file.path(base_path, "homogenization_output")
452 | output_path <- file.path(base_path, "anchor_prep_output")
453 | 
454 | anchor_statistics <- 
455 |   cytofin_prep_anchors(
456 |     metadata_path = metadata_path, 
457 |     panel_path = panel_path, 
458 |     input_data_path = input_data_path, 
459 |     output_path = output_path
460 |   )
461 | 
462 | print(anchor_statistics)
463 | #> $universal_var
464 | #>         Time Event_length    (Pd102)Di    (Pd104)Di    (Pd105)Di    (Pd106)Di 
465 | #>   1.28235792   0.16399756   6.78770451   0.89290897   5.74351522   4.00916670 
466 | #>    (Pd108)Di    (Pd110)Di    (In113)Di    (In115)Di    (La139)Di    (Pr141)Di 
467 | #>   6.47944462   6.14839951   3.14291787   3.69776978   0.31651260   0.20067263 
468 | #>    (Nd142)Di    (Nd143)Di    (Nd144)Di    (Nd145)Di    (Nd146)Di    (Sm147)Di 
469 | #>   0.88280840   0.50837979   0.18512779   0.27893442   0.79089548   1.30174061 
470 | #>    (Nd148)Di    (Sm149)Di    (Nd150)Di    (Sm152)Di    (Eu153)Di    (Sm154)Di 
471 | #>   1.53148051   0.24234410   0.19237185   0.78984151   3.36668746   0.64687396 
472 | #>    (Gd156)Di    (Gd158)Di    (Gd160)Di    (Dy161)Di    (Dy162)Di    (Dy163)Di 
473 | #>   0.62963342   0.21865740   2.88801028   0.07940630   0.12194444   0.07128214 
474 | #>    (Dy164)Di    (Ho165)Di    (Er166)Di    (Er167)Di    (Er168)Di    (Er170)Di 
475 | #>   0.44285804   1.04235848   0.28206380   4.31831331   3.59089444   3.35406088 
476 | #>    (Yb171)Di    (Yb172)Di    (Yb173)Di    (Yb174)Di    (Lu175)Di    (Yb176)Di 
477 | #>   1.95310084   0.67905696   0.13911985   6.12832312   1.77734024   0.53625671 
478 | #>    (Ir191)Di    (Ir193)Di 
479 | #>   3.21574811   3.27089639 
480 | #> 
481 | #> $universal_mean
482 | #>         Time Event_length    (Pd102)Di    (Pd104)Di    (Pd105)Di    (Pd106)Di 
483 | #>  14.50995327   2.30820954   3.48055714   1.06062913   4.08199057   4.77092034 
484 | #>    (Pd108)Di    (Pd110)Di    (In113)Di    (In115)Di    (La139)Di    (Pr141)Di 
485 | #>   2.69248853   3.31279576   1.34656332   2.31588156   0.35046633   0.19319399 
486 | #>    (Nd142)Di    (Nd143)Di    (Nd144)Di    (Nd145)Di    (Nd146)Di    (Sm147)Di 
487 | #>   0.57791130   0.34730008   0.20086489   0.34646560   0.61382685   0.56851774 
488 | #>    (Nd148)Di    (Sm149)Di    (Nd150)Di    (Sm152)Di    (Eu153)Di    (Sm154)Di 
489 | #>   1.13302732   0.15299272   0.19208744   0.43406391   2.13362865   0.45270859 
490 | #>    (Gd156)Di    (Gd158)Di    (Gd160)Di    (Dy161)Di    (Dy162)Di    (Dy163)Di 
491 | #>   0.34711746   0.17472376   1.30261426   0.11212254   0.13257570   0.07266354 
492 | #>    (Dy164)Di    (Ho165)Di    (Er166)Di    (Er167)Di    (Er168)Di    (Er170)Di 
493 | #>   0.22465161   0.48758658   0.28522175   2.63843957   2.43044297   0.80540655 
494 | #>    (Yb171)Di    (Yb172)Di    (Yb173)Di    (Yb174)Di    (Lu175)Di    (Yb176)Di 
495 | #>   1.30095098   0.65077576   0.15830507   2.43474419   1.14821570   0.54578885 
496 | #>    (Ir191)Di    (Ir193)Di 
497 | #>   3.80031272   4.48577210 
498 | #> 
499 | #> $bulk_var
500 | #> [1] 1.467075
501 | #> 
502 | #> $bulk_mean
503 | #> [1] 0.969387
504 | ```
505 | 
506 | As shown above, the returned value is a list with 4 items in it: the
507 | universal variance vector (`universal_var`), the universal mean vector
508 | (`universal_mean`), the bulk variance (`bulk_var`) and the bulk mean
509 | (`bulk_mean`). Note that the elements of `universal_var` and
510 | `universal_mean` are named with their corresponding metal names (not
511 | antigen names), as this interfaces a bit more conveniently with the
512 | `flowCore` functions that `CytofIn` uses under-the-hood.
513 | 
514 | Importantly, you only need to use `cytofin_prep_anchors` if you plan to
515 | batch normalize your .fcs files using external anchors identified on
516 | each plate (using `cytofin_normalize`). If you plan to batch normalize
517 | your .fcs files using non-redundancy scores from each sample’s most
518 | stable channels (using `cytofin_normalize_nrs`), you do not need to run
519 | `cytofin_prep_anchors` first.
520 | 
521 | ##### Step 2 - Batch normalization
522 | 
523 | After the anchors’ summary statistics are computed, batch correction
524 | using external anchors can be performed using
525 | `cytofin_normalize`. This function can perform batch correction using 5
526 | different normalization functions (which we call “modes”).
527 | Specifically, the options are called the “meanshift”, “meanshift\_bulk”,
528 | “variance”, “z-score”, and “beadlike” normalization functions. Which of
529 | these is most applicable to a given analysis will differ from user to
530 | user. We recommend that users try several of them and then manually
531 | inspect/visualize the batch-corrected data in order to determine which
532 | method they prefer.
533 | 
534 | To perform batch normalization using external anchors identified on each
535 | plate, use `cytofin_normalize`. This batch normalization strategy
536 | assumes that the anchors on each plate are relatively similar to one
537 | another, and it uses this similarity to adjust the marker expression
538 | measurements on each plate based on how much each plate’s anchor differs
539 | from the other anchors. The `cytofin_normalize` function takes several
540 | required arguments:
541 | 
542 | -   `metadata_path`: A file path leading to an .xlsx or .csv file
543 |     containing a metadata table with information about each file to be
544 |     analyzed. This file should be identical to that used for
545 |     `cytofin_homogenize`.
546 | -   `panel_path`: A file path leading to an .xlsx or .csv file
547 |     containing information about the standardized antigen panel in the
548 |     homogenized dataset. This file should be identical to that used for
549 |     `cytofin_homogenize`.
550 | -   `anchor_statistics`: Either a list of numeric values produced by the
551 |     `cytofin_prep_anchors` function or a connection leading to an .rds
552 |     object containing anchor statistics.
553 | -   `input_data_path`: A directory containing the input
554 |     .fcs files to be batch normalized. In most cases, this will be the
555 |     directory to which the output .FCS files from `cytofin_homogenize`
556 |     were written.
557 | -   `output_data_path`: A directory where the output
558 |     (i.e. batch normalized) .FCS files will be written.
559 | -   `mode`: A string indicating which transformation function should be
560 |     used for batch normalization (“meanshift”, “meanshift\_bulk”,
561 |     “variance”, “z-score”, or “beadlike”).
562 | 
563 | In addition to these required arguments, `cytofin_normalize` takes
564 | several optional arguments:
565 | 
566 | -   `input_prefix`: The string that was prepended to the name of the raw
567 |     input .fcs files of `cytofin_homogenize` to create their
568 |     corresponding output file names. Defaults to “homogenized\_”.
569 | 
570 | -   `output_prefix`: The string to be prepended to the name of each input
571 |     .fcs file to create the name of the corresponding output file
572 |     (post-normalization). Defaults to “normalized\_”.
573 | 
574 | -   `shift_factor` and `scale_factor`: The scalar values *a* and *b*,
575 |     respectively, to be used in the hyperbolic arc-sine function used to
576 |     transform CyTOF ion counts according to the following equation:
577 |     `new_x <- asinh(a + b * x)`. `shift_factor` defaults to 0 and
578 |     `scale_factor` defaults to 0.2, which are customary values used by
579 |     most scientists in the CyTOF community.
580 | 
581 | Using these arguments, a call to `cytofin_normalize` will perform the
582 | batch correction and save the output (i.e. batch normalized) .fcs files
583 | to the directory specified by `output_data_path`. An example function
584 | call is given here:
585 | 
586 | ``` r
587 | output_data_path <- 
588 |   file.path(base_path, "normalization_results")
589 | 
590 | norm_result <- 
591 |   cytofin_normalize(
592 |     metadata_path = metadata_path, 
593 |     panel_path = panel_path, 
594 |     anchor_statistics = anchor_statistics, 
595 |     input_data_path = input_data_path, 
596 |     output_data_path = output_data_path, 
597 |     mode = "meanshift"
598 |   )
599 | ```
600 | 
601 | When this function is called, it has two effects. The first is to save
602 | the batch-normalized output .fcs files to the `output_data_path`
603 | directory. The second is to return a data.frame that stores mean and
604 | variance information about each input file (as well as its associated
605 | anchor) both before and after normalization. This data.frame can be
606 | passed directly into the `cytofin_make_plots` function to return 8
607 | diagnostic plots per sample illustrating the quality of the
608 | normalization:
609 | 
610 | ``` r
611 | # we make only the plot for the first input .fcs file
612 | # for illustrative purposes
613 | cytofin_make_plots(
614 |   normalization_result = norm_result,
615 |   which_rows = 1,
616 |   val_path = "none"
617 | )
618 | ```
619 | 
620 | <img src="man/figures/README-unnamed-chunk-15-1.png" width="100%" />
621 | 
622 | #### Batch normalization using internal anchors (cytofin\_normalize\_nrs)
623 | 
624 | In the event that external anchors are not available, `CytofIn` can use
625 | “internal anchors” within each sample for batch normalization.
626 | Specifically, instead of defining a single external anchor for all the
627 | samples on a given plate like `cytofin_normalize`, the
628 | `cytofin_normalize_nrs` function identifies the most stable channels in
629 | the dataset overall and uses them as internal anchors that are used to
630 | batch normalize all other channels from sample-to-sample. A schematic
631 | diagram of how `cytofin_normalize_nrs` works is provided below:
632 | 
633 | ![](./inst/images/image3.png)
634 | 
635 | In words, to identify the most stable channels in the combined dataset,
636 | `CytofIn` uses a PCA-based non-redundancy score (NRS) as described
637 | before (see [here](https://pubmed.ncbi.nlm.nih.gov/26095251/)). A
638 | minimum of 3 channels should be selected to establish an internal
639 | reference from which signals can be calibrated between CyTOF files.
640 | 
641 | To do this, `cytofin_normalize_nrs` takes several of the same arguments
642 | as `cytofin_normalize`, defined as above: `metadata_path`, `panel_path`,
643 | `input_data_path`, `output_data_path`, `input_prefix`, `output_prefix`,
644 | `shift_factor`, and `scale_factor`. In addition, it takes the following
645 | optional arguments:
646 | 
647 | -   `nchannels`: An integer representing the number of “most stable”
648 |     (i.e. with the lowest non-redundancy scores) channels that should be
649 |     used for batch normalization. Defaults to 3.
650 | 
651 | -   `make_plot`: A boolean value representing if, in addition to its
652 |     other effects, `cytofin_normalize_nrs` should return a plot
653 |     illustrating the distribution of non-redundancy scores for each
654 |     channel among all .fcs files being batch normalized. Defaults to
655 |     FALSE.
656 | 
657 | These arguments can be used in a function call as follows:
658 | 
659 | ``` r
660 | # path to save the normalized .fcs files
661 | output_data_path <- 
662 |   file.path(base_path, "normalization_nrs_results")
663 | 
664 | # call function
665 | norm_result_nrs <- 
666 |   cytofin_normalize_nrs(
667 |     metadata_path = metadata_path, 
668 |     panel_path = panel_path, 
669 |     input_data_path = input_data_path, 
670 |     output_data_path = output_data_path, 
671 |     nchannels = 3, 
672 |     make_plot = FALSE
673 |   )
674 | ```
675 | 
676 | Just like `cytofin_normalize` above, `cytofin_normalize_nrs` has several
677 | effects. First, it writes batch-normalized .fcs files to
678 | `output_data_path` and makes a plot depicting sample-wise and
679 | channel-wise non-redundancy scores according to the value of
680 | `make_plot`. In addition, it returns a data.frame that can be passed
681 | into `cytofin_make_plots` to make diagnostic plots regarding the batch
682 | normalization procedure:
683 | 
684 | ``` r
685 | # show only 1 set of plots for illustrative purposes
686 | cytofin_make_plots(
687 |   normalization_result = norm_result_nrs, 
688 |   which_rows = 7, 
689 |   val_path = validation_data_path
690 | )
691 | ```
692 | 
693 | <img src="man/figures/README-unnamed-chunk-17-1.png" width="100%" />
694 | 
695 | ## Additional Information
696 | 
697 | For questions about the `cytofin` R package, please email
698 | <kardavis@stanford.edu> or open a GitHub issue
699 | [here](https://github.com/bennyyclo/Cytofin).
700 | 
701 | ``` r
702 | # session information for rendering this README file
703 | sessionInfo()
704 | #> R version 4.0.3 (2020-10-10)
705 | #> Platform: x86_64-apple-darwin17.0 (64-bit)
706 | #> Running under: macOS Big Sur 10.16
707 | #> 
708 | #> Matrix products: default
709 | #> BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
710 | #> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
711 | #> 
712 | #> locale:
713 | #> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
714 | #> 
715 | #> attached base packages:
716 | #> [1] stats     graphics  grDevices utils     datasets  methods   base     
717 | #> 
718 | #> other attached packages:
719 | #> [1] cytofin_0.0.0.9000
720 | #> 
721 | #> loaded via a namespace (and not attached):
722 | #>  [1] Rcpp_1.0.6          highr_0.9           compiler_4.0.3     
723 | #>  [4] pillar_1.6.0        cytolib_2.2.1       tools_4.0.3        
724 | #>  [7] digest_0.6.27       evaluate_0.14       lifecycle_1.0.0    
725 | #> [10] tibble_3.1.0        pkgconfig_2.0.3     rlang_0.4.10       
726 | #> [13] DBI_1.1.1           yaml_2.2.1          parallel_4.0.3     
727 | #> [16] xfun_0.22           dplyr_1.0.5         stringr_1.4.0      
728 | #> [19] knitr_1.32          hms_1.0.0           generics_0.1.0     
729 | #> [22] S4Vectors_0.28.1    vctrs_0.3.7         stats4_4.0.3       
730 | #> [25] tidyselect_1.1.0    glue_1.4.2          Biobase_2.50.0     
731 | #> [28] R6_2.5.0            fansi_0.4.2         rmarkdown_2.7      
732 | #> [31] readr_1.4.0         tidyr_1.1.3         RProtoBufLib_2.2.0 
733 | #> [34] purrr_0.3.4         magrittr_2.0.1      matrixStats_0.58.0 
734 | #> [37] htmltools_0.5.1.1   ellipsis_0.3.1      BiocGenerics_0.36.1
735 | #> [40] assertthat_0.2.1    flowCore_2.2.0      utf8_1.2.1         
736 | #> [43] stringi_1.5.3       RcppParallel_5.1.2  crayon_1.4.1
737 | ```
738 | 


--------------------------------------------------------------------------------