├── vignettes
├── .gitignore
└── cytofin.Rmd
├── LICENSE
├── .Rbuildignore
├── inst
├── images
│ ├── image1.png
│ ├── image2.png
│ └── image3.png
└── extdata
│ ├── test_metadata_raw.csv
│ └── test_panel.csv
├── man
├── figures
│ ├── README-unnamed-chunk-11-1.png
│ ├── README-unnamed-chunk-11-10.png
│ ├── README-unnamed-chunk-11-2.png
│ ├── README-unnamed-chunk-11-3.png
│ ├── README-unnamed-chunk-11-4.png
│ ├── README-unnamed-chunk-11-5.png
│ ├── README-unnamed-chunk-11-6.png
│ ├── README-unnamed-chunk-11-7.png
│ ├── README-unnamed-chunk-11-8.png
│ ├── README-unnamed-chunk-11-9.png
│ ├── README-unnamed-chunk-12-1.png
│ ├── README-unnamed-chunk-14-1.png
│ ├── README-unnamed-chunk-15-1.png
│ ├── README-unnamed-chunk-16-1.png
│ └── README-unnamed-chunk-17-1.png
├── homogenize_flowFrame.Rd
├── get_extension.Rd
├── cytofin-package.Rd
├── rev_asinh.Rd
├── cytofin_read_metadata.Rd
├── cytofin_read_panel_info.Rd
├── cytofin_generate_panel_template.Rd
├── cytofin_generate_metadata_template.Rd
├── cytofin_homogenize.Rd
├── cytofin_prep_anchors.Rd
├── cytofin_make_plots.Rd
├── cytofin_normalize.Rd
└── cytofin_normalize_nrs.Rd
├── R
├── cytofin-package.R
├── file_templates.R
├── cytofin_homogenize.R
├── utils.R
├── cytofin_prep_anchors.R
├── cytofin_normalize.R
├── cytofin_make_plots.R
└── cytofin_normalize_nrs.R
├── NAMESPACE
├── LICENSE.md
├── DESCRIPTION
├── README.Rmd
└── README.md
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: cytofin authors
3 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^LICENSE\.md$
5 |
--------------------------------------------------------------------------------
/inst/images/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/inst/images/image1.png
--------------------------------------------------------------------------------
/inst/images/image2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/inst/images/image2.png
--------------------------------------------------------------------------------
/inst/images/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/inst/images/image3.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-10.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-2.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-3.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-4.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-5.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-6.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-7.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-8.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-11-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-11-9.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-12-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-14-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-15-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-16-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bennyyclo/Cytofin/HEAD/man/figures/README-unnamed-chunk-17-1.png
--------------------------------------------------------------------------------
/R/cytofin-package.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | "_PACKAGE"
3 |
4 | # The following block is used by usethis to automatically manage
5 | # roxygen namespace tags. Modify with care!
6 | ## usethis namespace: start
7 |
8 | ## usethis namespace: end
9 | NULL
10 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(cytofin_generate_metadata_template)
4 | export(cytofin_generate_panel_template)
5 | export(cytofin_homogenize)
6 | export(cytofin_make_plots)
7 | export(cytofin_normalize)
8 | export(cytofin_normalize_nrs)
9 | export(cytofin_prep_anchors)
10 |
--------------------------------------------------------------------------------
/man/homogenize_flowFrame.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{homogenize_flowFrame}
4 | \alias{homogenize_flowFrame}
5 | \title{Alter a flowFrame to only include data from channels in a reference panel}
6 | \usage{
7 | homogenize_flowFrame(fcs_raw, ref_panel)
8 | }
9 | \arguments{
10 | \item{fcs_raw}{A flowFrame containing unprocessed CyTOF data}
11 |
12 | \item{ref_panel}{A data.frame representing the reference panel data for a
13 | cytofin analysis.}
14 | }
15 | \value{
16 | a homogenized flowFrame
17 | }
18 | \description{
19 | Alter a flowFrame to only include data from channels in a reference panel
20 | }
21 |
--------------------------------------------------------------------------------
/man/get_extension.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{get_extension}
4 | \alias{get_extension}
5 | \title{Find the extension for a file}
6 | \usage{
7 | get_extension(filename)
8 | }
9 | \arguments{
10 | \item{filename}{A string representing the name of a file in its local directory}
11 | }
12 | \value{
13 | The file extension of \code{filename}
14 | }
15 | \description{
16 | Find the extension for a file
17 | }
18 | \examples{
19 | \dontrun{
20 | # example file name
21 | my_filename <- "my_file.txt"
22 |
23 | # find and print the extension
24 | my_extension <- cytofin:::get_extension(my_filename)
25 | print(my_extension)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/man/cytofin-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cytofin-package.R
3 | \docType{package}
4 | \name{cytofin-package}
5 | \alias{cytofin}
6 | \alias{cytofin-package}
7 | \title{cytofin: Integrate CyTOF Datasets From Heterogeneous Sources}
8 | \description{
9 | Integrate multiple CyTOF datasets collected from independent
10 | sources (e.g. labs, institutions, etc.). Cytofin performs CyTOF panel alignment
11 | across datasets ("homogenization") as well as batch correction using generalized
12 | anchors identified on each CyTOF plate ("normalization").
13 | }
14 | \author{
15 | \strong{Maintainer}: Ben Lo \email{bennylo@stanford.edu}
16 |
17 | Authors:
18 | \itemize{
19 | \item Timothy Keyes \email{tkeyes@stanford.edu} (\href{https://orcid.org/0000-0003-0423-9679}{ORCID})
20 | }
21 |
22 | Other contributors:
23 | \itemize{
24 | \item Kara Davis \email{kardavis@stanford.edu} [research team head, owner]
25 | }
26 |
27 | }
28 | \keyword{internal}
29 |
--------------------------------------------------------------------------------
/man/rev_asinh.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{rev_asinh}
4 | \alias{rev_asinh}
5 | \title{Reverses arcsinh transformation with cofactor \code{scale_factor} and a shift of \code{shift_factor}.}
6 | \usage{
7 | rev_asinh(x, shift_factor, scale_factor)
8 | }
9 | \arguments{
10 | \item{x}{A numeric vector.}
11 |
12 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
13 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
14 | \code{new_x <- asinh(a + b * x)}.}
15 |
16 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
17 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
18 | \code{new_x <- asinh(a + b * x)}.}
19 | }
20 | \value{
21 | A numeric vector after undergoing reverse
22 | arcsinh transformation
23 | }
24 | \description{
25 | Reverses arcsinh transformation with cofactor \code{scale_factor} and a shift of \code{shift_factor}.
26 | }
27 |
--------------------------------------------------------------------------------
/inst/extdata/test_metadata_raw.csv:
--------------------------------------------------------------------------------
1 | filename,cohort,plate_number,patient_id,condition,is_anchor,validation
2 | ALL05v2_Plate2_UPN94 das.fcs,ALL05v2,plate2,UPN94,Das,0,homogenized_ALL05v2_plate2_UPN94 das.fcs
3 | ALL08_Plate8_UPN26 basal.fcs,ALL08,plate8,UPN26,Basal,0,homogenized_ALL08_plate8_UPN26 basal.fcs
4 | CRLF2_Plate1_UPN53 das + TSLP.fcs,CRLF2,plate1,UPN53,das_TSLP,0,homogenized_CRLF2_plate1_UPN53 das + TSLP.fcs
5 | ALL05v2_Plate2_healthy basal1.fcs,ALL05v2,plate2,Healthy,Basal,1,homogenized_ALL05v2_plate2_healthy basal1.fcs
6 | ALL08_Plate8_Healthy03 basal.fcs,ALL08,plate8,Healthy03,Basal,1,homogenized_ALL08_plate8_Healthy03 basal.fcs
7 | CRLF2_Plate1_Healthy 04 BCR.fcs,CRLF2,plate1,Healthy04,BCR,1,homogenized_CRLF2_plate1_Healthy 04 BCR.fcs
8 | MS_Plate5_SU978 Basal.fcs,MajSak,plate5,SU978,Basal,0,homogenized_MajSak_plate5_SU978 Basal.fcs
9 | MS_Plate5_Healthy BM.fcs,MajSak,plate5,Healthy,BM,1,homogenized_MajSak_plate5_Healthy BM.fcs
10 | SJ_Plate2_TB010950_Basal.fcs,StJude,plate2,TB010950,Basal,0,homogenized_StJude_plate2_TB010950_Basal.fcs
11 | SJ_Plate2_Healthy_BM.fcs,StJude,plate2,Healthy,BM,1,homogenized_StJude_plate2_Healthy_BM.fcs
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2021 cytofin authors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/man/cytofin_read_metadata.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{cytofin_read_metadata}
4 | \alias{cytofin_read_metadata}
5 | \title{Read in a cytofin metadata file}
6 | \usage{
7 | cytofin_read_metadata(metadata_path)
8 | }
9 | \arguments{
10 | \item{metadata_path}{A filepath leading to an .xlsx or .csv file
11 | containing a table of CyTOF file (.fcs file) names. Columns should include
12 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
13 | and \code{validation}. TO DO: Change the names of these columns to more descriptive
14 | names and make sure that they are all actually needed.
15 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
16 | }
17 | \value{
18 | A data.frame containing the metadata information in the
19 | file stored at \code{metadata_path}.
20 | }
21 | \description{
22 | This function reads a cytofin metadata file from a connection
23 | that points to a .csv or a .xlsx file
24 | }
25 | \examples{
26 | \dontrun{
27 | my_path <- file.path("~", "foo", "bar", "metadata.csv")
28 | my_metadata <- cytofin:::cytofin_read_metadata(my_path)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/man/cytofin_read_panel_info.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/utils.R
3 | \name{cytofin_read_panel_info}
4 | \alias{cytofin_read_panel_info}
5 | \title{Read in a cytofin reference panel information}
6 | \usage{
7 | cytofin_read_panel_info(panel_path)
8 | }
9 | \arguments{
10 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
11 | a table of standardized antigen panel information. Columns should include
12 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern}, \code{lineage}, \code{functional},
13 | and \code{general}. TO DO: Change the names of these columns to more descriptive
14 | names and make sure that they are all actually needed.
15 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
16 | }
17 | \value{
18 | A data.frame containing the reference panel information in the
19 | file stored at \code{panel_path}.
20 | }
21 | \description{
22 | This function reads cytofin reference panel information from a connection
23 | that points to a .csv or a .xlsx file
24 | }
25 | \examples{
26 | \dontrun{
27 | my_path <- file.path("~", "foo", "bar", "panel.csv")
28 | my_metadata <- cytofin:::cytofin_read_panel_info(my_path)
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/man/cytofin_generate_panel_template.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/file_templates.R
3 | \name{cytofin_generate_panel_template}
4 | \alias{cytofin_generate_panel_template}
5 | \title{Generate a template for a cytofin reference panel file}
6 | \usage{
7 | cytofin_generate_panel_template(
8 | file_name = "template_panel_info.csv",
9 | template_path = getwd()
10 | )
11 | }
12 | \arguments{
13 | \item{file_name}{A string representing the name of the .csv file to be
14 | saved in the directory specified by \code{template_path}. Defaults to
15 | "template_panel_info.csv"}
16 |
17 | \item{template_path}{Path to the directory in which the template file should be
18 | written. Defaults to the current working directory}
19 | }
20 | \description{
21 | \code{cytofin_generate_panel_template} creates a template reference panel .csv file
22 | (with the correct columns and dummy example data) in a specified location.
23 | }
24 | \examples{
25 |
26 | # specify the path where you'd like to store the template file
27 | my_name <- "panel_template.csv"
28 | my_path <- file.path("~", "Desktop", "template_folder")
29 |
30 | # generate the template file, which then can be edited manually
31 | cytofin_generate_panel_template(
32 | file_name = my_name,
33 | template_path = my_path
34 | )
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/man/cytofin_generate_metadata_template.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/file_templates.R
3 | \name{cytofin_generate_metadata_template}
4 | \alias{cytofin_generate_metadata_template}
5 | \title{Generate a template for a cytofin metadata file}
6 | \usage{
7 | cytofin_generate_metadata_template(
8 | file_name = "template_metadata.csv",
9 | template_path = getwd()
10 | )
11 | }
12 | \arguments{
13 | \item{file_name}{A string representing the name of the .csv file to be
14 | saved in the directory specified by \code{template_path}. Defaults to
15 | "template_metadata.csv"}
16 |
17 | \item{template_path}{Path to the directory in which the template file should be
18 | written. Defaults to the current working directory}
19 | }
20 | \description{
21 | \code{cytofin_generate_metadata_template} creates a template metadata .csv file
22 | (with the correct columns and dummy example data) in a specified location.
23 | }
24 | \examples{
25 | # specify the path where you'd like to store the template file
26 | my_name <- "metadata_template.csv"
27 | my_path <- file.path("~", "Desktop", "template_folder")
28 |
29 |
30 | # generate the template file, which then can be edited manually
31 | cytofin_generate_metadata_template(
32 | file_name = my_name,
33 | template_path = my_path
34 | )
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: cytofin
2 | Title: Integrate CyTOF Datasets From Heterogeneous Sources
3 | Version: 0.0.0.9000
4 | Authors@R:
5 | c(person(given = "Ben",
6 | family = "Lo",
7 | role = c("aut", "cre"),
8 | email = "bennylo@stanford.edu"),
9 | person(given = "Timothy",
10 | family = "Keyes",
11 | role = "aut",
12 | email = "tkeyes@stanford.edu",
13 | comment = c(ORCID = "0000-0003-0423-9679")),
14 | person(given = "Kara",
15 | family = "Davis",
16 | role = c("rth", "own"),
17 | email = "kardavis@stanford.edu"))
18 | Description: Integrate multiple CyTOF datasets collected from independent
19 | sources (e.g. labs, institutions, etc.). Cytofin performs CyTOF panel alignment
20 | across datasets ("homogenization") as well as batch correction using generalized
21 | anchors identified on each CyTOF plate ("normalization").
22 | License: MIT + file LICENSE
23 | Encoding: UTF-8
24 | LazyData: true
25 | Roxygen: list(markdown = TRUE)
26 | RoxygenNote: 7.1.1
27 | Imports:
28 | flowCore,
29 | readxl,
30 | stringr,
31 | tidyr,
32 | reshape2,
33 | ggplot2,
34 | readr,
35 | dplyr
36 | Suggests:
37 | rmarkdown,
38 | knitr,
39 | testthat (>= 3.0.0)
40 | Config/testthat/edition: 3
41 | VignetteBuilder: knitr
42 |
--------------------------------------------------------------------------------
/inst/extdata/test_panel.csv:
--------------------------------------------------------------------------------
1 | metal_name,antigen_name,antigen_pattern,lineage,functional,general
2 | Time,Time,[Tt]ime,0,0,1
3 | Event_length,Event_length,ength,0,0,1
4 | (Pd102)Di,BC1,BC1,0,0,1
5 | (Pd104)Di,BC2,BC2,0,0,1
6 | (Pd105)Di,BC3,BC3,0,0,1
7 | (Pd106)Di,BC4,BC4,0,0,1
8 | (Pd108)Di,BC5,BC5,0,0,1
9 | (Pd110)Di,BC6,BC6,0,0,1
10 | (In113)Di,CD235_CD61,CD235,1,0,0
11 | (In115)Di,CD45,CD45,1,0,0
12 | (La139)Di,cPARP,PARP,0,1,0
13 | (Pr141)Di,pPLCg1_2,pPLCg1_2,0,1,0
14 | (Nd142)Di,CD19,CD19,1,0,0
15 | (Nd143)Di,CD22,CD22,1,0,0
16 | (Nd144)Di,p4EBP1,p4EBP1,0,1,0
17 | (Nd145)Di,tIkaros,tIkaros,1,0,0
18 | (Nd146)Di,CD79b,CD79b,1,0,0
19 | (Sm147)Di,CD20,CD20,1,0,0
20 | (Nd148)Di,CD34,CD34,1,0,0
21 | (Sm149)Di,CD179a,CD179a,1,0,0
22 | (Nd150)Di,pSTAT5,pSTAT5,0,1,0
23 | (Sm152)Di,Ki67,Ki67,0,1,0
24 | (Eu153)Di,IgMi,IgMi,1,0,0
25 | (Sm154)Di,Kappa_lambda,appa,0,1,0
26 | (Gd156)Di,CD10,CD10,1,0,0
27 | (Gd158)Di,CD179b,CD179b,1,0,0
28 | (Gd160)Di,CD24,CD24,1,0,0
29 | (Dy161)Di,TSLPr,TSLPr,0,1,0
30 | (Dy162)Di,CD127,CD127,1,0,0
31 | (Dy163)Di,RAG1,RAG1,1,0,0
32 | (Dy164)Di,TdT,Td,1,0,0
33 | (Ho165)Di,Pax5,Pax5,1,0,0
34 | (Er166)Di,pSyk,pSyk,0,1,0
35 | (Er167)Di,CD43,CD43,1,0,0
36 | (Er168)Di,CD38,CD38,1,0,0
37 | (Er170)Di,CD3,CD3^,1,0,0
38 | (Yb171)Di,CD33,FITC|CD33,0,1,0
39 | (Yb172)Di,pS6,pS6,0,1,0
40 | (Yb173)Di,pErk,pErk,0,1,0
41 | (Yb174)Di,HLADR,HLADR,1,0,0
42 | (Lu175)Di,IgMs,IgMs,1,0,0
43 | (Yb176)Di,pCreb,pCreb,0,1,0
44 | (Ir191)Di,DNA1,DNA1,0,1,0
45 | (Ir193)Di,DNA2,DNA2,0,1,0
--------------------------------------------------------------------------------
/man/cytofin_homogenize.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cytofin_homogenize.R
3 | \name{cytofin_homogenize}
4 | \alias{cytofin_homogenize}
5 | \title{Homogenize CyTOF channels names using a consensus antigen panel}
6 | \usage{
7 | cytofin_homogenize(
8 | metadata_path,
9 | panel_path,
10 | input_data_path,
11 | output_data_path,
12 | prefix = "homogenized_",
13 | verbose = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{metadata_path}{A file path leading to an .xlsx or .csv file
18 | containing a table of CyTOF file (.fcs file) names in the first column (\code{filename})
19 | and additional information about each .fcs file in subsequent columns.
20 | Columns should include \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id},
21 | \code{condition}, \code{is_anchor}, and \code{validation}.}
22 |
23 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
24 | a table of standardized antigen panel information. Columns should include
25 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern},
26 | \code{lineage}, \code{functional}, and \code{general}.}
27 |
28 | \item{input_data_path}{A folder directory containing the input .fcs files
29 | to be homogenized.}
30 |
31 | \item{output_data_path}{A folder directory to which the output (i.e.
32 | homogenized) .fcs files should be written.}
33 |
34 | \item{prefix}{A string prepended to the name of each input file to create the
35 | name of the corresponding output file (post-homogenization). Defaults to
36 | "homogenized_" (e.g. an input file named "file1.fcs" will correspond to
37 | the output file "homogenized_file1.fcs" saved in \code{output_data_path}).}
38 |
39 | \item{verbose}{A boolean value indicating whether progress messages should be
40 | printed to the console during homogenization. Defaults to FALSE.}
41 | }
42 | \value{
43 | \code{cytofin_homogenize} doesn't return anything. Instead, it has the
44 | side-effect of saving homogenized files (in .fcs format) to the directory
45 | specified with \code{output_data_path}. Each of the saved files will contain
46 | homogenized, user-defined channels according to details specified in the
47 | file at \code{panel_path}.
48 | }
49 | \description{
50 | This function homogenizes CyTOF data (.fcs files) from heterogeneous sources
51 | according to the standard panel in a .csv file located at \code{panel_path}.
52 | }
53 |
--------------------------------------------------------------------------------
/man/cytofin_prep_anchors.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cytofin_prep_anchors.R
3 | \name{cytofin_prep_anchors}
4 | \alias{cytofin_prep_anchors}
5 | \title{Prepare CyTOF controls for batch normalization across plates}
6 | \usage{
7 | cytofin_prep_anchors(
8 | metadata_path,
9 | panel_path,
10 | input_data_path,
11 | input_prefix = "homogenized_",
12 | output_path = "none",
13 | shift_factor = 0,
14 | scale_factor = 0.2
15 | )
16 | }
17 | \arguments{
18 | \item{metadata_path}{A file path leading to an .xlsx or .csv file
19 | containing a table of CyTOF file (.fcs file) names. Columns should include
20 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
21 | and \code{validation}.
22 |
23 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
24 |
25 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
26 | a table of standardized antigen panel information. Columns should include
27 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern}, \code{lineage}, \code{functional},
28 | and \code{general}.
29 |
30 | See the vignette for details: \code{vignette("cytofin", package = "cytofin")}}
31 |
32 | \item{input_data_path}{A folder directory containing the input CyTOF files
33 | to be prepped for normalization. These files should already be homogenized,
34 | and in most cases this will be the directory to which the output
35 | .fcs files from \code{cytofin_homogenize} were written.}
36 |
37 | \item{input_prefix}{The string that was prepended to the name of the input files
38 | of \code{cytofin_homogenize} to create their corresponding output file names.
39 | Defaults to "homogenized_".}
40 |
41 | \item{output_path}{A file path specifying where to save the output .rds
42 | file containing the statistics calculated from this step and the concatenated
43 | .FCS files containing all cells from the generalized anchor samples. Defaults
44 | to "none", in which case no files are saved.}
45 |
46 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
47 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
48 |
49 | \code{new_x <- asinh(a + b*x)}.
50 |
51 | Defaults to 0.}
52 |
53 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
54 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
55 |
56 | \code{new_x <- asinh(a + b*x)}.
57 |
58 | Defaults to 0.2.}
59 | }
60 | \value{
61 | a \code{list()} of summary statistics with the following elements:
62 | \itemize{
63 | \item \strong{universal_var}: a named numeric vector in which each entry corresponds to the
64 | universal variance of an antigen channel in the homogenized dataset
65 | \item \strong{universal_mean}: a named numeric vector in which each entry corresponds to the
66 | universal mean of an antigen channel in the homogenized dataset
67 | \item \strong{bulk_var}: The mean of all the channel-specific universal variances
68 | in \code{universal_var} (a scalar value)
69 | \item \strong{bulk_mean}: The mean of all the channel-specific universal means
70 | in \code{universal_mean} (a scalar value)
71 | }
72 | }
73 | \description{
74 | This function calculates reference statistics needed for CytofIn batch normalization.
75 | Specifically, it calculates the universal mean and universal variance vectors
76 | of the generalized anchors identified in the metadata file at \code{metadata_path};
77 | in addition, it calculates the non-channel-specific bulk mean and bulk variance
78 | of the generalized anchors.
79 | }
80 |
--------------------------------------------------------------------------------
/R/file_templates.R:
--------------------------------------------------------------------------------
1 | #' Generate a template for a cytofin metadata file
2 | #'
3 | #' `cytofin_generate_metadata_template` creates a template metadata .csv file
4 | #' (with the correct columns and dummy example data) in a specified location.
5 | #'
6 | #' @param file_name A string representing the name of the .csv file to be
7 | #' saved in the directory specified by `template_path`. Defaults to
8 | #' "template_metadata.csv"
9 | #'
10 | #' @param template_path A path to the directory in which the template file should be
11 | #' written; the directory is created if it does not exist. Defaults to the current working directory
12 | #'
13 | #' @return The value of the final `readr::write_csv()` call; called for its side effect of writing the file.
14 | #' @export
15 | #'
16 | #' @examples
17 | #' # specify the path where you'd like to store the template file
18 | #' my_name <- "metadata_template.csv"
19 | #' my_path <- file.path("~", "Desktop", "template_folder")
20 | #'
21 | #'
22 | #' # generate the template file, which then can be edited manually
23 | #' cytofin_generate_metadata_template(
24 | #' file_name = my_name,
25 | #' template_path = my_path
26 | #' )
27 | #'
28 | cytofin_generate_metadata_template <-
29 | function(
30 | file_name = "template_metadata.csv",
31 | template_path = getwd()
32 | ) {
33 |
34 | # create the template_path directory (and any missing parent directories) if needed
35 | if(!dir.exists(template_path)) {
36 | dir.create(template_path, showWarnings = FALSE, recursive = TRUE)
37 | }
38 |
39 | # build the template data.frame: four dummy rows, one per example .fcs file
40 | output_frame <-
41 | data.frame(
42 | filename = c("file_1.fcs", "file_2.fcs", "file_3.fcs", "file_4.fcs"),
43 | cohort = c("cohort_1", "cohort_1", "cohort_2", "cohort_2"),
44 | plate_number = c("plate_1", "plate_1", "plate_2", "plate_2"),
45 | patient_id = c("patient_1", "patient_2", "patient_a", "patient_b"),
46 | condition = c("basal", "basal", "stimulation_1", "stimulation_2"),
47 | is_anchor = c(0, 1, 0, 1),
48 | validation =
49 | paste0(
50 | "validation_",
51 | c("file_1.fcs", "file_2.fcs", "file_3.fcs", "file_4.fcs")
52 | )
53 | )
54 | # write the template to file.path(template_path, file_name)
55 | readr::write_csv(
56 | x = output_frame,
57 | file = file.path(template_path, file_name)
58 | )
59 |
60 | }
61 |
62 |
63 | #' Write a template cytofin reference panel file to disk
64 | #'
65 | #' `cytofin_generate_panel_template` saves an example reference panel .csv file
66 | #' (holding the expected columns filled with dummy values) in a chosen folder.
67 | #'
68 | #' @param file_name A string giving the name of the .csv file that will be
69 | #' created inside the directory given by `template_path`. Defaults to
70 | #' "template_panel_info.csv"
71 | #'
72 | #' @param template_path The directory in which the template file should be
73 | #' written; it is created if missing. Defaults to the current working directory
74 | #'
75 | #'
76 | #' @export
77 | #'
78 | #' @examples
79 | #'
80 | #' # specify the path where you'd like to store the template file
81 | #' my_name <- "panel_template.csv"
82 | #' my_path <- file.path("~", "Desktop", "template_folder")
83 | #'
84 | #' # generate the template file, which then can be edited manually
85 | #' cytofin_generate_panel_template(
86 | #'   file_name = my_name,
87 | #'   template_path = my_path
88 | #' )
89 | #'
90 | cytofin_generate_panel_template <-
91 |   function(
92 |     file_name = "template_panel_info.csv",
93 |     template_path = getwd()
94 |   ) {
95 |
96 |     # make sure the destination folder exists before writing into it
97 |     destination_missing <- !dir.exists(template_path)
98 |     if (destination_missing) {
99 |       dir.create(template_path, showWarnings = FALSE, recursive = TRUE)
100 |     }
101 |
102 |     # assemble the example panel, one row per dummy channel
103 |     template_panel <-
104 |       data.frame(
105 |         metal_name = c("Time", "Event_length", "(Pd102)Di", "(Pd104)Di"),
106 |         antigen_name = c("Time", "Event_length", "marker_name_1", "marker_name_2"),
107 |         antigen_pattern = rep("", times = 4),
108 |         lineage = c(0, 0, 1, 1),
109 |         functional = c(0, 0, 0, 1),
110 |         general = c(0, 1, 1, 1)
111 |       )
112 |
113 |     # save the panel as a .csv file named `file_name` inside `template_path`
114 |     template_file <- file.path(template_path, file_name)
115 |     readr::write_csv(x = template_panel, file = template_file)
116 |
117 |   }
118 |
--------------------------------------------------------------------------------
/man/cytofin_make_plots.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cytofin_make_plots.R
3 | \name{cytofin_make_plots}
4 | \alias{cytofin_make_plots}
5 | \title{Make diagnostic plots to evaluate CytofIn batch normalization}
6 | \usage{
7 | cytofin_make_plots(
8 | normalization_result,
9 | which_rows = 1:nrow(normalization_result),
10 | val_path = "none"
11 | )
12 | }
13 | \arguments{
14 | \item{normalization_result}{An output data.frame produced by the \code{cytofin_normalize} or
15 | \code{cytofin_normalize_nrs} function.
16 |
17 | The following columns should be present: \code{filename},
18 | \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor}, \code{validation},
19 | \code{universal_var}, \code{anchor_mean}, \code{anchor_var}, \code{mean_b4norm}, \code{var_b4norm},
20 | \code{mean_norm}, \code{var_norm}, \code{mean_ctr_norm}, \code{var_ctr_norm}.}
21 |
22 | \item{which_rows}{A numeric vector indicating which rows of \code{normalization_result}
23 | (i.e. which .fcs files in the combined dataset) should be used for plotting. Defaults
24 | to 1:nrow(normalization_result), which will make all possible plots.}
25 |
26 | \item{val_path}{The folder directory containing validation (i.e. bead-normalized)
27 | .fcs files corresponding to the input .fcs files in the metadata table. (Optional).}
28 | }
29 | \value{
30 | 8 diagnostic plots are made for each input .fcs file that was batch
31 | normalized (i.e. each .fcs file represented as a row in \code{normalization_result}).
32 | From left-to-right (and top-to-bottom), these plots represent the following:
33 | \enumerate{
34 | \item The entry in the universal mean vector corresponding to each antigen in the
35 | consensus antigen panel. X-axis: antigen index in the universal mean vector.
36 | Y-axis: Arcsinh-transformed entry in the universal mean vector corresponding
37 | to each antigen.
38 | \item The mean (across all cells) antigen expression vector for the anchor
39 | associated with each input .fcs file both before and after normalization.
40 | X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
41 | anchor .fcs file.
42 | \item The mean (across all cells) antigen expression vector for each input
43 | .fcs file both before and after normalization.
44 | X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
45 | input .fcs file.
46 | \item The mean (across all cells) antigen expression vector for each "validation"
47 | (i.e. bead-normalized) .fcs file both before and after bead-normalization.
48 | This plot can be used to compare CytofIn batch normalization with gold-
49 | standard approaches. If \code{val_path} is "none", this plot will be identical to
50 | plot 3 (see above).
51 | X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
52 | validation .fcs file.
53 | \item The entry in the universal standard deviation vector corresponding to each antigen in the
54 | consensus antigen panel. X-axis: antigen index in the universal standard deviation vector.
55 | Y-axis: Arcsinh-transformed entry in the universal standard deviation vector corresponding
56 | to each antigen.
57 | \item The standard deviation (across all cells) antigen expression vector for the anchor
58 | associated with each input .fcs file both before and after normalization.
59 | X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all
60 | antigen expression values in the anchor .fcs file.
61 | \item The standard deviation (across all cells) antigen expression vector for each input
62 | .fcs file both before and after normalization.
63 | X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all
64 | antigen expression values in the input .fcs file.
65 | \item The standard deviation (across all cells) antigen expression vector for each "validation"
66 | (i.e. bead-normalized) .fcs file both before and after bead-normalization.
67 | This plot can be used to compare CytofIn batch normalization with gold-
68 | standard approaches. If \code{val_path} is "none", this plot will be identical to
69 | plot 7 (see above).
70 | X-axis: antigen index (as in plot 1). Y-axis: Standard deviation of all
71 | antigen expression values in the validation .fcs file.
72 | }
73 | }
74 | \description{
75 | When given the output data structure from \code{cytofin_normalize} or \code{cytofin_normalize_nrs},
76 | this function plots mean and variance plots for all normalized .fcs files and their
77 | associated anchors.
78 | }
79 |
--------------------------------------------------------------------------------
/man/cytofin_normalize.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cytofin_normalize.R
3 | \name{cytofin_normalize}
4 | \alias{cytofin_normalize}
5 | \title{Batch normalize CyTOF plates from heterogeneous sources using external anchors}
6 | \usage{
7 | cytofin_normalize(
8 | metadata_path,
9 | panel_path,
10 | anchor_statistics,
11 | input_data_path,
12 | output_data_path,
13 | mode = c("meanshift", "meanshift_bulk", "variance", "z_score", "beadlike"),
14 | input_prefix = "homogenized_",
15 | output_prefix = "normalized_",
16 | shift_factor = 0,
17 | scale_factor = 0.2
18 | )
19 | }
20 | \arguments{
21 | \item{metadata_path}{A filepath leading to an .xlsx or .csv file
22 | containing a table of CyTOF file (.fcs file) names. Columns should include
23 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
24 | and \code{validation}.
25 |
26 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
27 |
28 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
29 | a table of standardized antigen panel information. Columns should include
30 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern},
31 | \code{lineage}, \code{functional}, and \code{general}.
32 |
33 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
34 |
35 | \item{anchor_statistics}{a list produced by the \code{cytofin_prep_anchors}
36 | function or the file path to an .rds object containing anchor reference statistics.}
37 |
38 | \item{input_data_path}{A folder directory containing the input CyTOF files
39 | to be normalized. In most cases, this will be the directory to which the output
40 | .fcs files from \code{cytofin_homogenize} were written.}
41 |
42 | \item{output_data_path}{A folder directory to which the output (i.e.
43 | batch normalized/batch corrected) .fcs files should be written.}
44 |
45 | \item{mode}{A string indicating which transformation function should be used
46 | for batch normalization ("meanshift", "meanshift_bulk", "variance", "z_score",
47 | or "beadlike").}
48 |
49 | \item{input_prefix}{The string that was prepended to the name of the input files
50 | of \code{cytofin_homogenize} to create their corresponding output file names.
51 | Defaults to "homogenized_".}
52 |
53 | \item{output_prefix}{A string to be prepended to the name of each input file
54 | to create the name of the corresponding output file (post-normalization).
55 | Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to
56 | the output file "normalized_file1.fcs" saved in \code{output_data_path}).}
57 |
58 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
59 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
60 |
61 | \code{new_x <- asinh(a + b * x)}.
62 |
63 | Defaults to 0.}
64 |
65 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
66 | transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
67 |
68 | \code{new_x <- asinh(a + b * x)}.
69 |
70 | Defaults to 0.2.}
71 | }
72 | \value{
73 | Batch-normalized .fcs files are saved in the directory specified by
74 | \code{output_data_path}.
75 |
76 | In addition, a data.frame containing information about
77 | each input .fcs file (that can be used for plotting with \code{cytofin_make_plots})
78 | is returned with the following columns:
79 | \itemize{
80 | \item All of the columns in the input metadata table (located at \code{metadata_path})
81 | \item \strong{universal_mean}: the universal mean vector to which all files are adjusted
82 | (will be identical for all input .fcs files)
83 | \item \strong{universal_var}: the universal variance vector to which all files are adjusted
84 | (will be identical for all input .fcs files)
85 | \item \strong{anchor_mean}: the mean (across all cells) vector for the anchor file associated
86 | with each input .fcs file (i.e. the anchor located on the same plate as the
87 | input .fcs file) before batch normalization.
88 | \item \strong{anchor_var}: the variance (across all cells) vector for the anchor file associated
89 | with each input .fcs file (i.e. the anchor located on the same plate as the
90 | input .fcs file)
91 | \item \strong{mean_b4norm}: the mean (across all cells) vector of the input .fcs file
92 | before batch normalization.
93 | \item \strong{var_b4norm}: the variance (across all cells) vector of the input .fcs file
94 | before batch normalization.
95 | \item \strong{mean_norm}: the mean (across all cells) vector of the input .fcs file
96 | after batch normalization.
97 | \item \strong{var_norm}: the variance (across all cells) vector of the input .fcs file
98 | after batch normalization.
99 | \item \strong{anchor_mean_norm}: the mean (across all cells) vector for the anchor file associated
100 | with each input .fcs file (i.e. the anchor located on the same plate as the
101 | input .fcs file) after batch normalization.
102 | \item \strong{anchor_var_norm}: the variance (across all cells) vector for the anchor file associated
103 | with each input .fcs file (i.e. the anchor located on the same plate as the
104 | input .fcs file) after batch normalization.
105 | }
106 | }
107 | \description{
108 | This function batch normalizes CyTOF data from multiple plates (from one or more
109 | experimental cohorts) using external (i.e. "generalized") anchors.
110 | }
111 |
--------------------------------------------------------------------------------
/man/cytofin_normalize_nrs.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/cytofin_normalize_nrs.R
3 | \name{cytofin_normalize_nrs}
4 | \alias{cytofin_normalize_nrs}
5 | \title{Batch normalize CyTOF plates from heterogeneous sources using stable channels}
6 | \usage{
7 | cytofin_normalize_nrs(
8 | metadata_path,
9 | panel_path,
10 | input_data_path,
11 | output_data_path,
12 | input_prefix = "homogenized_",
13 | output_prefix = "normalized_",
14 | shift_factor = 0,
15 | scale_factor = 0.2,
16 | nchannels = 3,
17 | make_plot = FALSE
18 | )
19 | }
20 | \arguments{
21 | \item{metadata_path}{A filepath leading to an .xlsx or .csv file
22 | containing a table of CyTOF file (.fcs file) names. Columns should include
23 | \code{filename}, \code{cohort}, \code{plate_number}, \code{patient_id}, \code{condition}, \code{is_anchor},
24 | and \code{validation}.
25 |
26 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
27 |
28 | \item{panel_path}{A file path leading to an .xlsx or .csv file containing
29 | a table of standardized antigen panel information. Columns should include
30 | \code{metal_name}, \code{antigen_name}, \code{antigen_pattern},
31 | \code{lineage}, \code{functional}, and \code{general}.
32 |
33 | See the vignette for details: \code{vignette("help", package = "cytofin")}}
34 |
35 | \item{input_data_path}{A folder directory containing the input CyTOF files
36 | to be normalized. In most cases, this will be the directory to which the output
37 | .fcs files from \code{cytofin_homogenize} were written.}
38 |
39 | \item{output_data_path}{A folder directory to which the output (i.e.
40 | batch normalized/batch corrected) .fcs files should be written.}
41 |
42 | \item{input_prefix}{The string that was prepended to the name of the input files
43 | of \code{cytofin_homogenize} to create their corresponding output file names.
44 | Defaults to "homogenized_".}
45 |
46 | \item{output_prefix}{A string to be prepended to the name of each input file
47 | to create the name of the corresponding output file (post-normalization).
48 | Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to
49 | the output file "normalized_file1.fcs" saved in \code{output_data_path}).}
50 |
51 | \item{shift_factor}{The scalar value \code{a} in the following equation used to
52 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
53 |
54 | \code{new_x <- asinh(a + b * x)}.
55 |
56 | Defaults to 0.}
57 |
58 | \item{scale_factor}{The scalar value \code{b} in the following equation used to
59 | transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
60 |
61 | \code{new_x <- asinh(a + b * x)}.
62 |
63 | Defaults to 0.2.}
64 |
65 | \item{nchannels}{An integer representing the number of most stable channels to
66 | use during batch normalization. Defaults to 3.}
67 |
68 | \item{make_plot}{A boolean value indicating if a plot depicting the non-
69 | redundancy scores of each marker in each .fcs file being batch normalized
70 | should be plotted as a side-effect of the function call. Defaults to FALSE.}
71 | }
72 | \value{
73 | Batch-normalized .fcs files are saved in the directory specified by
74 | \code{output_data_path}.
75 |
76 | In addition, a data.frame containing information about
77 | each input .fcs file (that can be used for plotting with \code{cytofin_make_plots})
78 | is returned with the following columns:
79 | \itemize{
80 | \item All of the columns in the input metadata table (located at \code{metadata_path})
81 | \item \strong{universal_mean}: the universal mean vector to which all files are adjusted
82 | (will be identical for all input .fcs files)
83 | \item \strong{universal_var}: the universal variance vector to which all files are adjusted
84 | (will be identical for all input .fcs files)
85 | \item \strong{anchor_mean}: the mean (across all cells) vector for the anchor file associated
86 | with each input .fcs file (i.e. the anchor located on the same plate as the
87 | input .fcs file)
88 | \item \strong{anchor_var}: the variance (across all cells) vector for the anchor file associated
89 | with each input .fcs file (i.e. the anchor located on the same plate as the
90 | input .fcs file)
91 | \item \strong{mean_b4norm}: the mean (across all cells) vector of the input .fcs file
92 | before batch normalization.
93 | \item \strong{var_b4norm}: the variance (across all cells) vector of the input .fcs file
94 | before batch normalization.
95 | \item \strong{mean_norm}: the mean (across all cells) vector of the input .fcs file
96 | after batch normalization.
97 | \item \strong{var_norm}: the variance (across all cells) vector of the input .fcs file
98 | after batch normalization.
99 | \item \strong{anchor_mean_norm}: the mean (across all cells) vector for the anchor file associated
100 | with each input .fcs file (i.e. the anchor located on the same plate as the
101 | input .fcs file) after batch normalization.
102 | \item \strong{anchor_var_norm}: the variance (across all cells) vector for the anchor file associated
103 | with each input .fcs file (i.e. the anchor located on the same plate as the
104 | input .fcs file) after batch normalization.
105 | }
106 | }
107 | \description{
108 | This function batch normalizes CyTOF data from multiple plates (from one or more
109 | experimental cohorts) by computing the non-redundancy score (NRS) for each
110 | channel in the dataset, then using the most redundant (i.e. the "most stable")
111 | channels as a reference for batch normalization.
112 | }
113 |
--------------------------------------------------------------------------------
/R/cytofin_homogenize.R:
--------------------------------------------------------------------------------
1 | #' Homogenize CyTOF channel names using a consensus antigen panel
2 | #'
3 | #' This function homogenizes CyTOF data (.fcs files) from heterogeneous sources
4 | #' according to the standard panel in a .csv file located at `panel_path`.
5 | #'
6 | #' @param metadata_path A file path leading to an .xlsx or .csv file
7 | #' containing a table of CyTOF file (.fcs file) names in the first column (`filename`)
8 | #' and additional information about each .fcs file in subsequent columns.
9 | #' Columns should include `filename`, `cohort`, `plate_number`, `patient_id`,
10 | #' `condition`, `is_anchor`, and `validation`.
11 | #'
12 | #' @param panel_path A file path leading to an .xlsx or .csv file containing
13 | #' a table of standardized antigen panel information. Columns should include
14 | #' `metal_name`, `antigen_name`, `antigen_pattern`,
15 | #' `lineage`, `functional`, and `general`.
16 | #'
17 | #' @param input_data_path A folder directory containing the input .fcs files
18 | #' to be homogenized.
19 | #'
20 | #' @param output_data_path A folder directory to which the output (i.e.
21 | #' homogenized) .fcs files should be written.
22 | #'
23 | #' @param prefix A string prepended to the name of each input file to create the
24 | #' name of the corresponding output file (post-homogenization). Defaults to
25 | #' "homogenized_" (e.g. an input file named "file1.fcs" will correspond to
26 | #' the output file "homogenized_file1.fcs" saved in `output_data_path`).
27 | #'
28 | #' @param verbose A boolean value indicating whether progress messages should be
29 | #' printed to the console during homogenization. Defaults to FALSE.
30 | #'
31 | #' @return `cytofin_homogenize` doesn't return anything. Instead, it has the
32 | #' side-effect of saving homogenized files (in .fcs format) to the directory
33 | #' specified with `output_data_path`. Each of the saved files will contain
34 | #' homogenized, user-defined channels according to details specified in the
35 | #' file at `panel_path`.
36 | #'
37 | #' @export
38 | #'
39 | cytofin_homogenize <-
40 |   function(
41 |     metadata_path,
42 |     panel_path,
43 |     input_data_path,
44 |     output_data_path,
45 |     prefix = "homogenized_",
46 |     verbose = FALSE
47 |   ) {
48 |
49 |     # create output directory for homogenized .fcs files
50 |     dir.create(output_data_path, showWarnings = FALSE, recursive = TRUE)
51 |
52 |     # read metadata table
53 |     md <- cytofin_read_metadata(metadata_path)
54 |
55 |     # read reference panel information
56 |     ref_panel <- cytofin_read_panel_info(panel_path)
57 |
58 |     # for every file listed in the metadata table
59 |     for (file in md$filename) {
60 |       # read in FCS file while discarding console output; nullfile() is
61 |       # portable and `finally` removes the sink even if reading fails
62 |       sink(file = nullfile())
63 |       fcs_raw <-
64 |         tryCatch(
65 |           flowCore::read.FCS(
66 |             filename = file.path(input_data_path, file),
67 |             transformation = FALSE,
68 |             truncate_max_range = FALSE
69 |           ),
70 |           finally = sink()
71 |         )
72 |       if (verbose) {
73 |         cat("filename:", file, "\n")
74 |       }
75 |
76 |       # parse panel in FCS files
77 |       data_panel_antigens <-
78 |         flowCore::pData(flowCore::parameters(fcs_raw))$desc
79 |
80 |       # for each channel in the reference panel
81 |       for (i in seq_along(ref_panel$antigen_name)) {
82 |         # extract the reference antigen name and its regex before tryCatch()
83 |         # (and reset data_antigen) so the error handler can always report them
84 |         ref_antigen <- ref_panel$antigen_name[[i]]
85 |         ref_antigen_regex <- ref_panel$antigen_pattern[[i]]
86 |         data_antigen <- character(0)
87 |
88 |         tryCatch(
89 |           {
90 |             # find which data antigens correspond to the reference antigen
91 |             data_antigen_index <-
92 |               stringr::str_detect(
93 |                 string = tidyr::replace_na(data_panel_antigens, ''),
94 |                 pattern = ref_antigen_regex
95 |               )
96 |             # store the name of the data antigen for reporting
97 |             data_antigen <- data_panel_antigens[data_antigen_index]
98 |
99 |             # if there was a match with the reference antigen's regex
100 |             if (any(data_antigen_index)) {
101 |               # rename the data antigen in the flowFrame using the reference antigen name
102 |               flowCore::pData(flowCore::parameters(fcs_raw))$desc[data_antigen_index] <-
103 |                 ref_antigen
104 |
105 |               # report what was matched if verbose (only when a match exists)
106 |               if (verbose) {
107 |                 cat(
108 |                   "matched data antigen: ",
109 |                   data_antigen,
110 |                   "\nwith the reference antigen: ",
111 |                   ref_antigen,
112 |                   "\nusing the regex: ",
113 |                   ref_antigen_regex,
114 |                   "\n"
115 |                 )
116 |               }
117 |             }
118 |           },
119 |           # if an error is encountered, print some information about the
120 |           # current file/channel and carry on with the next channel
121 |           error =
122 |             function(e) {
123 |               txt <-
124 |                 paste(
125 |                   file, "item", i,
126 |                   "data_antigen", paste(data_antigen, collapse = ", "),
127 |                   "ref_antigen", ref_antigen,
128 |                   "ref_antigen_pattern", ref_antigen_regex,
129 |                   as.character(e)
130 |                 )
131 |               cat(txt, "\n")
132 |             }
133 |         )
134 |       }
135 |
136 |       # finalize the fcs file to write as output
137 |       fcs <- homogenize_flowFrame(fcs_raw, ref_panel)
138 |
139 |       # write output fcs file to the specified directory
140 |       flowCore::write.FCS(fcs, file.path(output_data_path, paste0(prefix, file)))
141 |     }
142 |   }
143 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
1 | #' Find the extension for a file
2 | #'
3 | #' @param filename A string representing the name of a file in its local directory
4 | #'
5 | #' @return The file extension of `filename` (the text after its last "."),
6 | #' or "" if the file name contains no extension
7 | #'
8 | #' @examples
9 | #' \dontrun{
10 | #' # find and print the extension of an example file name
11 | #' my_extension <- get_extension("my_file.txt")
12 | #' print(my_extension)
13 | #' }
14 | get_extension <- function(filename) {
15 |   # split the base name on literal dots; the last piece is the extension
16 |   ex <- strsplit(basename(filename), split = "\\.")[[1]]
17 |   if (length(ex) < 2) return("")
18 |   return(ex[[length(ex)]])
19 | }
20 |
21 |
22 | #' Alter a flowFrame to only include data from channels in a reference panel
23 | #'
24 | #' @param fcs_raw A flowFrame containing unprocessed CyTOF data
25 | #' @param ref_panel A data.frame representing the reference panel data for a
26 | #' cytofin analysis (its `antigen_name` and `metal_name` columns are used).
27 | #'
28 | #' @return a homogenized flowFrame
29 | #'
30 | homogenize_flowFrame <- function(fcs_raw, ref_panel) {
31 |
32 |   # extract some needed values from the raw fcs data and the reference panel
33 |   ref_markers <- ref_panel$antigen_name
34 |   ref_metals <- ref_panel$metal_name
35 |
36 |   panel_fcs <- flowCore::pData(flowCore::parameters(fcs_raw))
37 |   panel_markers <- panel_fcs$desc
38 |   panel_metals <- as.character(panel_fcs$name)
39 |
40 |   # create new flowFrame to be modified
41 |   fcs <- fcs_raw
42 |
43 |   # only keep the markers/metals that are present in the reference marker list
44 |   panel_markers_to_keep <- intersect(panel_markers, ref_markers)
45 |   panel_metals_to_keep <- panel_metals[panel_markers %in% panel_markers_to_keep]
46 |
47 |   # create dictionary to look up which metals from the reference panel
48 |   # correspond to shared antigens with the FCS file's panel (which may be on
49 |   # different metals)
50 |   names(ref_metals) <- ref_markers
51 |
52 |   # perform lookup to "rename" metals in the FCS file's panel to the standard
53 |   # metal name in the reference
54 |   new_panel_metals <- as.character(ref_metals[panel_markers_to_keep])
55 |
56 |   # remove columns not present in ref_panel from final fcs file; drop = FALSE
57 |   # keeps the result a matrix even when only a single channel is retained
58 |   expr <- flowCore::exprs(fcs)
59 |   new_expr <- expr[, panel_metals_to_keep, drop = FALSE]
60 |
61 |   # rename metals using the looked-up values
62 |   colnames(new_expr) <- new_panel_metals
63 |
64 |   # sort columns into the order in the reference panel
65 |   final_expr <- new_expr[, ref_metals, drop = FALSE]
66 |   flowCore::exprs(fcs) <- final_expr
67 |
68 |   # return result
69 |   return(fcs)
70 |
71 | }
72 |
73 |
74 | #' Read in a cytofin metadata file
75 | #'
76 | #' This function reads a cytofin metadata table from a path
77 | #' that points to a .csv or a .xlsx file
78 | #'
79 | #' @param metadata_path A file path leading to an .xlsx or .csv file
80 | #' containing a table of CyTOF file (.fcs file) names. Columns should include
81 | #' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `is_anchor`,
82 | #' and `validation`.
83 | #'
84 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
85 | #'
86 | #' @return A data.frame containing the metadata information in the file stored
87 | #' at `metadata_path`; every column is returned as whitespace-trimmed text.
88 | #'
89 | #' @examples
90 | #' \dontrun{
91 | #' my_path <- file.path("~", "foo", "bar", "metadata.csv")
92 | #' my_metadata <- cytofin:::cytofin_read_metadata(my_path)
93 | #' }
94 | cytofin_read_metadata <- function(metadata_path) {
95 |
96 |   # pick the reader that matches the file extension; anything other than
97 |   # .xlsx or .csv is rejected with an error
98 |   md <-
99 |     switch(
100 |       get_extension(metadata_path),
101 |       xlsx = readxl::read_excel(metadata_path),
102 |       csv = read.csv(metadata_path),
103 |       stop("metadata_path must point to an .xlsx or .csv file")
104 |     )
105 |
106 |   # trim whitespace from all values in metadata (columns become character)
107 |   trimmed_columns <- lapply(md, trimws)
108 |   data.frame(trimmed_columns, stringsAsFactors = FALSE)
109 | }
110 |
111 |
112 | #' Read in cytofin reference panel information
113 | #'
114 | #' This function reads cytofin reference panel information from a path
115 | #' that points to a .csv or a .xlsx file
116 | #'
117 | #' @param panel_path A file path leading to an .xlsx or .csv file containing
118 | #' a table of standardized antigen panel information. Columns should include
119 | #' `metal_name`, `antigen_name`, `antigen_pattern`, `lineage`, `functional`,
120 | #' and `general`.
121 | #'
122 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
123 | #'
124 | #' @return A data.frame containing the reference panel information in the file
125 | #' stored at `panel_path`; every column is returned as whitespace-trimmed text.
126 | #'
127 | #' @examples
128 | #' \dontrun{
129 | #' my_path <- file.path("~", "foo", "bar", "panel.csv")
130 | #' my_panel <- cytofin:::cytofin_read_panel_info(my_path)
131 | #' }
132 | cytofin_read_panel_info <- function(panel_path) {
133 |
134 |   # pick the reader that matches the file extension; anything other than
135 |   # .xlsx or .csv is rejected with an error
136 |   ref_panel <-
137 |     switch(
138 |       get_extension(panel_path),
139 |       xlsx = readxl::read_excel(panel_path),
140 |       csv = read.csv(panel_path),
141 |       stop("panel_path must point to an .xlsx or .csv file")
142 |     )
143 |
144 |   # trim whitespace from all values in the panel (columns become character)
145 |   trimmed_columns <- lapply(ref_panel, trimws)
146 |   data.frame(trimmed_columns, stringsAsFactors = FALSE)
147 | }
148 |
149 |
150 | #' Reverse an arcsinh transformation with cofactor `scale_factor` and shift `shift_factor`.
151 | #'
152 | #' @param x A numeric vector of arcsinh-transformed values.
153 | #'
154 | #' @param shift_factor The scalar value `a` in the following equation used to
155 | #' transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
156 | #' `new_x <- asinh(a + b * x)`.
157 | #'
158 | #' @param scale_factor The scalar value `b` in the following equation used to
159 | #' transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
160 | #' `new_x <- asinh(a + b * x)`.
161 | #'
162 | #' @return A numeric vector holding the values of `x` after the arcsinh
163 | #' transformation has been undone, i.e. `(sinh(x) - a) / b`
164 | #'
165 | #'
166 | rev_asinh <- function(x, shift_factor, scale_factor) {
167 |
168 |   # invert asinh() first, then undo the shift and finally the scaling
169 |   unshifted <- sinh(x) - shift_factor
170 |   unshifted / scale_factor
171 | }
172 |
--------------------------------------------------------------------------------
/R/cytofin_prep_anchors.R:
--------------------------------------------------------------------------------
1 | #' Prepare CyTOF controls for batch normalization across plates
2 | #'
3 | #' This function calculates reference statistics needed for CytofIn batch normalization.
4 | #' Specifically, it calculates the universal mean and universal variance vectors
5 | #' of the generalized anchors identified in the metadata file at `metadata_path`;
6 | #' in addition, it calculates the non-channel-specific bulk mean and bulk variance
7 | #' of the generalized anchors.
8 | #'
9 | #' @param metadata_path A file path leading to an .xlsx or .csv file
10 | #' containing a table of CyTOF file (.fcs file) names. Columns should include
11 | #' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `is_anchor`,
12 | #' and `validation`.
13 | #'
14 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
15 | #'
16 | #' @param panel_path A file path leading to an .xlsx or .csv file containing
17 | #' a table of standardized antigen panel information. Columns should include
18 | #' `metal_name`, `antigen_name`, `antigen_pattern`, `lineage`, `functional`,
19 | #' and `general`.
20 | #'
21 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
22 | #'
23 | #' @param input_data_path A folder directory containing the input CyTOF files
24 | #' to be prepped for normalization. These files should already be homogenized,
25 | #' and in most cases this will be the directory to which the output
26 | #' .fcs files from `cytofin_homogenize` were written.
27 | #'
28 | #' @param input_prefix The string that was appended to the name of the input files
29 | #' of `cytofin_homogenize` to create their corresponding output file names.
30 | #' Defaults to "homogenized_".
31 | #'
32 | #' @param output_path A file path specifying where to save the output .rds
33 | #' file containing the statistics calculated from this step and the concatenated
34 | #' .FCS files containing all cells from the generalized anchor samples. Defaults
35 | #' to "none", in which case no files are saved.
36 | #'
37 | #' @param shift_factor The scalar value `a` in the following equation used to
38 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
39 | #'
40 | #' `new_x <- asinh(a + b*x)`.
41 | #'
42 | #' Defaults to 0.
43 | #'
44 | #' @param scale_factor The scalar value `b` in the following equation used to
45 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
46 | #'
47 | #' `new_x <- asinh(a + b*x)`.
48 | #'
49 | #' Defaults to 0.2.
50 | #'
51 | #' @return a `list()` of summary statistics with the following elements:
52 | #' * __universal_var__: a named numeric vector in which each entry corresponds to the
53 | #' universal variance of an antigen channel in the homogenized dataset
54 | #' * __universal_mean__: a named numeric vector in which each entry corresponds to the
55 | #' universal mean of an antigen channel in the homogenized dataset
56 | #' * __bulk_var__: The mean of all the channel-specific universal variances
57 | #' in `universal_var` (a scalar value)
58 | #' * __bulk_mean__: The mean of all the channel-specific universal means
59 | #' in `universal_mean` (a scalar value)
60 | #'
61 | #'
62 | #' @export
63 | #'
64 |
65 | cytofin_prep_anchors <- function(
66 | metadata_path,
67 | panel_path,
68 | input_data_path,
69 | input_prefix = "homogenized_",
70 | output_path = "none",
71 | shift_factor = 0,
72 | scale_factor = 0.2
73 | ) {
74 |
75 | # create output directory if needed
76 | dir.create(output_path, showWarnings = FALSE, recursive = TRUE)
77 |
78 | # read metadata table and select only the anchor samples
79 | md_control <-
80 | dplyr::filter(cytofin_read_metadata(metadata_path), is_anchor == 1)
81 |
82 | # read reference panel information
83 | ref_panel <- cytofin_read_panel_info(panel_path)
84 |
85 | # extract character vectors of the lineage markers' metals and
86 | # the functional markers' metals
87 | lineage_markers <- ref_panel$metal_name[ref_panel$lineage == 1]
88 | functional_markers <- ref_panel$metal_name[ref_panel$functional == 1]
89 | all_markers <- c(lineage_markers, functional_markers)
90 |
91 | # read in the input data as a flowSet
92 | fcs_control <-
93 | flowCore::read.flowSet(
94 | file.path(input_data_path, paste0(input_prefix, md_control$filename)),
95 | transformation = FALSE,
96 | truncate_max_range = FALSE
97 | )
98 |
99 | # arcsinh-transform all data
100 | asinh_transform <- flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
101 | col_names <- flowCore::colnames(fcs_control)
102 | expr_untransformed <- flowCore::fsApply(fcs_control, flowCore::exprs)
103 | transform_list <- flowCore::transformList(from = col_names, tfun = asinh_transform)
104 | fcs_asinh <- flowCore::transform(fcs_control, transform_list)
105 | expr <- flowCore::fsApply(fcs_asinh, flowCore::exprs)
106 |
107 | # calculate universal mean and variance
108 | universal_mean <- apply(expr, 2, mean)
109 | universal_var <- apply(expr, 2, var)
110 |
111 | # calculate the mean and variance of all the channel-specific universal means
112 | # and variances, respectively
113 | bulk_var <- mean(universal_var[all_markers])
114 | bulk_mean <- mean(universal_mean[all_markers])
115 |
116 | # collate all reference statistics into a list
117 | result <-
118 | list(
119 | universal_var = universal_var,
120 | universal_mean = universal_mean,
121 | bulk_var = bulk_var,
122 | bulk_mean = bulk_mean
123 | )
124 |
125 | # if the user wants to store intermediate files
126 | if (output_path != "none") {
127 |
128 | # save universal mean and variance information
129 | readr::write_rds(
130 | x = result,
131 | file = file.path(output_path, "anchor_statistics.rds")
132 | )
133 |
134 | # write concatenated control file (asinh-transformed)
135 | gc()
136 | filename <- file.path(output_path, "concatenated_control.fcs")
137 | ff <- flowCore::flowFrame(expr)
138 | data_panel_name <- flowCore::pData(flowCore::parameters(fcs_control[[1]]))$desc
139 | flowCore::pData(flowCore::parameters(ff))$desc <- data_panel_name
140 | flowCore::write.FCS(ff, filename)
141 |
142 | # write concatenated control file (untransformed)
143 | gc()
144 | filename <- file.path(output_path, "concatenated_control_untransformed.fcs")
145 | ff <- flowCore::flowFrame(expr_untransformed)
146 | data_panel_name <- flowCore::pData(flowCore::parameters(fcs_control[[1]]))$desc
147 | flowCore::pData(flowCore::parameters(ff))$desc <- data_panel_name
148 | flowCore::write.FCS(ff, filename)
149 | }
150 |
151 | return(result)
152 | }
153 |
154 |
--------------------------------------------------------------------------------
/R/cytofin_normalize.R:
--------------------------------------------------------------------------------
1 | #' Batch normalize CyTOF plates from heterogeneous sources using external anchors
2 | #'
3 | #' This function batch normalizes CyTOF data from multiple plates (from one or more
4 | #' experimental cohorts) using external (i.e. "generalized") anchors.
5 | #'
6 | #' @param metadata_path A filepath leading to an .xlsx or .csv file
7 | #' containing a table of CyTOF file (.fcs file) names. Columns should include
8 | #' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `is_anchor`,
9 | #' and `validation`. Each combination of `cohort` and `plate_number` must
10 | #' contain exactly one anchor file (`is_anchor` equal to 1).
11 | #'
12 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
13 | #'
14 | #' @param panel_path A file path leading to an .xlsx or .csv file containing
15 | #' a table of standardized antigen panel information. Columns should include
16 | #' `metal_name`, `antigen_name`, `antigen_pattern`,
17 | #' `lineage`, `functional`, and `general`.
18 | #'
19 | #' See the vignette for details: \code{vignette("help", package = "cytofin")}
20 | #'
21 | #' @param anchor_statistics a list produced by the `cytofin_prep_anchors`
22 | #' function or the file path to an .rds object containing anchor reference statistics.
23 | #'
24 | #' @param input_data_path A folder directory containing the input CyTOF files
25 | #' to be normalized. In most cases, this will be the directory to which the output
26 | #' .fcs files from `cytofin_homogenize` were written.
27 | #'
28 | #' @param output_data_path A folder directory to which the output (i.e.
29 | #' batch normalized/batch corrected) .fcs files should be written.
30 | #'
31 | #' @param mode A string indicating which transformation function should be used
32 | #' for batch normalization ("meanshift", "meanshift_bulk", "variance", "z_score",
33 | #' or "beadlike"). Defaults to "meanshift".
34 | #'
35 | #' @param input_prefix The string that was prepended to the name of the input files
36 | #' of `cytofin_homogenize` to create their corresponding output file names.
37 | #' Defaults to "homogenized_".
38 | #'
39 | #' @param output_prefix A string to be prepended to the name of each input file
40 | #' to create the name of the corresponding output file (post-normalization).
41 | #' Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to
42 | #' the output file "normalized_file1.fcs" saved in `output_data_path`).
43 | #'
44 | #' @param shift_factor The scalar value `a` in the following equation used to
45 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
46 | #'
47 | #' `new_x <- asinh(a + b * x)`.
48 | #'
49 | #' Defaults to 0.
50 | #'
51 | #' @param scale_factor The scalar value `b` in the following equation used to
52 | #' transform CyTOF raw data ion counts using the hyperbolic arc-sine function:
53 | #'
54 | #' `new_x <- asinh(a + b * x)`.
55 | #'
56 | #' Defaults to 0.2.
57 | #'
58 | #' @return Batch-normalized .fcs files are saved in the directory specified by
59 | #' `output_data_path`.
60 | #'
61 | #' In addition, a data.frame containing information about
62 | #' each input .fcs file (that can be used for plotting with `cytofin_make_plots`)
63 | #' is returned with the following columns:
64 | #' * All of the columns in the input metadata table (located at `metadata_path`)
65 | #' * __universal_mean__: the universal mean vector to which all files are adjusted
66 | #' (will be identical for all input .fcs files)
67 | #' * __universal_var__: the universal variance vector to which all files are adjusted
68 | #' (will be identical for all input .fcs files)
69 | #' * __anchor_mean__: the mean (across all cells) vector for the anchor file associated
70 | #' with each input .fcs file (i.e. the anchor located on the same plate as the
71 | #' input .fcs file) before batch normalization.
72 | #' * __anchor_var__: the variance (across all cells) vector for the anchor file associated
73 | #' with each input .fcs file (i.e. the anchor located on the same plate as the
74 | #' input .fcs file) before batch normalization.
75 | #' * __mean_b4norm__: the mean (across all cells) vector of the input .fcs file
76 | #' before batch normalization.
77 | #' * __var_b4norm__: the variance (across all cells) vector of the input .fcs file
78 | #' before batch normalization.
79 | #' * __mean_norm__: the mean (across all cells) vector of the input .fcs file
80 | #' after batch normalization.
81 | #' * __var_norm__: the variance (across all cells) vector of the input .fcs file
82 | #' after batch normalization.
83 | #' * __anchor_mean_norm__: the mean (across all cells) vector for the anchor file associated
84 | #' with each input .fcs file (i.e. the anchor located on the same plate as the
85 | #' input .fcs file) after batch normalization.
86 | #' * __anchor_var_norm__: the variance (across all cells) vector for the anchor file associated
87 | #' with each input .fcs file (i.e. the anchor located on the same plate as the
88 | #' input .fcs file) after batch normalization.
89 | #'
90 | #' The arcsinh parameters and the marker list used for normalization are attached
91 | #' to the data.frame as the attributes `shift_factor`, `scale_factor`, and `all_markers`.
92 | #'
93 | #' @export
94 | #'
95 | cytofin_normalize <-
96 |   function(
97 |     metadata_path,
98 |     panel_path,
99 |     anchor_statistics,
100 |     input_data_path,
101 |     output_data_path,
102 |     mode = c("meanshift", "meanshift_bulk", "variance", "z_score", "beadlike"),
103 |     input_prefix = "homogenized_",
104 |     output_prefix = "normalized_",
105 |     shift_factor = 0,
106 |     scale_factor = 0.2
107 |   ) {
108 |
109 |     # resolve `mode` to exactly one valid option: the first choice ("meanshift")
110 |     # is used when `mode` is not supplied, and unknown values raise an error
111 |     # instead of leaving the normalization function undefined
112 |     mode <- match.arg(mode)
113 |
114 |     # create output directory
115 |     dir.create(output_data_path, showWarnings = FALSE, recursive = TRUE)
116 |
117 |     # read metadata table
118 |     md <- cytofin_read_metadata(metadata_path)
119 |
120 |     # separate metadata for anchor samples
121 |     md_control <- dplyr::filter(md, is_anchor == 1)
122 |
123 |     # anchor_statistics is either a file path to an .rds file or the list itself
124 |     if (is.character(anchor_statistics)) {
125 |       anchor_statistics_list <- readr::read_rds(anchor_statistics)
126 |     } else if (is.list(anchor_statistics)) {
127 |       anchor_statistics_list <- anchor_statistics
128 |     } else {
129 |       stop("anchor_statistics must be either a character vector (file path) or a list")
130 |     }
131 |
132 |     # extract needed values from anchor_statistics_list
133 |     universal_var <- anchor_statistics_list$universal_var
134 |     universal_mean <- anchor_statistics_list$universal_mean
135 |
136 |     # read in standardized panel
137 |     ref_panel <- cytofin_read_panel_info(panel_path = panel_path)
138 |
139 |     # compile list of all markers to adjust during normalization
140 |     lineage_markers <-
141 |       as.character(ref_panel$metal_name[ref_panel$lineage == 1])
142 |
143 |     functional_markers <-
144 |       as.character(ref_panel$metal_name[ref_panel$functional == 1])
145 |
146 |     all_markers <- c(lineage_markers, functional_markers)
147 |
148 |     # forward and reverse arcsinh transformations (identical for every file)
149 |     asinh_transform <-
150 |       flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
151 |
152 |     my_rev_asinh <-
153 |       function(x) {
154 |         rev_asinh(x, shift_factor = shift_factor, scale_factor = scale_factor)
155 |       }
156 |
157 |     # Per-cell normalization functions. Each takes one named row `x` of an
158 |     # expression matrix and adjusts only the lineage/functional channels.
159 |     # NOTE: `anchor_mean` and `anchor_var` are looked up in this function's
160 |     # environment when the closures are called; they are (re)assigned for each
161 |     # file inside the loop below before any closure is used.
162 |
163 |     # positions (within `x`) of the channels to be adjusted
164 |     marker_positions <- function(x) {
165 |       match(names(universal_mean[all_markers]), names(x))
166 |     }
167 |
168 |     norm_meanshift <- function(x) {
169 |       m <- marker_positions(x)
170 |       x[m] <- x[m] - anchor_mean[m] + universal_mean[m]
171 |       return(x)
172 |     }
173 |
174 |     norm_meanshift_bulk <- function(x) {
175 |       m <- marker_positions(x)
176 |       x[m] <- x[m] - mean(anchor_mean[m]) + mean(universal_mean[m])
177 |       return(x)
178 |     }
179 |
180 |     norm_variance <- function(x) {
181 |       m <- marker_positions(x)
182 |       x[m] <-
183 |         (x[m] - anchor_mean[m] + universal_mean[m]) * sqrt(universal_var[m]) / sqrt(anchor_var[m])
184 |       return(x)
185 |     }
186 |
187 |     norm_z_score <- function(x) {
188 |       m <- marker_positions(x)
189 |       x[m] <-
190 |         (x[m] - anchor_mean[m]) * sqrt(universal_var[m]) / sqrt(anchor_var[m]) + universal_mean[m]
191 |       return(x)
192 |     }
193 |
194 |     norm_beadlike <- function(x) {
195 |       m <- marker_positions(x)
196 |       # scale by the slope of the regression of the universal means on the anchor means
197 |       x[m] <- x[m] * lm(universal_mean[m] ~ anchor_mean[m])$coefficients[[2]]
198 |       return(x)
199 |     }
200 |
201 |     # find the user-specified normalization function
202 |     norm <-
203 |       switch(
204 |         mode,
205 |         meanshift = norm_meanshift,
206 |         meanshift_bulk = norm_meanshift_bulk,
207 |         variance = norm_variance,
208 |         z_score = norm_z_score,
209 |         beadlike = norm_beadlike
210 |       )
211 |
212 |     # create final data structure (list-columns are filled in file by file)
213 |     result <-
214 |       dplyr::mutate(
215 |         md,
216 |         universal_mean = list(0),
217 |         universal_var = list(0),
218 |         anchor_mean = list(0),
219 |         anchor_var = list(0),
220 |         mean_b4norm = list(0),
221 |         var_b4norm = list(0),
222 |         mean_norm = list(0),
223 |         var_norm = list(0),
224 |         anchor_mean_norm = list(0),
225 |         anchor_var_norm = list(0)
226 |       )
227 |
228 |     # for each file being batch-normalized...
229 |     for (i in seq_along(md$filename)) {
230 |
231 |       # find the anchor file corresponding to the same plate and cohort
232 |       # as the file being batch normalized
233 |       anchor_index <-
234 |         which(
235 |           (md_control$plate_number == md$plate_number[i]) &
236 |             (md_control$cohort == md$cohort[i])
237 |         )
238 |
239 |       # exactly one anchor is required; anything else cannot be read as one file
240 |       if (length(anchor_index) != 1) {
241 |         stop(
242 |           "Expected exactly 1 anchor file for plate ", md$plate_number[i],
243 |           " of cohort ", md$cohort[i], " (needed to normalize ", md$filename[i],
244 |           "), but found ", length(anchor_index), "."
245 |         )
246 |       }
247 |
248 |       filename_anchor <- md_control$filename[anchor_index]
249 |
250 |       # read in the anchor file
251 |       fcs_anchor <-
252 |         flowCore::read.FCS(
253 |           file.path(input_data_path, paste0(input_prefix, filename_anchor)),
254 |           transformation = FALSE,
255 |           truncate_max_range = FALSE
256 |         )
257 |
258 |       # arcsinh transform all columns of the anchor file
259 |       anchor_tlist <-
260 |         flowCore::transformList(
261 |           from = flowCore::colnames(fcs_anchor),
262 |           tfun = asinh_transform
263 |         )
264 |       anchor_expr <- flowCore::exprs(flowCore::transform(fcs_anchor, anchor_tlist))
265 |
266 |       # find the mean and variance vector of the anchor file
267 |       # (these are the values read by the normalization closures)
268 |       anchor_mean <- apply(anchor_expr, 2, mean)
269 |       anchor_var <- apply(anchor_expr, 2, var)
270 |
271 |       # normalize the anchor file cell by cell (apply() returns channels in
272 |       # rows, hence the transpose)
273 |       anchor_expr_norm <- t(apply(anchor_expr, 1, norm))
274 |
275 |       anchor_mean_norm <- apply(anchor_expr_norm, 2, mean)
276 |       anchor_var_norm <- apply(anchor_expr_norm, 2, var)
277 |
278 |       # normalize the target file
279 |
280 |       ## read in target file
281 |       filename <- md$filename[i]
282 |       fcs <-
283 |         flowCore::read.FCS(
284 |           file.path(input_data_path, paste0(input_prefix, filename)),
285 |           transformation = FALSE,
286 |           truncate_max_range = FALSE
287 |         )
288 |
289 |       # arcsinh-transform all columns of the target file
290 |       col_names <- flowCore::colnames(fcs)
291 |       tlist <- flowCore::transformList(from = col_names, tfun = asinh_transform)
292 |       fcs_asinh <- flowCore::transform(fcs, tlist)
293 |
294 |       # extract target file's expression matrix before normalization
295 |       expr_b4norm <- flowCore::exprs(fcs_asinh)
296 |
297 |       # find the target file's un-normalized mean and variance vectors
298 |       mean_b4norm <- apply(expr_b4norm, 2, mean)
299 |       var_b4norm <- apply(expr_b4norm, 2, var)
300 |
301 |       ## normalize the target file
302 |       expr_norm <- t(apply(expr_b4norm, 1, norm))
303 |
304 |       # find the mean and variance vectors of the normalized target file
305 |       mean_norm <- apply(expr_norm, 2, mean)
306 |       var_norm <- apply(expr_norm, 2, var)
307 |
308 |       # create flowFrame to be written as the output .fcs file for this sample
309 |       fcs_norm <- flowCore::flowFrame(expr_norm)
310 |
311 |       # normalization completed, reverse asinh transformation for final output
312 |       tlist2 <- flowCore::transformList(from = col_names, tfun = my_rev_asinh)
313 |       fcs_asinh_rev <- flowCore::transform(fcs_norm, tlist2)
314 |
315 |       # prepare and write out final .fcs file (channel descriptions are copied
316 |       # from the input file)
317 |       flowCore::pData(flowCore::parameters(fcs_asinh_rev))$desc <-
318 |         flowCore::pData(flowCore::parameters(fcs_asinh))$desc
319 |       fcs_name <- file.path(output_data_path, paste0(output_prefix, filename))
320 |       flowCore::write.FCS(x = fcs_asinh_rev, filename = fcs_name)
321 |
322 |       # update final data structure
323 |       result$universal_mean[[i]] <- universal_mean
324 |       result$universal_var[[i]] <- universal_var
325 |       result$anchor_mean[[i]] <- anchor_mean
326 |       result$anchor_var[[i]] <- anchor_var
327 |       result$mean_b4norm[[i]] <- mean_b4norm
328 |       result$var_b4norm[[i]] <- var_b4norm
329 |       result$mean_norm[[i]] <- mean_norm
330 |       result$var_norm[[i]] <- var_norm
331 |       result$anchor_mean_norm[[i]] <- anchor_mean_norm
332 |       result$anchor_var_norm[[i]] <- anchor_var_norm
333 |     }
334 |
335 |     # add marker list and arcsinh transformation parameters to the final data structure
336 |     attr(result, which = "shift_factor") <- shift_factor
337 |     attr(result, which = "scale_factor") <- scale_factor
338 |     attr(result, which = "all_markers") <- all_markers
339 |
340 |     # return result
341 |     return(result)
342 |
343 |   }
344 |
--------------------------------------------------------------------------------
/R/cytofin_make_plots.R:
--------------------------------------------------------------------------------
1 | #' Make diagnostic plots to evaluate CytofIn batch normalization
2 | #'
3 | #' When given the output data structure from `cytofin_normalize` or `cytofin_normalize_nrs`,
4 | #' this function plots mean and variance plots for all normalized .fcs files and their
5 | #' associated anchors.
6 | #'
7 | #' @param normalization_result An output data.frame produced by the `cytofin_normalize` or
8 | #' `cytofin_normalize_nrs` function.
9 | #'
10 | #' The following columns should be present: `filename`,
11 | #' `cohort`, `plate_number`, `is_anchor`, `validation`, `universal_mean`,
12 | #' `universal_var`, `anchor_mean`, `anchor_var`, `mean_b4norm`, `var_b4norm`,
13 | #' `mean_norm`, `var_norm`, `anchor_mean_norm`, `anchor_var_norm`. The attributes
14 | #' `all_markers`, `shift_factor`, and `scale_factor` must also be present.
15 | #'
16 | #' @param which_rows A numeric vector indicating which rows of `normalization_result`
17 | #' (i.e. which .fcs files in the combined dataset) should be used for plotting. Defaults
18 | #' to 1:nrow(normalization_result), which will make all possible plots.
19 | #'
20 | #' @param val_path The folder directory containing validation (i.e. bead-normalized)
21 | #' .fcs files corresponding to the input .fcs files in the metadata table. (Optional).
22 | #' Defaults to "none", in which case no validation files are read.
23 | #'
24 | #' @return Nothing (invisible `NULL`); the function is called for its plots.
25 | #' 8 diagnostic plots are made for each input .fcs file that was batch
26 | #' normalized (i.e. each .fcs file represented as a row in `normalization_result`).
27 | #' From left-to-right (and top-to-bottom), these plots represent the following:
28 | #' 1) The entry in the universal mean vector corresponding to each antigen in the
29 | #' consensus antigen panel. X-axis: antigen index in the universal mean vector.
30 | #' Y-axis: Arcsinh-transformed entry in the universal mean vector corresponding
31 | #' to each antigen.
32 | #' 2) The mean (across all cells) antigen expression vector for the anchor
33 | #' associated with each input .fcs file both before and after normalization.
34 | #' X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
35 | #' anchor .fcs file.
36 | #' 3) The mean (across all cells) antigen expression vector for each input
37 | #' .fcs file both before and after normalization.
38 | #' X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression in the
39 | #' input .fcs file.
40 | #' 4) An overlay of the mean (across all cells) antigen expression vector for each
41 | #' input .fcs file before and after normalization and for its "validation"
42 | #' (i.e. bead-normalized) .fcs file.
43 | #' This plot can be used to compare CytofIn batch normalization with gold-
44 | #' standard approaches. If `val_path` is "none", this plot will be identical to
45 | #' plot 3 (see above).
46 | #' X-axis: antigen index (as in plot 1). Y-axis: Mean antigen expression.
47 | #' 5) The entry in the universal standard deviation vector corresponding to each antigen in the
48 | #' consensus antigen panel. X-axis: antigen index in the universal standard deviation vector.
49 | #' Y-axis: Arcsinh-transformed entry in the universal standard deviation vector corresponding
50 | #' to each antigen.
51 | #' 6) The standard deviation (across all cells) antigen expression vector for the anchor
52 | #' associated with each input .fcs file both before and after normalization.
53 | #' X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all
54 | #' antigen expression values in the anchor .fcs file.
55 | #' 7) The standard deviation (across all cells) antigen expression vector for each input
56 | #' .fcs file both before and after normalization.
57 | #' X-axis: antigen index (as in plot 1). Y-axis: the standard deviation of all
58 | #' antigen expression values in the input .fcs file.
59 | #' 8) An overlay of the standard deviation (across all cells) antigen expression vector
60 | #' for each input .fcs file before and after normalization and for its "validation"
61 | #' (i.e. bead-normalized) .fcs file.
62 | #' This plot can be used to compare CytofIn batch normalization with gold-
63 | #' standard approaches. If `val_path` is "none", this plot will be identical to
64 | #' plot 7 (see above).
65 | #' X-axis: antigen index (as in plot 1). Y-axis: Standard deviation of all
66 | #' antigen expression values.
67 | #'
68 | #' @export
69 | #'
70 | cytofin_make_plots <-
71 |   function(
72 |     normalization_result,
73 |     which_rows = 1:nrow(normalization_result),
74 |     val_path = "none"
75 |   ) {
76 |
77 |     # extract needed values from the normalization_result attributes
78 |     all_markers <- attr(normalization_result, which = "all_markers")
79 |     shift_factor <- attr(normalization_result, which = "shift_factor")
80 |     scale_factor <- attr(normalization_result, which = "scale_factor")
81 |
82 |     # collect the anchor files from the complete table, so that the anchor of a
83 |     # file can still be identified when its row is not part of `which_rows`
84 |     md_control <- dplyr::filter(normalization_result, is_anchor == 1)
85 |
86 |     # filter out rows that we aren't interested in plotting
87 |     normalization_result <- normalization_result[which_rows, ]
88 |
89 |     # nothing to plot
90 |     if (nrow(normalization_result) == 0) {
91 |       return(invisible(NULL))
92 |     }
93 |
94 |     has_validation <- val_path != "none"
95 |
96 |     # number of plotted antigens (upper limit of every x-axis)
97 |     len <- length(all_markers)
98 |
99 |     # use a 2 x 4 grid of panels and restore the caller's layout on exit
100 |     old_par <- par(mfrow = c(2, 4))
101 |     on.exit(par(old_par), add = TRUE)
102 |
103 |     # Draw one panel: every vector in `series` is overlaid on the same axes
104 |     # (in list order, using the matching entry of `colors`), followed by a
105 |     # legend whose labels are the names of `series`. `legend_rev = TRUE` lists
106 |     # the legend entries in reverse drawing order.
107 |     draw_panel <-
108 |       function(series, colors, ylab, main = NULL, cex_main = 0.8, legend_rev = FALSE) {
109 |         for (k in seq_along(series)) {
110 |           # only overlay onto a panel that has already been started; a
111 |           # dangling `new = TRUE` would make the next panel overdraw this one
112 |           if (k > 1) {
113 |             par(new = TRUE)
114 |           }
115 |           plot(
116 |             series[[k]],
117 |             col = colors[[k]],
118 |             xlab = "antigen",
119 |             ylab = ylab,
120 |             xlim = c(0, len),
121 |             ylim = c(-5, 10),
122 |             main = if (k == 1) main else NULL,
123 |             cex.main = cex_main
124 |           )
125 |         }
126 |
127 |         legend_index <- seq_along(series)
128 |         if (legend_rev) {
129 |           legend_index <- rev(legend_index)
130 |         }
131 |         legend(
132 |           1,
133 |           10,
134 |           legend = names(series)[legend_index],
135 |           col = colors[legend_index],
136 |           lty = 1:2,
137 |           cex = 0.8
138 |         )
139 |       }
140 |
141 |     # for all rows in the normalization result
142 |     for (i in seq_len(nrow(normalization_result))) {
143 |
144 |       # extract needed values for the current file
145 |       filename <- normalization_result$filename[[i]]
146 |       plate_number <- normalization_result$plate_number[[i]]
147 |       cohort <- normalization_result$cohort[[i]]
148 |       universal_mean <- normalization_result$universal_mean[[i]]
149 |       universal_var <- normalization_result$universal_var[[i]]
150 |       anchor_mean <- normalization_result$anchor_mean[[i]]
151 |       anchor_var <- normalization_result$anchor_var[[i]]
152 |       mean_b4norm <- normalization_result$mean_b4norm[[i]]
153 |       var_b4norm <- normalization_result$var_b4norm[[i]]
154 |       mean_norm <- normalization_result$mean_norm[[i]]
155 |       var_norm <- normalization_result$var_norm[[i]]
156 |       anchor_mean_norm <- normalization_result$anchor_mean_norm[[i]]
157 |       anchor_var_norm <- normalization_result$anchor_var_norm[[i]]
158 |
159 |       # find name of the anchor that corresponds to the current file
160 |       filename_anchor <-
161 |         md_control$filename[
162 |           which(
163 |             (md_control$plate_number == plate_number) &
164 |               (md_control$cohort == cohort)
165 |           )
166 |         ]
167 |
168 |       # vectors shown in the two overlay panels (plots 4 and 8)
169 |       overlay_mean <-
170 |         list(original = mean_b4norm[all_markers], normalized = mean_norm[all_markers])
171 |       overlay_sd <-
172 |         list(
173 |           original = sqrt(var_b4norm[all_markers]),
174 |           normalized = sqrt(var_norm[all_markers])
175 |         )
176 |       overlay_colors <- c("green", "darkgreen")
177 |
178 |       if (has_validation) {
179 |         # read in validation .fcs file
180 |         filename_val <- normalization_result$validation[i]
181 |         fcs <-
182 |           flowCore::read.flowSet(
183 |             file.path(val_path, filename_val),
184 |             transformation = FALSE,
185 |             truncate_max_range = FALSE
186 |           )
187 |
188 |         # arcsinh-transform validation .fcs file
189 |         asinh_transform <-
190 |           flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
191 |         col_names <- flowCore::colnames(fcs)
192 |         tlist <- flowCore::transformList(from = col_names, tfun = asinh_transform)
193 |         fcs_asinh <- flowCore::transform(fcs, tlist)
194 |         expr_val <- flowCore::fsApply(fcs_asinh, flowCore::exprs)
195 |
196 |         # find the mean and variance vector from the validation file
197 |         mean_val <- apply(expr_val, 2, mean)
198 |         var_val <- apply(expr_val, 2, var)
199 |
200 |         # add the validation file to the overlays; the standard deviation
201 |         # (not the variance) is shown so that it is comparable with the
202 |         # other series in plot 8
203 |         overlay_mean$validation <- mean_val[all_markers]
204 |         overlay_sd$validation <- sqrt(var_val[all_markers])
205 |         overlay_colors <- c(overlay_colors, "purple")
206 |       }
207 |
208 |       # start a new 2 x 4 page for this file
209 |       par(mfrow = c(2, 4))
210 |
211 |       # expression (mean)
212 |       # plot 1
213 |       draw_panel(
214 |         list(universal = universal_mean[all_markers]),
215 |         colors = "red",
216 |         ylab = "universal expression (mean)",
217 |         main = "overall",
218 |         cex_main = 1
219 |       )
220 |
221 |       # plot 2
222 |       draw_panel(
223 |         list(
224 |           original = anchor_mean[all_markers],
225 |           normalized = anchor_mean_norm[all_markers]
226 |         ),
227 |         colors = c("cyan", "blue"),
228 |         ylab = "control expression (mean)",
229 |         main = filename_anchor,
230 |         legend_rev = TRUE
231 |       )
232 |
233 |       # plot 3
234 |       draw_panel(
235 |         list(original = mean_b4norm[all_markers], normalized = mean_norm[all_markers]),
236 |         colors = c("green", "darkgreen"),
237 |         ylab = "sample expression (mean)",
238 |         main = filename,
239 |         legend_rev = TRUE
240 |       )
241 |
242 |       # plot 4
243 |       draw_panel(
244 |         overlay_mean,
245 |         colors = overlay_colors,
246 |         ylab = "overlay expression (mean)",
247 |         legend_rev = !has_validation
248 |       )
249 |
250 |       # expression (std)
251 |       # plot 5
252 |       draw_panel(
253 |         list(universal = sqrt(universal_var[all_markers])),
254 |         colors = "red",
255 |         ylab = "universal expression (std)",
256 |         main = "overall",
257 |         cex_main = 1
258 |       )
259 |
260 |       # plot 6
261 |       draw_panel(
262 |         list(
263 |           original = sqrt(anchor_var[all_markers]),
264 |           normalized = sqrt(anchor_var_norm[all_markers])
265 |         ),
266 |         colors = c("cyan", "blue"),
267 |         ylab = "control expression (std)",
268 |         main = filename_anchor,
269 |         legend_rev = TRUE
270 |       )
271 |
272 |       # plot 7
273 |       draw_panel(
274 |         list(
275 |           original = sqrt(var_b4norm[all_markers]),
276 |           normalized = sqrt(var_norm[all_markers])
277 |         ),
278 |         colors = c("green", "darkgreen"),
279 |         ylab = "sample expression (std)",
280 |         main = filename,
281 |         legend_rev = TRUE
282 |       )
283 |
284 |       # plot 8
285 |       draw_panel(
286 |         overlay_sd,
287 |         colors = overlay_colors,
288 |         ylab = "overlay expression (std)"
289 |       )
290 |     }
291 |
292 |     invisible(NULL)
293 |   }
294 |
--------------------------------------------------------------------------------
/R/cytofin_normalize_nrs.R:
--------------------------------------------------------------------------------
#' Batch normalize CyTOF plates from heterogeneous sources using stable channels
#'
#' This function batch normalizes CyTOF data from multiple plates (from one or more
#' experimental cohorts) by computing the non-redundancy score (NRS) for each
#' channel in the dataset, then using the most redundant (i.e. the "most stable")
#' channels as a reference for batch normalization.
#'
#' For every input file, the mean of the `nchannels` selected channels is
#' shifted onto the mean of the same channels across the whole dataset; the
#' same constant offset is applied to every lineage and functional channel of
#' that file.
#'
#' @param metadata_path A filepath leading to an .xlsx or .csv file
#' containing a table of CyTOF file (.fcs file) names. Columns should include
#' `filename`, `cohort`, `plate_number`, `patient_id`, `condition`, `is_anchor`,
#' and `validation`.
#'
#' See the vignette for details: \code{vignette("cytofin", package = "cytofin")}
#'
#' @param panel_path A file path leading to an .xlsx or .csv file containing
#' a table of standardized antigen panel information. Columns should include
#' `metal_name`, `antigen_name`, `antigen_pattern`,
#' `lineage`, `functional`, and `general`.
#'
#' See the vignette for details: \code{vignette("cytofin", package = "cytofin")}
#'
#' @param input_data_path A folder directory containing the input CyTOF files
#' to be normalized. In most cases, this will be the directory to which the output
#' .fcs files from `cytofin_homogenize` were written.
#'
#' @param output_data_path A folder directory to which the output (i.e.
#' batch normalized/batch corrected) .fcs files should be written. The
#' directory is created (recursively) if it does not exist.
#'
#' @param input_prefix The string that was prepended to the name of the input files
#' of `cytofin_homogenize` to create their corresponding output file names.
#' Defaults to "homogenized_".
#'
#' @param output_prefix A string to be prepended to the name of each input file
#' to create the name of the corresponding output file (post-normalization).
#' Defaults to "normalized_" (e.g. an input file named "file1.fcs" will correspond to
#' the output file "normalized_file1.fcs" saved in `output_data_path`).
#'
#' @param shift_factor The scalar value `a` in the following equation used to
#' transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
#'
#'    `new_x <- asinh(a + b * x)`.
#'
#' Defaults to 0.
#'
#' @param scale_factor The scalar value `b` in the following equation used to
#' transform CyTOF raw data ion counts using the hyperbolic arcsinh function:
#'
#'    `new_x <- asinh(a + b * x)`.
#'
#' Defaults to 0.2.
#'
#' @param nchannels An integer representing the number of most stable channels to
#' use during batch normalization. Must be at least 1 and no larger than the
#' number of lineage and functional channels in the panel. Defaults to 3.
#'
#' @param make_plot A boolean value indicating if a plot depicting the non-
#' redundancy scores of each marker in each .fcs file being batch normalized
#' should be plotted as a side-effect of the function call. Defaults to FALSE.
#'
#' @return Batch-normalized .fcs files are saved in the directory specified by
#' `output_data_path`.
#'
#' In addition, a data.frame containing information about
#' each input .fcs file (that can be used for plotting with `cytofin_make_plots`)
#' is returned with the following columns:
#' * All of the columns in the input metadata table (located at `metadata_path`)
#' * __universal_mean__: the universal mean vector to which all files are adjusted
#' (will be identical for all input .fcs files)
#' * __universal_var__: the universal variance vector computed across all cells
#' in the dataset (will be identical for all input .fcs files)
#' * __anchor_mean__: the mean (across all cells) vector of the `nchannels`
#' selected stable channels in each input .fcs file (these channels act as the
#' file's own anchor) before batch normalization.
#' * __anchor_var__: the variance (across all cells) vector of the `nchannels`
#' selected stable channels in each input .fcs file before batch normalization.
#' * __mean_b4norm__: the mean (across all cells) vector of the input .fcs file
#' before batch normalization.
#' * __var_b4norm__: the variance (across all cells) vector of the input .fcs file
#' before batch normalization.
#' * __mean_norm__: the mean (across all cells) vector of the input .fcs file
#' after batch normalization.
#' * __var_norm__: the variance (across all cells) vector of the input .fcs file
#' after batch normalization.
#' * __anchor_mean_norm__: the mean (across all cells) vector of the `nchannels`
#' selected stable channels in each input .fcs file after batch normalization.
#' * __anchor_var_norm__: the variance (across all cells) vector of the `nchannels`
#' selected stable channels in each input .fcs file after batch normalization.
#'
#' The returned data.frame also carries the attributes `shift_factor`,
#' `scale_factor`, and `all_markers`.
#'
#' @export
#'
cytofin_normalize_nrs <-
  function(
    metadata_path,
    panel_path,
    input_data_path,
    output_data_path,
    input_prefix = "homogenized_",
    output_prefix = "normalized_",
    shift_factor = 0,
    scale_factor = 0.2,
    nchannels = 3,
    make_plot = FALSE
  ) {

    # fail early (before touching the file system) on an unusable `nchannels`
    if (
      !is.numeric(nchannels) ||
      length(nchannels) != 1L ||
      is.na(nchannels) ||
      nchannels < 1
    ) {
      stop("`nchannels` must be a single number greater than or equal to 1.", call. = FALSE)
    }

    # create output directory
    dir.create(output_data_path, showWarnings = FALSE, recursive = TRUE)

    # read metadata table
    md <- cytofin_read_metadata(metadata_path)

    # read in standardized panel
    ref_panel <- cytofin_read_panel_info(panel_path = panel_path)

    # compile list of all markers to keep during analysis
    lineage_markers <-
      as.character(ref_panel$metal_name[ref_panel$lineage == 1])

    functional_markers <-
      as.character(ref_panel$metal_name[ref_panel$functional == 1])

    all_markers <- c(lineage_markers, functional_markers)

    # create final data structure (list-columns are filled in file by file below)
    result <-
      dplyr::mutate(
        md,
        universal_mean = list(0),
        universal_var = list(0),
        anchor_mean = list(0),
        anchor_var = list(0),
        mean_b4norm = list(0),
        var_b4norm = list(0),
        mean_norm = list(0),
        var_norm = list(0),
        anchor_mean_norm = list(0),
        anchor_var_norm = list(0)
      )

    # non-redundancy score of every channel (column) of `x`: the absolute PCA
    # loadings of the first `ncomp` components, weighted by each component's
    # variance and summed per channel
    NRS <- function(x, ncomp = 3) {
      pr <- prcomp(x, center = TRUE, scale. = FALSE)
      score <-
        rowSums(
          outer(
            rep(1, ncol(x)),
            pr$sdev[1:ncomp]^2
          ) *
            abs(pr$rotation[, 1:ncomp])
        )
      return(score)
    }

    # read in all .fcs files to be normalized
    fcs_set <-
      flowCore::read.flowSet(
        file.path(input_data_path, paste0(input_prefix, md$filename)),
        transformation = FALSE,
        truncate_max_range = FALSE
      )

    # arcsinh transform all channels of the input .fcs files
    asinh_transform <-
      flowCore::arcsinhTransform(a = shift_factor, b = scale_factor)
    set_tlist <-
      flowCore::transformList(
        from = flowCore::colnames(fcs_set),
        tfun = asinh_transform
      )
    fcs_set_asinh <- flowCore::transform(fcs_set, set_tlist)

    # inverse of the arcsinh transformation, applied before writing the output
    my_rev_asinh <-
      function(x) {
        rev_asinh(x, shift_factor = shift_factor, scale_factor = scale_factor)
      }

    # find the mean and variance vector of all cells in the dataset
    expr <- flowCore::fsApply(fcs_set_asinh, flowCore::exprs)
    universal_mean <- apply(expr, 2, mean)
    universal_var <- apply(expr, 2, var)

    # calculate non-redundancy scores for each antigen in each .fcs file
    nrs_sample <-
      flowCore::fsApply(fcs_set_asinh[, all_markers], NRS, use.exprs = TRUE)

    # find mean non-redundancy scores for each antigen across all samples
    colnames(nrs_sample) <-
      as.character(
        ref_panel$antigen_name[match(colnames(nrs_sample), ref_panel$metal_name)]
      )
    nrs <- colMeans(nrs_sample, na.rm = TRUE)

    if (nchannels > length(nrs)) {
      stop(
        "`nchannels` (", nchannels, ") is larger than the number of lineage ",
        "and functional channels available (", length(nrs), ").",
        call. = FALSE
      )
    }

    if (make_plot) {
      markers_ord <- names(sort(nrs, decreasing = TRUE))

      # check.names = FALSE keeps antigen names such as "HLA-DR" intact;
      # otherwise they are rewritten (e.g. to "HLA.DR"), stop matching
      # `markers_ord`, and turn into NA when the factor is built below
      nrs_df <- data.frame(nrs_sample, check.names = FALSE)
      nrs_df$sample_id <- rownames(nrs_df)

      # make data.frame for plotting
      ggdf <-
        reshape2::melt(
          nrs_df,
          id.vars = "sample_id",
          value.name = "nrs",
          variable.name = "antigen"
        )

      ggdf$antigen <-
        factor(ggdf$antigen, levels = markers_ord)

      # make plot
      p <-
        ggplot2::ggplot(ggdf, ggplot2::aes(x = antigen, y = nrs)) +
        ggplot2::geom_point(
          ggplot2::aes(color = sample_id),
          alpha = 0.9,
          position = ggplot2::position_jitter(width = 0.3, height = 0)
        ) +
        ggplot2::geom_boxplot(outlier.color = NA, fill = NA) +
        ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, fill = "white") +
        ggplot2::theme_bw() +
        ggplot2::theme(
          axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust = 1)
        )

      print(p)
    }

    # select nchannels antigens with the lowest NRS for calibration
    selected_antigens <- names(sort(nrs, decreasing = FALSE))[1:nchannels]

    # find the metal names corresponding to the chosen antigens
    selected_markers <-
      as.character(
        ref_panel$metal_name[match(selected_antigens, ref_panel$antigen_name)]
      )

    # target level: dataset-wide mean of the selected (stable) channels
    universal_selected_mean <- mean(universal_mean[selected_markers])

    for (i in seq_along(md$filename)) {
      # read in input .fcs file (each file is read and transformed once)
      filename <- md$filename[i]
      fcs_file <-
        flowCore::read.FCS(
          file.path(input_data_path, paste0(input_prefix, filename)),
          transformation = FALSE,
          truncate_max_range = FALSE
        )

      # asinh-transform input .fcs file
      col_names <- flowCore::colnames(fcs_file)
      tlist <- flowCore::transformList(from = col_names, tfun = asinh_transform)
      fcs_file_asinh <- flowCore::transform(fcs_file, tlist)
      expr_b4norm <- flowCore::exprs(fcs_file_asinh)

      # find the mean and variance of the nchannels selected channels in the
      # .fcs file; drop = FALSE keeps a matrix even when nchannels is 1
      expr_ctr <- expr_b4norm[, selected_markers, drop = FALSE]
      mean_ctr <- apply(expr_ctr, 2, mean)
      var_ctr <- apply(expr_ctr, 2, var)

      # find the mean and variance vectors before batch correction
      mean_b4norm <- apply(expr_b4norm, 2, mean)
      var_b4norm <- apply(expr_b4norm, 2, var)

      # batch normalize: the correction is one constant offset per file, so it
      # is applied to whole columns at once instead of cell by cell. Only the
      # lineage/functional channels are shifted; all others are left untouched.
      marker_idx <- match(all_markers, colnames(expr_b4norm))
      expr_norm <- expr_b4norm
      expr_norm[, marker_idx] <-
        expr_norm[, marker_idx] -
        mean(mean_ctr[selected_markers]) +
        universal_selected_mean

      # calculate mean and variance vectors for the normalized input .fcs file
      mean_norm <- apply(expr_norm, 2, mean)
      var_norm <- apply(expr_norm, 2, var)

      # mean and variance of the selected channels after normalization
      expr_ctr_norm <- expr_norm[, selected_markers, drop = FALSE]
      mean_ctr_norm <- apply(expr_ctr_norm, 2, mean)
      var_ctr_norm <- apply(expr_ctr_norm, 2, var)

      # create output flowFrame
      fcs_norm <- flowCore::flowFrame(expr_norm)

      # normalization completed, reverse transformation
      tlist_rev <- flowCore::transformList(from = col_names, tfun = my_rev_asinh)
      fcs_asinh_rev <- flowCore::transform(fcs_norm, tlist_rev)

      # carry the antigen descriptions over from the input file
      flowCore::pData(flowCore::parameters(fcs_asinh_rev))$desc <-
        flowCore::pData(flowCore::parameters(fcs_file_asinh))$desc

      # write out output .fcs file
      fcs_name <- file.path(output_data_path, paste0(output_prefix, filename))
      flowCore::write.FCS(fcs_asinh_rev, fcs_name)

      # update final data structure
      result$universal_mean[[i]] <- universal_mean
      result$universal_var[[i]] <- universal_var
      result$anchor_mean[[i]] <- mean_ctr
      result$anchor_var[[i]] <- var_ctr
      result$mean_b4norm[[i]] <- mean_b4norm
      result$var_b4norm[[i]] <- var_b4norm
      result$mean_norm[[i]] <- mean_norm
      result$var_norm[[i]] <- var_norm
      result$anchor_mean_norm[[i]] <- mean_ctr_norm
      result$anchor_var_norm[[i]] <- var_ctr_norm
    }

    # add marker list and arcsinh transformation parameters to the final data structure
    attr(result, which = "shift_factor") <- shift_factor
    attr(result, which = "scale_factor") <- scale_factor
    attr(result, which = "all_markers") <- all_markers

    # return result
    return(result)
  }
356 |
--------------------------------------------------------------------------------
/vignettes/cytofin.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "CytofIn Tutorial"
3 | author: "Timothy Keyes"
4 | output: rmarkdown::html_vignette
5 | vignette: >
6 | %\VignetteIndexEntry{CytofIn Tutorial}
7 | %\VignetteEngine{knitr::rmarkdown}
8 | \usepackage[utf8]{inputenc}
9 |
10 | ---
11 |
12 |
13 | ```{r, include = FALSE}
14 | knitr::opts_chunk$set(
15 | collapse = TRUE,
16 | comment = "#>",
17 | dpi = 150,
18 | out.width = "100%"
19 | )
20 | ```
21 |
22 |
23 | CytofIn (**CyTOF** **In**tegration) is an R package for homogenizing and
24 | normalizing heterogeneous [mass cytometry
25 | (CyTOF)](https://pubmed.ncbi.nlm.nih.gov/21551058/) data from diverse
26 | data sources. Specifically, `CytofIn` provides functions that perform the
27 | following tasks:
28 |
29 | - **Dataset homogenization** - CyTOF datasets that were collected
30 | separately may differ in which markers were included in their
31 | antibody panels; in addition, they may use different naming
32 | conventions for their panels' shared markers. Thus, data mining
33 | across multiple CyTOF datasets requires **homogenization,** the
34 | process of aligning each dataset's antibody panels so that they can
35 | be analyzed together. In `CytofIn`, data homogenization (i.e. panel
36 | alignment) is performed with the `cytofin_homogenize` function that
37 | leverages user-provided panel information to combine datasets.
38 | - **Dataset normalization** - Combined analysis of multiple CyTOF
39 | datasets is likely to be confounded by dataset-to-dataset batch
40 | effects due to differences in instrumentation and experimental
41 | protocols between groups. To normalize multiple CyTOF datasets with
42 | respect to these batch effects, `CytofIn` provides 3 functions:
43 | `cytofin_prep_anchors`, `cytofin_normalize`, and
44 | `cytofin_normalize_nrs`.
45 | - **Visualization** - After batch normalization, the means and
46 | standard deviations for each of the input .fcs files (as well as
47 | their associated anchors) can be visualized using the
48 | `cytofin_make_plots` function.
49 |
50 | The general CytofIn workflow unfolds in 3 steps. First, users align the
51 | panels of the CyTOF datasets being integrated using
52 | `cytofin_homogenize()`. Second, users generate reference statistics from
53 | "generalized anchors" identified on each CyTOF plate (see below) using
54 | `cytofin_prep_anchors()`. Finally, users can then normalize/batch
55 | correct the datasets relative to one another using their choice of
56 | `cytofin_normalize()` or `cytofin_normalize_nrs()`, each of which
57 | performs the normalization procedure differently (see below).
58 |
59 | # Installation
60 |
61 | To install CytofIn, run the following code:
62 |
63 | ```{r, eval = FALSE}
64 | library(devtools)
65 | install_github("bennyyclo/Cytofin")
66 | ```
67 |
68 | To attach the CytofIn package to your current R session, run the
69 | following line:
70 |
71 | ```{r}
72 | library(cytofin)
73 | ```
74 |
75 | # Data for this vignette
76 |
77 | ## Establishing a root directory
78 |
79 | For the sake of this vignette, we will work within a single folder,
80 | where we will store the input data, the output data, and all
81 | intermediate files from the CytofIn pipeline. We will default to using
82 | the current working directory, but feel free to modify the following
83 | line of code to change which path you want to use.
84 |
85 | ```{r}
86 | # change this path to wherever you want this vignette to find and store
87 | # its input and output files
88 | base_path <- getwd()
89 | ```
90 |
91 | ```{r, include = FALSE}
92 | base_path <- file.path("~", "Desktop", "cytofin_tests")
93 | ```
94 |
95 | ## Downloading the data
96 |
97 | Now that we've identified the root directory we'll use for this
98 | vignette, we will create two folders in which we will store the raw
99 | input data and the validation (bead-normalized) data used in this
100 | vignette:
101 |
102 | ```{r}
# create the two data folders; recursive = TRUE also creates `base_path`
# itself when it does not exist yet (otherwise the call fails silently
# because warnings are suppressed)
dir.create(file.path(base_path, "raw_data"), showWarnings = FALSE, recursive = TRUE)
dir.create(file.path(base_path, "validation_data"), showWarnings = FALSE, recursive = TRUE)
105 | ```
106 |
107 | To fill each of these folders with the .fcs files we're analyzing in
108 | this vignette, please download the raw input files
109 | [here](https://flowrepository.org/id/FR-FCM-Z427) and the validation
110 | files [here](https://flowrepository.org/id/FR-FCM-Z42C) on
111 | [FlowRepository](https://flowrepository.org/). Once the files are
112 | downloaded, unzip them. Finally, move all of the unzipped .fcs files from each repository into the `raw_data` and
113 | `validation_data` folders that we just created, respectively.
114 |
115 | # Usage
116 |
117 | ## CyTOF data homogenization (cytofin_homogenize)
118 |
119 | Here, the term "homogenization" refers to the process of aligning the
120 | antigen panels of multiple CyTOF experiments by (1) removing all
121 | channels that are not shared across all cohorts and (2) standardizing
122 | the antigen names used to refer to each channel so that existing
123 | analysis tools (like the `flowCore` and `tidyverse` packages) can be
124 | applied in later analytical steps. In CytofIn, dataset homogenization is
125 | performed using the `cytofin_homogenize()` function.
126 |
127 | The `cytofin_homogenize()` function takes several arguments. The first
128 | of these is `metadata_path`, a string that specifies the file path to a
129 | .csv or .xlsx metadata file containing information about each of the
130 | .fcs files being analyzed. Specifically, the metadata file will have one
131 | row for each .fcs file being analyzed and must contain the following
132 | columns (all of which will be converted to character vectors):
133 |
134 | - **filename -** Required. The name of the .fcs file within its local
135 | directory.
136 | - **cohort -** Required. The name of the cohort (i.e. experimental
137 | source) of each .fcs file.
138 | - **plate_number -** Required. The name of the CyTOF plate (e.g.
139 | "plate1", "plate2", etc.) on which the sample corresponding to each
140 | .fcs file was analyzed during data acquisition.
141 | - **patient_id -** Optional. The name of the patient to whom each .fcs
142 | file corresponds.
143 | - **condition -** Optional. The stimulation condition corresponding to
144 | each .fcs file (i.e. "basal", "IL-3", etc.).
145 | - **is_anchor -** Required. A numeric column indicating whether or not
146 | each sample should be used as an "anchor" for the batch correction
147 | procedure (1 if yes; 0 if no). Exactly one anchor should be
148 | identified for each CyTOF plate being analyzed.
149 | - **validation -** Optional. The name of the
150 | [bead-normalized](https://pubmed.ncbi.nlm.nih.gov/23512433/) .fcs
151 | file corresponding to each input file listed in the `filename`
152 | column (per gold-standard batch normalization procedure in CyTOF
153 | batch correction). Most users will ignore this column because
154 | bead-normalized data will not be available, but it can be used to
155 | validate the results of the CytofIn batch normalization algorithms
156 | if bead-normalized data are available.
157 |
158 | Importantly, only the fields marked as "required" are needed for
159 | `cytofin_homogenize()` to work; "NA" can be recorded for any/all
160 | optional columns that don't apply to the experimental design of the
161 | files being analyzed (for example, if no stimulation conditions were
162 | used in the studies being integrated, enter "NA" for each element of the
163 | `condition` column). Alternatively, these columns can be omitted from
164 | the metadata table entirely. The following image provides a visual summary of the metadata table used throughout the `CytofIn` pipeline.
165 |
166 | 
167 |
168 | For the user's convenience, the `cytofin_generate_metadata_template`
169 | function is provided to generate an example metadata .csv file filled
170 | with dummy example data in a location specified by the user:
171 |
172 | ```{r, eval = FALSE}
173 | # specify the path where you'd like to store the template file
174 | my_path <- file.path(base_path, "template_folder")
175 |
176 | # generate the template file, which then can be edited manually
177 | cytofin_generate_metadata_template(template_path = my_path)
178 | ```
179 |
180 | The second argument for `cytofin_homogenize` is `panel_path`, a string
181 | that specifies the file path to a .csv or .xlsx file containing
182 | information about the panel(s) of each of the .fcs files being analyzed.
183 | Each row represents a channel (i.e. a protein measurement) to be
184 | included in the final, homogenized panel. This file must contain the
185 | following columns:
186 |
187 | - **metal_name -** A character vector representing the name of the
188 | metal isotope measured by each channel.
189 | - **antigen_name -** A character vector representing the name of the
190 | antigen associated with a given metal isotope in the consensus panel
191 | (the final antigen name to assign to a given channel during
192 | homogenization).
193 | - **antigen_pattern -** A regular expression used to match antigen
194 | names that may differ slightly across different .fcs files. For
195 | example, the regular expression "(C\|c)(D\|d)45" will detect all of
196 | the following channel names: "cd45", "CD45", "Cd45", or "cD45".
197 | - **lineage -** A numeric vector representing whether or not a marker
198 | is a lineage marker (1 if yes; 0 otherwise).
199 | - **functional -** A numeric vector representing whether or not a
200 | marker is a functional marker (1 if yes; 0 otherwise).
201 | - **general -** A numeric vector representing whether or not a marker
202 | is a "general" (i.e. neither a lineage nor a functional) marker (1
203 | if yes; 0 otherwise).
204 |
205 | The layout of this antigen table (and how it's used during .fcs file homogenization) is displayed in the picture below.
206 |
207 | 
208 |
As with `cytofin_generate_metadata_template`, the `cytofin_generate_panel_template` function is provided to
generate an example panel information .csv file filled with dummy example data:
211 |
212 | ```{r, eval = FALSE}
213 | # generate the template file, which then can be edited manually
214 | cytofin_generate_panel_template(template_path = my_path)
215 | ```
216 |
217 | For many users, the most difficult part of filling out the consensus
218 | panel information table will be designing the regular expressions for
219 | the `antigen_pattern` column. However, in most cases the required
220 | regular expressions will be quite simple; for a primer on regular
221 | expressions (and their use in the
222 | [`stringr`](https://stringr.tidyverse.org/) package) written by
223 | [RStudio](https://www.rstudio.com/about/), install the `stringr` package
224 | and read the following vignette:
225 |
226 | ```{r, eval = FALSE}
227 | vignette(topic = "regular-expressions", package = "stringr")
228 | ```
229 |
230 | The next two arguments for `cytofin_homogenize` are `input_data_path`
231 | and `output_data_path`, two strings that indicate which directory input
232 | .fcs files should be read from and which directory homogenized .fcs
233 | files should be written to, respectively. Lastly, the final two
234 | arguments are optional: `prefix` allows the user to specify the prefix
235 | appended to each input .fcs file name to get the name of the
236 | corresponding output (i.e. homogenized) .fcs file name, and `verbose` is
237 | a boolean value (default = FALSE) specifying if chatty print statements
238 | should be made while the homogenization is performed.
239 |
240 | Using these arguments, `cytofin_homogenize` can homogenize a set of
CyTOF files with distinct antigen naming conventions. Specifically, the
program performs a regular expression search to match each channel's
antigen name against the patterns in the panel and then replaces it with
the corresponding standardized name from the panel.
245 |
246 | Example function call:
247 |
248 | ```{r, warning = FALSE}
# locate the example metadata and panel tables that ship with the package
metadata_path <-
  system.file("extdata", "test_metadata_raw.csv", package = "cytofin")
panel_path <-
  system.file("extdata", "test_panel.csv", package = "cytofin")

# folders holding the raw and the validation (bead-normalized) .fcs files
input_data_path <- file.path(base_path, "raw_data")
validation_data_path <- file.path(base_path, "validation_data")

# folder that will receive the homogenized .fcs files
# --Change this line to wherever you want the output files saved!--
output_data_path <- file.path(base_path, "homogenization_output")

# homogenize every file listed in the metadata table
cytofin_homogenize(
  metadata_path = metadata_path,
  panel_path = panel_path,
  input_data_path = input_data_path,
  output_data_path = output_data_path
)
279 | ```
280 |
281 | This function call will save homogenized .fcs files to the directory
282 | located at `output_data_path`. These files will be different from the
283 | input .fcs files in the `input_data_path` directory in that they will
284 | only contain channels whose antigen names match the `antigen_pattern`
285 | column of the reference panel located at `panel_path`. All other
286 | channels will be removed, and the names of the channels with matches in
287 | `antigen_pattern` will be standardized to the names given in the
288 | `antigen_name` column of the reference panel.
289 |
290 | The input files for this homogenization run were as follows:
291 |
292 | ```{r}
# escape the dot so that only names ending in the literal ".fcs" are listed
list.files(input_data_path, pattern = "\\.fcs$")
294 | ```
295 |
...and the corresponding output files saved in the `output_data_path`
directory are now as follows:
298 |
299 | ```{r}
# escape the dot so that only names ending in the literal ".fcs" are listed
list.files(output_data_path, pattern = "\\.fcs$")
301 | ```
302 |
303 | ## CyTOF batch normalization
304 |
305 | After dataset homogenization, **batch correction** (or **batch
306 | normalization**) can be performed across datasets.
307 |
In short, `CytofIn` performs batch normalization through the use of
309 | user-identified **generalized anchors** - which are non-identical references assumed to have low variability across batches - that can be used to estimate batch effects from samples collated from heterogeneous sources. To batch normalize using healthy control samples (one per plate) as generalized anchors (which
310 | is ideal when such samples are available), use `cytofin_normalize`. To
311 | batch normalize using the antigen channels with the lowest variability across samples as generalized anchors (which is ideal when healthy samples are unavailable on all plates being analyzed), use `cytofin_normalize_nrs`.
312 |
313 | The use of both of these functions is detailed below.
314 |
315 | ### Batch normalization using external anchors (cytofin_normalize)
316 |
317 | #### Overview
318 |
The `cytofin_normalize` function uses user-identified external anchors on each
320 | CyTOF plate being integrated to correct batch effects on a
321 | plate-to-plate basis. One sample on each CyTOF barcoding plate should be
322 | chosen as that plate's external anchor. In general, external anchors
323 | should be chosen based on which samples are the most biologically
324 | similar to one another from plate to plate. For example, if healthy,
325 | non-stimulated samples are included on each CyTOF plate being
326 | integrated, the only expected variability between these samples other
327 | than batch effects would be person-to-person variability. Thus, these
328 | samples are likely to be biologically similar to one another and are
329 | suitable to be chosen as external anchors. Alternatively, if a single
330 | patient or cell line was included on every CyTOF plate being integrated,
the samples corresponding to that patient or cell line on each plate
would also be suitable as external anchor choices.
333 |
334 | Once users have identified 1 external anchor per plate for `CytofIn`
335 | data integration, users must mark its row in the metadata table with a
336 | "1" in the `is_anchor` column (all other samples should be marked with
337 | "0"). `CytofIn` then uses these anchors to define a **universal mean**
338 | and **universal variance** that represent the central tendency and
339 | dispersion, respectively, of the target distribution to which all
340 | samples will be batch corrected. This correction will be performed with
341 | the user's choice from one of five batch correction functions.
342 |
343 | In short, `CytofIn`'s batch normalization procedure using external
344 | anchors has two steps:
345 |
346 | 1. Preparation of external anchors
347 | 2. Application of a transformation function that performs the batch
348 | correction (of which `CytofIn` provides 5 options)
349 |
350 | We detail function calls for each of these steps below.
351 |
352 | #### Step 1 - Anchor preparation
353 |
354 | The `cytofin_prep_anchors` function concatenates the identified anchor
355 | files and then calculates summary statistics that are used for batch
correction in later steps of the pipeline. First, `CytofIn` calculates
the mean and variance of each channel in the homogenized
358 | dataset across all cells from samples identified as external anchors.
359 | These values represent the overall central tendency and dispersion,
360 | respectively, of each channel among the anchor samples on each CyTOF
361 | plate; thus, we call them the **universal means** and **universal
362 | variances** of the `CytofIn` integration. Accordingly, the universal
363 | mean and universal variance vectors will each have *g* elements, where
364 | *g* is the number of channels in the consensus antigen panel in the
365 | panel information table. The universal mean and universal variance
366 | vectors are used in the `meanshift`, `variance`, `z-score`, and
367 | `beadlike` methods of batch correction (see below).
368 |
369 | In addition, the mean of all of the elements of the universal mean
370 | vector (i.e. the mean of all channel means) and the mean of all of the
371 | elements of the universal variance vector (i.e. the mean of all channel
372 | variances) are calculated. These values represent the central tendency
373 | and dispersion of antigen measurements in general among the healthy
374 | control samples on each CyTOF plate and are thus no longer
375 | channel-specific. Thus, we call them the *bulk mean* and *bulk
376 | variance*, and they are used in the `meanshift_bulk` batch correction
377 | method implemented in `cytofin_normalize`.
378 |
379 | To calculate these values, we use the `cytofin_prep_anchors` function.
380 | `cytofin_prep_anchors` returns the universal mean vector, universal
381 | variance vector, bulk mean, and bulk variance as a `list()`. In
382 | addition, users are given an option to save these statistics as an .rds
383 | file in a specified directory in order to avoid performing redundant
384 | calculations in future analyses.
385 |
386 | Specifically, `cytofin_prep_anchors` takes the following 4 arguments (the first 3 are required; `output_path` has a default):
387 |
388 | - `metadata_path`: A connection leading to an .xlsx or .csv file
389 | containing a metadata table with information about each file to be
390 | analyzed. This file should be identical to that used for
391 | `cytofin_homogenize`.
392 | - `panel_path`: A connection leading to an .xlsx or .csv file
393 | containing information about the standardized antigen panel in the
394 | homogenized dataset. This file should be identical to that used for
395 | `cytofin_homogenize`.
396 | - `input_data_path`: A connection to a directory containing the input
397 | .FCS files from which to draw summary statistics
398 | - `output_path`: A connection to a directory where the output .rds and
399 | .FCS files will be saved. The default is "none", in which case no
400 | output files will be stored (and the only effect of the function
401 | will be to return the calculated statistics as a `list()`).
402 |
403 | In addition, `cytofin_prep_anchors` also takes 2 optional arguments
404 | relating to the conventional arcsinh transformation performed on the raw
405 | ion counts of the input data. These optional arguments are as follows:
406 |
407 | - `shift_factor`: The scalar value `a` in the following equation used
408 | to transform CyTOF raw data ion counts using the hyperbolic arcsinh
409 | function: `new_x <- asinh(a + b * x)`. Defaults to 0.
410 |
411 | - `scale_factor`: The scalar value `b` in the following equation used
412 | to transform CyTOF raw data ion counts using the hyperbolic arcsinh
413 | function: `new_x <- asinh(a + b * x)`. Defaults to 0.2.
414 |
415 | Finally, here is an example function call of `cytofin_prep_anchors`:
416 |
417 | ```{r}
418 | input_data_path <- file.path(base_path, "homogenization_output")
419 | output_path <- file.path(base_path, "anchor_prep_output")
420 |
421 | anchor_statistics <-
422 | cytofin_prep_anchors(
423 | metadata_path = metadata_path,
424 | panel_path = panel_path,
425 | input_data_path = input_data_path,
426 | output_path = output_path
427 | )
428 |
429 | print(anchor_statistics)
430 | ```
431 |
432 | As shown above, the returned value is a list with 4 items in it: the
433 | universal variance vector (`universal_var`), the universal mean vector
434 | (`universal_mean`), the bulk variance (`bulk_var`) and the bulk mean
435 | (`bulk_mean`). Note that the elements of `universal_var` and
436 | `universal_mean` are named with their corresponding metal names (not
437 | antigen names), as this interfaces a bit more conveniently with the
438 | `flowCore` functions that `CytofIn` uses under-the-hood.
439 |
440 | Importantly, you only need to use `cytofin_prep_anchors` if you plan to
441 | batch normalize your .fcs files using external anchors identified on
442 | each plate (using `cytofin_normalize`). If you plan to batch normalize
443 | your .fcs files using non-redundancy scores from each sample's most
444 | stable channels (using `cytofin_normalize_nrs`), you do not need to run
445 | `cytofin_prep_anchors` first.
446 |
447 | #### Step 2 - Batch normalization
448 |
449 | After the anchors' summary statistics are computed, batch correction
450 | using external anchors can be performed using
451 | `cytofin_normalize`. This function can perform batch correction using 5
452 | different normalization functions (which we call "modes"). Specifically, the options are called the "meanshift", "meanshift_bulk", "variance", "z-score", and "beadlike" normalization functions. Which of
453 | these is most applicable to a given analysis will differ from user to
454 | user. We recommend that users try several modes and then manually
455 | inspect/visualize the batch-corrected data in order to determine which
456 | method they prefer.
457 |
458 | To perform batch normalization using external anchors identified on each
459 | plate, use `cytofin_normalize`. This batch normalization strategy
460 | assumes that the anchors on each plate are relatively similar to one
461 | another, and it uses this similarity to adjust the marker expression
462 | measurements on each plate based on how much each plate's anchor differs
463 | from the other anchors. The `cytofin_normalize` function takes several
464 | required arguments:
465 |
466 | - `metadata_path`: A connection leading to an .xlsx or .csv file
467 | containing a metadata table with information about each file to be
468 | analyzed. This file should be identical to that used for
469 | `cytofin_homogenize`.
470 | - `panel_path`: A connection leading to an .xlsx or .csv file
471 | containing information about the standardized antigen panel in the
472 | homogenized dataset. This file should be identical to that used for
473 | `cytofin_homogenize`.
474 | - `anchor_statistics`: Either a list of numeric values produced by the
475 | `cytofin_prep_anchors` function or a connection leading to an .rds
476 | object containing anchor statistics.
477 | - `input_data_path`: A connection to a directory containing the input
478 | .fcs files to be batch normalized. In most cases, this will be the
479 | directory to which the output .FCS files from `cytofin_homogenize`
480 | were written.
481 | - `output_data_path`: A connection to a directory where the output
482 | (i.e. batch normalized) .FCS files will be written.
483 | - `mode`: A string indicating which transformation function should be
484 | used for batch normalization ("meanshift", "meanshift_bulk",
485 | "variance", "z-score", or "beadlike").
486 |
487 | In addition to these required arguments, `cytofin_normalize` takes
488 | several optional arguments:
489 |
490 | - `input_prefix`: The string that was prepended to the name of the raw
491 | input .fcs files of `cytofin_homogenize` to create their
492 | corresponding output file names. Defaults to "homogenized\_".
493 |
494 | - `output_prefix`: The string to be prepended to the name of each input
495 | .fcs file to create the name of the corresponding output file
496 | (post-normalization). Defaults to "normalized\_".
497 |
498 | - `shift_factor` and `scale_factor`: The scalar values *a* and *b*,
499 | respectively, to be used in the hyperbolic arc-sine function used to
500 | transform CyTOF ion counts according to the following equation:
501 | `new_x <- asinh(a + b * x)`. `shift_factor` defaults to 0 and
502 | `scale_factor` defaults to 0.2, which are customary values used by
503 | most scientists in the CyTOF community.
504 |
505 | Using these arguments, a call to `cytofin_normalize` will perform the
506 | batch correction and save the output (i.e. batch normalized) .fcs files
507 | to the directory specified by `output_data_path`. An example function
508 | call is given here:
509 |
510 | ```{r}
511 | output_data_path <-
512 | file.path(base_path, "normalization_results")
513 |
514 | norm_result <-
515 | cytofin_normalize(
516 | metadata_path = metadata_path,
517 | panel_path = panel_path,
518 | anchor_statistics = anchor_statistics,
519 | input_data_path = input_data_path,
520 | output_data_path = output_data_path,
521 | mode = "meanshift"
522 | )
523 | ```
524 |
525 | When this function is called, it has two effects. The first is to save
526 | the batch-normalized output .fcs files to the `output_data_path`
527 | directory. The second is to return a data.frame that stores mean and
528 | variance information about each input file (as well as its associated
529 | anchor) both before and after normalization. This data.frame can be
530 | passed directly into the `cytofin_make_plots` function to return 8
531 | diagnostic plots per sample illustrating the quality of the
532 | normalization:
533 |
534 | ```{r, eval = FALSE}
535 | # we make only the plot for the first input .fcs file
536 | # for illustrative purposes
537 | cytofin_make_plots(
538 | normalization_result = norm_result,
539 | which_rows = 1,
540 | val_path = "none"
541 | )
542 | ```
543 |
544 | ### Batch normalization using internal anchors (cytofin_normalize_nrs)
545 |
546 |
547 | In the event that external anchors are not available, `CytofIn` can use
548 | "internal anchors" within each sample for batch normalization.
549 | Specifically, instead of defining a single external anchor for all the
550 | samples on a given plate like `cytofin_normalize`, the
551 | `cytofin_normalize_nrs` function identifies the most stable channels in
552 | the dataset overall and uses them as internal anchors that are used to
553 | batch normalize all other channels from sample-to-sample. A schematic diagram of how `cytofin_normalize_nrs` works is provided below:
554 |
555 | 
556 |
557 | To identify
558 | the most stable channels in the combined dataset, `CytofIn` uses a
559 | PCA-based non-redundancy score (NRS) as described before (see
560 | [here](https://pubmed.ncbi.nlm.nih.gov/26095251/)). A minimum of 3
561 | channels should be selected to establish an internal reference from
562 | which signals can be calibrated between CyTOF files.
563 |
564 | To do this, `cytofin_normalize_nrs` takes several of the same arguments as
565 | `cytofin_normalize`, defined as above: `metadata_path`, `panel_path`,
566 | `input_data_path`, `output_data_path`, `input_prefix`, `output_prefix`,
567 | `shift_factor`, and `scale_factor`. In addition, it takes the following
568 | optional arguments:
569 |
570 | - `nchannels`: An integer representing the number of "most stable"
571 | (i.e. with the lowest non-redundancy scores) channels that should be
572 | used for batch normalization. Defaults to 3.
573 |
574 | - `make_plot`: A boolean value representing if, in addition to its
575 | other effects, `cytofin_normalize_nrs` should return a plot
576 | illustrating the distribution of non-redundancy scores for each
577 | channel among all .fcs files being batch normalized. Defaults to
578 | FALSE.
579 |
580 | These arguments can be used in a function call as follows:
581 |
582 | ```{r}
583 | # path to save the normalized .fcs files
584 | output_data_path <-
585 | file.path(base_path, "normalization_nrs_results")
586 |
587 | # call function
588 | norm_result_nrs <-
589 | cytofin_normalize_nrs(
590 | metadata_path = metadata_path,
591 | panel_path = panel_path,
592 | input_data_path = input_data_path,
593 | output_data_path = output_data_path,
594 | nchannels = 3,
595 | make_plot = FALSE
596 | )
597 | ```
598 |
599 | Just like `cytofin_normalize` above, `cytofin_normalize_nrs` has several
600 | effects. First, it writes batch-normalized .fcs files to
601 | `output_data_path` and makes a plot depicting sample-wise and
602 | channel-wise non-redundancy scores according to the value of
603 | `make_plot`. In addition, it returns a data.frame that can be passed
604 | into `cytofin_make_plots` to make diagnostic plots regarding the batch
605 | normalization procedure:
606 |
607 | ```{r, eval = FALSE}
608 | # show only 1 set of plots for illustrative purposes
609 | cytofin_make_plots(
610 | normalization_result = norm_result_nrs,
611 | which_rows = 7,
612 | val_path = validation_data_path
613 | )
614 | ```
615 |
616 | # Additional Information
617 |
618 | For questions about the `cytofin` R package, please email
619 | [kardavis\@stanford.edu](mailto:kardavis@stanford.edu) or open a GitHub
620 | issue [here](https://github.com/bennyyclo/Cytofin).
621 |
622 | ```{r}
623 | # session information for rendering this README file
624 | sessionInfo()
625 | ```
626 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output:
3 | github_document:
4 | toc: true
5 | toc_depth: 5
6 | editor_options:
7 | markdown:
8 | wrap: 72
9 | ---
10 |
11 |
12 |
13 | ```{r, include = FALSE}
14 | knitr::opts_chunk$set(
15 | collapse = TRUE,
16 | comment = "#>",
17 | fig.path = "man/figures/README-",
18 | out.width = "100%",
19 | dpi = 300
20 | )
21 | ```
22 |
23 | # cytofin
24 |
25 | CytofIn (**CyTOF** **In**tegration) is an R package for homogenizing and
26 | normalizing heterogeneous [mass cytometry
27 | (CyTOF)](https://pubmed.ncbi.nlm.nih.gov/21551058/) data from diverse
28 | data sources. Specifically, `CytofIn` provides functions that perform the
29 | following tasks:
30 |
31 | - **Dataset homogenization** - CyTOF datasets that were collected
32 | separately may differ in which markers were included in their
33 | antibody panels; in addition, they may use different naming
34 | conventions for their panels' shared markers. Thus, data mining
35 | across multiple CyTOF datasets requires **homogenization,** the
36 | process of aligning each dataset's antibody panels so that they can
37 | be analyzed together. In `CytofIn`, data homogenization (i.e. panel
38 | alignment) is performed with the `cytofin_homogenize` function that
39 | leverages user-provided panel information to combine datasets.
40 | - **Dataset normalization** - Combined analysis of multiple CyTOF
41 | datasets is likely to be confounded by dataset-to-dataset batch
42 | effects due to differences in instrumentation and experimental
43 | protocols between groups. To normalize multiple CyTOF datasets with
44 | respect to these batch effects, `CytofIn` provides 3 functions:
45 | `cytofin_prep_anchors`, `cytofin_normalize`, and
46 | `cytofin_normalize_nrs`.
47 | - **Visualization** - After batch normalization, the means and
48 | standard deviations for each of the input .fcs files (as well as
49 | their associated anchors) can be visualized using the
50 | `cytofin_make_plots` function.
51 |
52 | The general CytofIn workflow unfolds in 3 steps. First, users align the
53 | panels of the CyTOF datasets being integrated using
54 | `cytofin_homogenize()`. Second, users generate reference statistics from
55 | "generalized anchors" identified on each CyTOF plate (see below) using
56 | `cytofin_prep_anchors()`. Finally, users can then normalize/batch
57 | correct the datasets relative to one another using their choice of
58 | `cytofin_normalize()` or `cytofin_normalize_nrs()`, each of which
59 | performs the normalization procedure differently (see below).
60 |
61 | ## Installation
62 |
63 | To install CytofIn, run the following code:
64 |
65 | ```{r, eval = FALSE}
66 | library(devtools)
67 | install_github("bennyyclo/Cytofin")
68 | ```
69 |
70 | To attach the CytofIn package to your current R session, run the
71 | following line:
72 |
73 | ```{r}
74 | library(cytofin)
75 | ```
76 |
77 | ## Data for this vignette
78 |
79 | ### Establishing a root directory
80 |
81 | For the sake of this vignette, we will work within a single folder,
82 | where we will store the input data, the output data, and all
83 | intermediate files from the CytofIn pipeline. We will default to using
84 | the current working directory, but feel free to modify the following
85 | line of code to change which path you want to use.
86 |
87 | ```{r}
88 | # change this path to wherever you want this vignette to find and store
89 | # its input and output files
90 | base_path <- getwd()
91 | ```
92 |
93 | ```{r, include = FALSE}
94 | base_path <- file.path("~", "Desktop", "cytofin_tests")
95 | ```
96 |
97 | ### Downloading the data
98 |
99 | Now that we've identified the root directory we'll use for this
100 | vignette, we will create two folders in which we will store the raw
101 | input data and the validation (bead-normalized) data used in this
102 | vignette:
103 |
104 | ```{r}
105 | dir.create(file.path(base_path, "raw_data"), showWarnings = FALSE)
106 | dir.create(file.path(base_path, "validation_data"), showWarnings = FALSE)
107 | ```
108 |
109 | To fill each of these folders with the .fcs files we're analyzing in
110 | this vignette, please download the raw input files
111 | [here](https://flowrepository.org/id/FR-FCM-Z427) and the validation
112 | files [here](https://flowrepository.org/id/FR-FCM-Z42C) on
113 | [FlowRepository](https://flowrepository.org/). Once the files are
114 | downloaded, unzip them. Finally, move all of the unzipped .fcs files from each repository into the `raw_data` and
115 | `validation_data` folders that we just created, respectively.
116 |
117 | ## Usage
118 |
119 | ### CyTOF data homogenization (cytofin_homogenize)
120 |
121 | Here, the term "homogenization" refers to the process of aligning the
122 | antigen panels of multiple CyTOF experiments by (1) removing all
123 | channels that are not shared across all cohorts and (2) standardizing
124 | the antigen names used to refer to each channel so that existing
125 | analysis tools (like the `flowCore` and `tidyverse` packages) can be
126 | applied in later analytical steps. In CytofIn, dataset homogenization is
127 | performed using the `cytofin_homogenize()` function.
128 |
129 | The `cytofin_homogenize()` function takes several arguments. The first
130 | of these is `metadata_path`, a string that specifies the file path to a
131 | .csv or .xlsx metadata file containing information about each of the
132 | .fcs files being analyzed. Specifically, the metadata file will have one
133 | row for each .fcs file being analyzed and must contain the following
134 | columns (all of which will be converted to character vectors):
135 |
136 | - **filename -** Required. The name of the .fcs file within its local
137 | directory.
138 | - **cohort -** Required. The name of the cohort (i.e. experimental
139 | source) of each .fcs file.
140 | - **plate_number -** Required. The name of the CyTOF plate (e.g.
141 | "plate1", "plate2", etc.) on which the sample corresponding to each
142 | .fcs file was analyzed during data acquisition.
143 | - **patient_id -** Optional. The name of the patient to whom each .fcs
144 | file corresponds.
145 | - **condition -** Optional. The stimulation condition corresponding to
146 | each .fcs file (i.e. "basal", "IL-3", etc.).
147 | - **is_anchor -** Required. A numeric column indicating whether or not
148 | each sample should be used as an "anchor" for the batch correction
149 | procedure (1 if yes; 0 if no). Exactly one anchor should be
150 | identified for each CyTOF plate being analyzed.
151 | - **validation -** Optional. The name of the
152 | [bead-normalized](https://pubmed.ncbi.nlm.nih.gov/23512433/) .fcs
153 | file corresponding to each input file listed in the `filename`
154 | column (per gold-standard batch normalization procedure in CyTOF
155 | batch correction). Most users will ignore this column because
156 | bead-normalized data will not be available, but it can be used to
157 | validate the results of the CytofIn batch normalization algorithms
158 | if bead-normalized data are available.
159 |
160 | Importantly, only the fields marked as "required" are needed for
161 | `cytofin_homogenize()` to work; "NA" can be recorded for any/all
162 | optional columns that don't apply to the experimental design of the
163 | files being analyzed (for example, if no stimulation conditions were
164 | used in the studies being integrated, enter "NA" for each element of the
165 | `condition` column). Alternatively, these columns can be omitted from
166 | the metadata table entirely. The following image provides a visual summary of the metadata table used throughout the `CytofIn` pipeline.
167 |
168 | 
169 |
170 | For the user's convenience, the `cytofin_generate_metadata_template`
171 | function is provided to generate an example metadata .csv file filled
172 | with dummy example data in a location specified by the user:
173 |
174 | ```{r, eval = FALSE}
175 | # specify the path where you'd like to store the template file
176 | my_path <- file.path(base_path, "template_folder")
177 |
178 | # generate the template file, which then can be edited manually
179 | cytofin_generate_metadata_template(template_path = my_path)
180 | ```
181 |
182 | The second argument for `cytofin_homogenize` is `panel_path`, a string
183 | that specifies the file path to a .csv or .xlsx file containing
184 | information about the panel(s) of each of the .fcs files being analyzed.
185 | Each row represents a channel (i.e. a protein measurement) to be
186 | included in the final, homogenized panel. This file must contain the
187 | following columns:
188 |
189 | - **metal_name -** A character vector representing the name of the
190 | metal isotope measured by each channel.
191 | - **antigen_name -** A character vector representing the name of the
192 | antigen associated with a given metal isotope in the consensus panel
193 | (the final antigen name to assign to a given channel during
194 | homogenization).
195 | - **antigen_pattern -** A regular expression used to match antigen
196 | names that may differ slightly across different .fcs files. For
197 | example, the regular expression "(C\|c)(D\|d)45" will detect all of
198 | the following channel names: "cd45", "CD45", "Cd45", or "cD45".
199 | - **lineage -** A numeric vector representing whether or not a marker
200 | is a lineage marker (1 if yes; 0 otherwise).
201 | - **functional -** A numeric vector representing whether or not a
202 | marker is a functional marker (1 if yes; 0 otherwise).
203 | - **general -** A numeric vector representing whether or not a marker
204 | is a "general" (i.e. neither a lineage nor a functional) marker (1
205 | if yes; 0 otherwise).
206 |
207 | The layout of this antigen table (and how it's used during .fcs file homogenization) is displayed in the picture below.
208 |
209 | 
210 |
211 | As with `cytofin_generate_metadata_template`, the `cytofin_generate_panel_template` function is provided to
212 | generate an example panel .csv file filled with dummy example data:
213 |
214 | ```{r, eval = FALSE}
215 | # generate the template file, which then can be edited manually
216 | cytofin_generate_panel_template(template_path = my_path)
217 | ```
218 |
219 | For many users, the most difficult part of filling out the consensus
220 | panel information table will be designing the regular expressions for
221 | the `antigen_pattern` column. However, in most cases the required
222 | regular expressions will be quite simple; for a primer on regular
223 | expressions (and their use in the
224 | [`stringr`](https://stringr.tidyverse.org/) package) written by
225 | [RStudio](https://www.rstudio.com/about/), install the `stringr` package
226 | and read the following vignette:
227 |
228 | ```{r, eval = FALSE}
229 | vignette(topic = "regular-expressions", package = "stringr")
230 | ```
231 |
232 | The next two arguments for `cytofin_homogenize` are `input_data_path`
233 | and `output_data_path`, two strings that indicate which directory input
234 | .fcs files should be read from and which directory homogenized .fcs
235 | files should be written to, respectively. Lastly, the final two
236 | arguments are optional: `prefix` allows the user to specify the prefix
237 | prepended to each input .fcs file name to get the name of the
238 | corresponding output (i.e. homogenized) .fcs file name, and `verbose` is
239 | a boolean value (default = FALSE) specifying if chatty print statements
240 | should be made while the homogenization is performed.
241 |
242 | Using these arguments, `cytofin_homogenize` can homogenize a set of
243 | CyTOF files with distinct antigen naming conventions. Specifically, the
244 | function uses a regular expression search to match each channel's antigen
245 | name to its synonymous entry in the panel and then replaces it with the
246 | standardized name given in the panel.
247 |
248 | Example function call:
249 |
250 | ```{r, warning = FALSE}
251 | # define input paths
252 | metadata_path <-
253 | system.file(
254 | file.path("extdata", "test_metadata_raw.csv"),
255 | package = "cytofin"
256 | )
257 |
258 | panel_path <-
259 | system.file(
260 | file.path("extdata", "test_panel.csv"),
261 | package = "cytofin"
262 | )
263 |
264 | input_data_path <-
265 | file.path(base_path, "raw_data")
266 |
267 | validation_data_path <-
268 | file.path(base_path, "validation_data")
269 |
270 | # define output path
271 | # --Change this line to wherever you want the output files saved!--
272 | output_data_path <- file.path(base_path, "homogenization_output")
273 |
274 | # call homogenization function
275 | cytofin_homogenize(
276 | metadata_path = metadata_path,
277 | panel_path = panel_path,
278 | input_data_path = input_data_path,
279 | output_data_path = output_data_path
280 | )
281 | ```
282 |
283 | This function call will save homogenized .fcs files to the directory
284 | located at `output_data_path`. These files will be different from the
285 | input .fcs files in the `input_data_path` directory in that they will
286 | only contain channels whose antigen names match the `antigen_pattern`
287 | column of the reference panel located at `panel_path`. All other
288 | channels will be removed, and the names of the channels with matches in
289 | `antigen_pattern` will be standardized to the names given in the
290 | `antigen_name` column of the reference panel.
291 |
292 | The input files for this homogenization run were as follows:
293 |
294 | ```{r}
295 | list.files(input_data_path, pattern = ".fcs$")
296 | ```
297 |
298 | ...and the corresponding output files saved in the `output_data_path`
299 | directory are now as follows:
300 |
301 | ```{r}
302 | list.files(output_data_path, pattern = ".fcs$")
303 | ```
304 |
305 | ### CyTOF batch normalization
306 |
307 | After dataset homogenization, **batch correction** (or **batch
308 | normalization**) can be performed across datasets.
309 |
310 | In short, `CytofIn` performs batch normalization through the use of
311 | user-identified **generalized anchors** - which are non-identical references assumed to have low variability across batches - that can be used to estimate batch effects from samples collated from heterogeneous sources. To batch normalize using healthy control samples (one per plate) as generalized anchors (which
312 | is ideal when such samples are available), use `cytofin_normalize`. To
313 | batch normalize using the antigen channels with the lowest variability across samples as generalized anchors (which is ideal when healthy samples are unavailable on all plates being analyzed), use `cytofin_normalize_nrs`.
314 |
315 | The use of both of these functions is detailed below.
316 |
317 | #### Batch normalization using external anchors (cytofin_normalize)
318 |
319 | ##### Overview
320 |
321 | The `cytofin_normalize` function uses user-identified external anchors on each
322 | CyTOF plate being integrated to correct batch effects on a
323 | plate-to-plate basis. One sample on each CyTOF barcoding plate should be
324 | chosen as that plate's external anchor. In general, external anchors
325 | should be chosen based on which samples are the most biologically
326 | similar to one another from plate to plate. For example, if healthy,
327 | non-stimulated samples are included on each CyTOF plate being
328 | integrated, the only expected variability between these samples other
329 | than batch effects would be person-to-person variability. Thus, these
330 | samples are likely to be biologically similar to one another and are
331 | suitable to be chosen as external anchors. Alternatively, if a single
332 | patient or cell line was included on every CyTOF plate being integrated,
333 | the samples corresponding to that patient or cell line on each plate
334 | would also be suitable as external anchor choices.
335 |
336 | Once users have identified 1 external anchor per plate for `CytofIn`
337 | data integration, users must mark its row in the metadata table with a
338 | "1" in the `is_anchor` column (all other samples should be marked with
339 | "0"). `CytofIn` then uses these anchors to define a **universal mean**
340 | and **universal variance** that represent the central tendency and
341 | dispersion, respectively, of the target distribution to which all
342 | samples will be batch corrected. This correction will be performed with
343 | the user's choice from one of five batch correction functions.
344 |
345 | In short, `CytofIn`'s batch normalization procedure using external
346 | anchors has two steps:
347 |
348 | 1. Preparation of external anchors
349 | 2. Application of a transformation function that performs the batch
350 | correction (of which `CytofIn` provides 5 options)
351 |
352 | We detail function calls for each of these steps below.
353 |
354 | ##### Step 1 - Anchor preparation
355 |
356 | The `cytofin_prep_anchors` function concatenates the identified anchor
357 | files and then calculates summary statistics that are used for batch
358 | correction in later steps of the pipeline. First, `CytofIn` calculates
359 | the mean and variance of each channel in the homogenized
360 | dataset across all cells from samples identified as external anchors.
361 | These values represent the overall central tendency and dispersion,
362 | respectively, of each channel among the anchor samples on each CyTOF
363 | plate; thus, we call them the **universal means** and **universal
364 | variances** of the `CytofIn` integration. Accordingly, the universal
365 | mean and universal variance vectors will each have *g* elements, where
366 | *g* is the number of channels in the consensus antigen panel in the
367 | panel information table. The universal mean and universal variance
368 | vectors are used in the `meanshift`, `variance`, `z-score`, and
369 | `beadlike` methods of batch correction (see below).
370 |
371 | In addition, the mean of all of the elements of the universal mean
372 | vector (i.e. the mean of all channel means) and the mean of all of the
373 | elements of the universal variance vector (i.e. the mean of all channel
374 | variances) are calculated. These values represent the central tendency
375 | and dispersion of antigen measurements in general among the healthy
376 | control samples on each CyTOF plate and are thus no longer
377 | channel-specific. Thus, we call them the *bulk mean* and *bulk
378 | variance*, and they are used in the `meanshift_bulk` batch correction
379 | method implemented in `cytofin_normalize`.
380 |
381 | To calculate these values, we use the `cytofin_prep_anchors` function.
382 | `cytofin_prep_anchors` returns the universal mean vector, universal
383 | variance vector, bulk mean, and bulk variance as a `list()`. In
384 | addition, users are given an option to save these statistics as an .rds
385 | file in a specified directory in order to avoid performing redundant
386 | calculations in future analyses.
387 |
388 | Specifically, `cytofin_prep_anchors` takes the following 4 arguments (the first 3 are required; `output_path` has a default):
389 |
390 | - `metadata_path`: A connection leading to an .xlsx or .csv file
391 | containing a metadata table with information about each file to be
392 | analyzed. This file should be identical to that used for
393 | `cytofin_homogenize`.
394 | - `panel_path`: A connection leading to an .xlsx or .csv file
395 | containing information about the standardized antigen panel in the
396 | homogenized dataset. This file should be identical to that used for
397 | `cytofin_homogenize`.
398 | - `input_data_path`: A connection to a directory containing the input
399 | .FCS files from which to draw summary statistics
400 | - `output_path`: A connection to a directory where the output .rds and
401 | .FCS files will be saved. The default is "none", in which case no
402 | output files will be stored (and the only effect of the function
403 | will be to return the calculated statistics as a `list()`).
404 |
405 | In addition, `cytofin_prep_anchors` also takes 2 optional arguments
406 | relating to the conventional arcsinh transformation performed on the raw
407 | ion counts of the input data. These optional arguments are as follows:
408 |
409 | - `shift_factor`: The scalar value `a` in the following equation used
410 | to transform CyTOF raw data ion counts using the hyperbolic arcsinh
411 | function: `new_x <- asinh(a + b * x)`. Defaults to 0.
412 |
413 | - `scale_factor`: The scalar value `b` in the following equation used
414 | to transform CyTOF raw data ion counts using the hyperbolic arcsinh
415 | function: `new_x <- asinh(a + b * x)`. Defaults to 0.2.
416 |
417 | Finally, here is an example function call of `cytofin_prep_anchors`:
418 |
419 | ```{r}
420 | input_data_path <- file.path(base_path, "homogenization_output")
421 | output_path <- file.path(base_path, "anchor_prep_output")
422 |
423 | anchor_statistics <-
424 | cytofin_prep_anchors(
425 | metadata_path = metadata_path,
426 | panel_path = panel_path,
427 | input_data_path = input_data_path,
428 | output_path = output_path
429 | )
430 |
431 | print(anchor_statistics)
432 | ```
433 |
434 | As shown above, the returned value is a list with 4 items in it: the
435 | universal variance vector (`universal_var`), the universal mean vector
436 | (`universal_mean`), the bulk variance (`bulk_var`) and the bulk mean
437 | (`bulk_mean`). Note that the elements of `universal_var` and
438 | `universal_mean` are named with their corresponding metal names (not
439 | antigen names), as this interfaces a bit more conveniently with the
440 | `flowCore` functions that `CytofIn` uses under-the-hood.
441 |
442 | Importantly, you only need to use `cytofin_prep_anchors` if you plan to
443 | batch normalize your .fcs files using external anchors identified on
444 | each plate (using `cytofin_normalize`). If you plan to batch normalize
445 | your .fcs files using non-redundancy scores from each sample's most
446 | stable channels (using `cytofin_normalize_nrs`), you do not need to run
447 | `cytofin_prep_anchors` first.
448 |
449 | ##### Step 2 - Batch normalization
450 |
451 | After the anchors' summary statistics are computed, batch correction
452 | using external anchors can be performed using
453 | `cytofin_normalize`. This function can perform batch correction using 5
454 | different normalization functions (which we call "modes"). Specifically, the options are called the "meanshift", "meanshift_bulk", "variance", "z-score", and "beadlike" normalization functions. Which of
455 | these is most applicable to a given analysis will differ from user to
456 | user. We recommend that users try several of them and then manually
457 | inspect/visualize the batch-corrected data in order to determine which
458 | method they prefer.
459 |
460 | To perform batch normalization using external anchors identified on each
461 | plate, use `cytofin_normalize`. This batch normalization strategy
462 | assumes that the anchors on each plate are relatively similar to one
463 | another, and it uses this similarity to adjust the marker expression
464 | measurements on each plate based on how much each plate's anchor differs
465 | from the other anchors. The `cytofin_normalize` function takes several
466 | required arguments:
467 |
468 | - `metadata_path`: A connection leading to an .xlsx or .csv file
469 | containing a metadata table with information about each file to be
470 | analyzed. This file should be identical to that used for
471 | `cytofin_homogenize`.
472 | - `panel_path`: A connection leading to an .xlsx or .csv file
473 | containing information about the standardized antigen panel in the
474 | homogenized dataset. This file should be identical to that used for
475 | `cytofin_homogenize`.
476 | - `anchor_statistics`: Either a list of numeric values produced by the
477 | `cytofin_prep_anchors` function or a connection leading to an .rds
478 | object containing anchor statistics.
479 | - `input_data_path`: A connection to a directory containing the input
480 | .fcs files to be batch normalized. In most cases, this will be the
481 | directory to which the output .FCS files from `cytofin_homogenize`
482 | were written.
483 | - `output_data_path`: A connection to a directory where the output
484 | (i.e. batch normalized) .FCS files will be written.
485 | - `mode`: A string indicating which transformation function should be
486 | used for batch normalization ("meanshift", "meanshift_bulk",
487 | "variance", "z-score", or "beadlike").
488 |
489 | In addition to these required arguments, `cytofin_normalize` takes
490 | several optional arguments:
491 |
492 | - `input_prefix`: The string that was prepended to the name of the raw
493 | input .fcs files of `cytofin_homogenize` to create their
494 | corresponding output file names. Defaults to "homogenized\_".
495 |
496 | - `output_prefix`: The string to be prepended to the name of each input
497 | .fcs file to create the name of the corresponding output file
498 | (post-normalization). Defaults to "normalized\_".
499 |
500 | - `shift_factor` and `scale_factor`: The scalar values *a* and *b*,
501 | respectively, to be used in the hyperbolic arc-sine function used to
502 | transform CyTOF ion counts according to the following equation:
503 | `new_x <- asinh(a + b * x)`. `shift_factor` defaults to 0 and
504 | `scale_factor` defaults to 0.2, which are customary values used by
505 | most scientists in the CyTOF community.
506 |
507 | Using these arguments, a call to `cytofin_normalize` will perform the
508 | batch correction and save the output (i.e. batch normalized) .fcs files
509 | to the directory specified by `output_data_path`. An example function
510 | call is given here:
511 |
512 | ```{r}
513 | output_data_path <-
514 | file.path(base_path, "normalization_results")
515 |
516 | norm_result <-
517 | cytofin_normalize(
518 | metadata_path = metadata_path,
519 | panel_path = panel_path,
520 | anchor_statistics = anchor_statistics,
521 | input_data_path = input_data_path,
522 | output_data_path = output_data_path,
523 | mode = "meanshift"
524 | )
525 | ```
526 |
527 | When this function is called, it has two effects. The first is to save
528 | the batch-normalized output .fcs files to the `output_data_path`
529 | directory. The second is to return a data.frame that stores mean and
530 | variance information about each input file (as well as its associated
531 | anchor) both before and after normalization. This data.frame can be
532 | passed directly into the `cytofin_make_plots` function to return 8
533 | diagnostic plots per sample illustrating the quality of the
534 | normalization:
535 |
536 | ```{r}
537 | # we make only the plot for the first input .fcs file
538 | # for illustrative purposes
539 | cytofin_make_plots(
540 | normalization_result = norm_result,
541 | which_rows = 1,
542 | val_path = "none"
543 | )
544 | ```
545 |
546 | #### Batch normalization using internal anchors (cytofin_normalize_nrs)
547 |
548 |
549 | In the event that external anchors are not available, `CytofIn` can use
550 | "internal anchors" within each sample for batch normalization.
551 | Specifically, instead of defining a single external anchor for all the
552 | samples on a given plate like `cytofin_normalize`, the
553 | `cytofin_normalize_nrs` function identifies the most stable channels in
554 | the dataset overall and uses them as internal anchors that are used to
555 | batch normalize all other channels from sample-to-sample. A schematic diagram of how `cytofin_normalize_nrs` works is provided below:
556 |
557 | 
558 |
559 | In words, to identify
560 | the most stable channels in the combined dataset, `CytofIn` uses a
561 | PCA-based non-redundancy score (NRS) as described before (see
562 | [here](https://pubmed.ncbi.nlm.nih.gov/26095251/)). A minimum of 3
563 | channels should be selected to establish an internal reference from
564 | which signals can be calibrated between CyTOF files.
565 |
566 | To do this, `cytofin_normalize_nrs` takes several of the same arguments as
567 | `cytofin_normalize`, defined as above: `metadata_path`, `panel_path`,
568 | `input_data_path`, `output_data_path`, `input_prefix`, `output_prefix`,
569 | `shift_factor`, and `scale_factor`. In addition, it takes the following
570 | optional arguments:
571 |
572 | - `nchannels`: An integer representing the number of "most stable"
573 | (i.e. with the lowest non-redundancy scores) channels that should be
574 | used for batch normalization. Defaults to 3.
575 |
576 | - `make_plot`: A boolean value representing if, in addition to its
577 | other effects, `cytofin_normalize_nrs` should return a plot
578 | illustrating the distribution of non-redundancy scores for each
579 | channel among all .fcs files being batch normalized. Defaults to
580 | FALSE.
581 |
582 | These arguments can be used in a function call as follows:
583 |
584 | ```{r}
585 | # path to save the normalized .fcs files
586 | output_data_path <-
587 | file.path(base_path, "normalization_nrs_results")
588 |
589 | # call function
590 | norm_result_nrs <-
591 | cytofin_normalize_nrs(
592 | metadata_path = metadata_path,
593 | panel_path = panel_path,
594 | input_data_path = input_data_path,
595 | output_data_path = output_data_path,
596 | nchannels = 3,
597 | make_plot = FALSE
598 | )
599 | ```
600 |
601 | Just like `cytofin_normalize` above, `cytofin_normalize_nrs` has several
602 | effects. First, it writes batch-normalized .fcs files to
603 | `output_data_path` and makes a plot depicting sample-wise and
604 | channel-wise non-redundancy scores according to the value of
605 | `make_plot`. In addition, it returns a data.frame that can be passed
606 | into `cytofin_make_plots` to make diagnostic plots regarding the batch
607 | normalization procedure:
608 |
609 | ```{r}
610 | # show only 1 set of plots for illustrative purposes
611 | cytofin_make_plots(
612 | normalization_result = norm_result_nrs,
613 | which_rows = 7,
614 | val_path = validation_data_path
615 | )
616 | ```
617 |
618 | ## Additional Information
619 |
620 | For questions about the `cytofin` R package, please email
621 | [kardavis\@stanford.edu](mailto:kardavis@stanford.edu) or open a GitHub
622 | issue [here](https://github.com/bennyyclo/Cytofin).
623 |
624 | ```{r}
625 | # session information for rendering this README file
626 | sessionInfo()
627 | ```
628 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | - [cytofin](#cytofin)
3 | - [Installation](#installation)
4 | - [Data for this vignette](#data-for-this-vignette)
5 | - [Establishing a root
6 | directory](#establishing-a-root-directory)
7 | - [Downloading the data](#downloading-the-data)
8 | - [Usage](#usage)
9 | - [CyTOF data homogenization
10 | (cytofin\_homogenize)](#cytof-data-homogenization-cytofin_homogenize)
11 | - [CyTOF batch normalization](#cytof-batch-normalization)
12 | - [Batch normalization using external anchors
13 | (cytofin\_normalize)](#batch-normalization-using-external-anchors-cytofin_normalize)
14 | - [Overview](#overview)
15 | - [Step 1 - Anchor
16 | preparation](#step-1---anchor-preparation)
17 | - [Step 2 - Batch
18 | normalization](#step-2---batch-normalization)
19 | - [Batch normalization using internal anchors
20 | (cytofin\_normalize\_nrs)](#batch-normalization-using-internal-anchors-cytofin_normalize_nrs)
21 | - [Additional Information](#additional-information)
22 |
23 |
24 |
25 | # cytofin
26 |
27 | CytofIn (**CyTOF** **In**tegration) is an R package for homogenizing and
28 | normalizing heterogeneous [mass cytometry
29 | (CyTOF)](https://pubmed.ncbi.nlm.nih.gov/21551058/) data from diverse
30 | data sources. Specifically, `CytofIn` provides functions that perform
31 | the following tasks:
32 |
33 | - **Dataset homogenization** - CyTOF datasets that were collected
34 | separately may differ in which markers were included in their
35 | antibody panels; in addition, they may use different naming
36 | conventions for their panels’ shared markers. Thus, data mining
37 | across multiple CyTOF datasets requires **homogenization,** the
38 | process of aligning each dataset’s antibody panels so that they can
39 | be analyzed together. In `CytofIn`, data homogenization (i.e. panel
40 | alignment) is performed with the `cytofin_homogenize` function that
41 | leverages user-provided panel information to combine datasets.
42 | - **Dataset normalization** - Combined analysis of multiple CyTOF
43 | datasets is likely to be confounded by dataset-to-dataset batch
44 | effects due to differences in instrumentation and experimental
45 | protocols between groups. To normalize multiple CyTOF datasets with
46 | respect to these batch effects, `CytofIn` provides 3 functions:
47 | `cytofin_prep_anchors`, `cytofin_normalize`, and
48 | `cytofin_normalize_nrs`.
49 | - **Visualization** - After batch normalization, the means and
50 | standard deviations for each of the input .fcs files (as well as
51 | their associated anchors) can be visualized using the
52 | `cytofin_make_plots` function.
53 |
54 | The general CytofIn workflow unfolds in 3 steps. First, users align the
55 | panels of the CyTOF datasets being integrated using
56 | `cytofin_homogenize()`. Second, users generate reference statistics from
57 | “generalized anchors” identified on each CyTOF plate (see below) using
58 | `cytofin_prep_anchors()`. Finally, users can then normalize/batch
59 | correct the datasets relative to one another using their choice of
60 | `cytofin_normalize()` or `cytofin_normalize_nrs()`, each of which
61 | performs the normalization procedure differently (see below).
62 |
63 | ## Installation
64 |
65 | To install CytofIn, run the following code:
66 |
67 | ``` r
68 | library(devtools)
69 | install_github("bennyyclo/Cytofin")
70 | ```
71 | Please also ensure that the `flowCore` package is installed:
72 |
73 | ``` r
74 | if (!requireNamespace("BiocManager", quietly = TRUE))
75 | install.packages("BiocManager")
76 |
77 | BiocManager::install("flowCore")
78 | ```
79 |
80 | To attach the CytofIn package to your current R session, run the
81 | following line:
82 |
83 | ``` r
84 | library(cytofin)
85 | ```
86 |
87 | ## Data for this vignette
88 |
89 | ### Establishing a root directory
90 |
91 | For the sake of this vignette, we will work within a single folder,
92 | where we will store the input data, the output data, and all
93 | intermediate files from the CytofIn pipeline. We will default to using
94 | the current working directory, but feel free to modify the following
95 | line of code to change which path you want to use.
96 |
97 | ``` r
98 | # change this path to wherever you want this vignette to find and store
99 | # its input and output files
100 | base_path <- getwd()
101 | ```
102 |
103 | ### Downloading the data
104 |
105 | Now that we’ve identified the root directory we’ll use for this
106 | vignette, we will create two folders in which we will store the raw
107 | input data and the validation (bead-normalized) data used in this
108 | vignette:
109 |
110 | ``` r
111 | dir.create(file.path(base_path, "raw_data"), showWarnings = FALSE)
112 | dir.create(file.path(base_path, "validation_data"), showWarnings = FALSE)
113 | ```
114 |
115 | To fill each of these folders with the .fcs files we’re analyzing in
116 | this vignette, please download the raw input files
117 | [here](https://flowrepository.org/id/FR-FCM-Z427) and the validation
118 | files [here](https://flowrepository.org/id/FR-FCM-Z42C) on
119 | [FlowRepository](https://flowrepository.org/). Once the files are
120 | downloaded, unzip them. Finally, move all of the unzipped .fcs files
121 | from each repository into the `raw_data` and `validation_data` folders
122 | that we just created, respectively.
123 |
124 | ## Usage
125 |
126 | ### CyTOF data homogenization (cytofin\_homogenize)
127 |
128 | Here, the term “homogenization” refers to the process of aligning the
129 | antigen panels of multiple CyTOF experiments by (1) removing all
130 | channels that are not shared across all cohorts and (2) standardizing
131 | the antigen names used to refer to each channel so that existing
132 | analysis tools (like the `flowCore` and `tidyverse` packages) can be
133 | applied in later analytical steps. In CytofIn, dataset homogenization is
134 | performed using the `cytofin_homogenize()` function.
135 |
136 | The `cytofin_homogenize()` function takes several arguments. The first
137 | of these is `metadata_path`, a string that specifies the file path to a
138 | .csv or .xlsx metadata file containing information about each of the
139 | .fcs files being analyzed. Specifically, the metadata file will have one
140 | row for each .fcs file being analyzed and must contain the following
141 | columns (all of which will be converted to character vectors):
142 |
143 | - **filename -** Required. The name of the .fcs file within its local
144 | directory.
145 | - **cohort -** Required. The name of the cohort (i.e. experimental
146 | source) of each .fcs file.
147 | - **plate\_number -** Required. The name of the CyTOF plate (e.g.
148 | “plate1”, “plate2”, etc.) on which the sample corresponding to each
149 | .fcs file was analyzed during data acquisition.
150 | - **patient\_id -** Optional. The name of the patient to whom each
151 | .fcs file corresponds.
152 | - **condition -** Optional. The stimulation condition corresponding to
153 | each .fcs file (i.e. “basal”, “IL-3”, etc.).
154 | - **is\_anchor -** Required. A numeric column indicating whether or
155 | not each sample should be used as an “anchor” for the batch
156 | correction procedure (1 if yes; 0 if no). Exactly one anchor should
157 | be identified for each CyTOF plate being analyzed.
158 | - **validation -** Optional. The name of the
159 | [bead-normalized](https://pubmed.ncbi.nlm.nih.gov/23512433/) .fcs
160 | file corresponding to each input file listed in the `filename`
161 | column (per gold-standard batch normalization procedure in CyTOF
162 | batch correction). Most users will ignore this column because
163 | bead-normalized data will not be available, but it can be used to
164 | validate the results of the CytofIn batch normalization algorithms
165 | if bead-normalized data are available.
166 |
167 | Importantly, only the fields marked as “required” are needed for
168 | `cytofin_homogenize()` to work; “NA” can be recorded for any/all
169 | optional columns that don’t apply to the experimental design of the
170 | files being analyzed (for example, if no stimulation conditions were
171 | used in the studies being integrated, enter “NA” for each element of the
172 | `condition` column). Alternatively, these columns can be omitted from
173 | the metadata table entirely. The following image provides a visual
174 | summary of the metadata table used throughout the `CytofIn` pipeline.
175 |
176 | 
177 |
178 | For the user’s convenience, the `cytofin_generate_metadata_template`
179 | function is provided to generate an example metadata .csv file filled
180 | with dummy example data in a location specified by the user:
181 |
182 | ``` r
183 | # specify the path where you'd like to store the template file
184 | my_path <- file.path(base_path, "template_folder")
185 |
186 | # generate the template file, which then can be edited manually
187 | cytofin_generate_metadata_template(template_path = my_path)
188 | ```
189 |
190 | The second argument for `cytofin_homogenize` is `panel_path`, a string
191 | that specifies the file path to a .csv or .xlsx file containing
192 | information about the panel(s) of each of the .fcs files being analyzed.
193 | Each row represents a channel (i.e. a protein measurement) to be
194 | included in the final, homogenized panel. This file must contain the
195 | following columns:
196 |
197 | - **metal\_name -** A character vector representing the name of the
198 | metal isotope measured by each channel.
199 | - **antigen\_name -** A character vector representing the name of the
200 | antigen associated with a given metal isotope in the consensus panel
201 | (the final antigen name to assign to a given channel during
202 | homogenization).
203 | - **antigen\_pattern -** A regular expression used to match antigen
204 | names that may differ slightly across different .fcs files. For
205 | example, the regular expression “(C\|c)(D\|d)45” will detect all of
206 | the following channel names: “cd45”, “CD45”, “Cd45”, or “cD45”.
207 | - **lineage -** A numeric vector representing whether or not a marker
208 | is a lineage marker (1 if yes; 0 otherwise).
209 | - **functional -** A numeric vector representing whether or not a
210 | marker is a functional marker (1 if yes; 0 otherwise).
211 | - **general -** A numeric vector representing whether or not a marker
212 | is a “general” (i.e. neither a lineage nor a functional) marker (1
213 | if yes; 0 otherwise).
214 |
215 | The layout of this antigen table (and how it’s used during .fcs file
216 | homogenization) is displayed in the picture below.
217 |
218 | 
219 |
220 | As in `cytofin_generate_metadata_template`, the
221 | `cytofin_generate_panel_template` function is provided to generate an
222 | example panel .csv file filled with dummy example data:
223 |
224 | ``` r
225 | # generate the template file, which then can be edited manually
226 | cytofin_generate_panel_template(template_path = my_path)
227 | ```
228 |
229 | For many users, the most difficult part of filling out the consensus
230 | panel information table will be designing the regular expressions for
231 | the `antigen_pattern` column. However, in most cases the required
232 | regular expressions will be quite simple; for a primer on regular
233 | expressions (and their use in the
234 | [`stringr`](https://stringr.tidyverse.org/) package) written by
235 | [RStudio](https://www.rstudio.com/about/), install the `stringr` package
236 | and read the following vignette:
237 |
238 | ``` r
239 | vignette(topic = "regular-expressions", package = "stringr")
240 | ```
241 |
242 | The next two arguments for `cytofin_homogenize` are `input_data_path`
243 | and `output_data_path`, two strings that indicate which directory input
244 | .fcs files should be read from and which directory homogenized .fcs
245 | files should be written to, respectively. Lastly, the final two
246 | arguments are optional: `prefix` allows the user to specify the prefix
247 | appended to each input .fcs file name to get the name of the
248 | corresponding output (i.e. homogenized) .fcs file name, and `verbose` is
249 | a boolean value (default = FALSE) specifying if chatty print statements
250 | should be made while the homogenization is performed.
251 |
252 | Using these arguments, `cytofin_homogenize` can homogenize a set of
253 | CyTOF files with distinct antigen naming conventions. Specifically, the
254 | program performs a regular expression search to match the synonymous
255 | term in the panel and correct the antigen name with standardized names
256 | in the panel.
257 |
258 | Example function call:
259 |
260 | ``` r
261 | # define input paths
262 | metadata_path <-
263 | system.file(
264 | file.path("extdata", "test_metadata_raw.csv"),
265 | package = "cytofin"
266 | )
267 |
268 | panel_path <-
269 | system.file(
270 | file.path("extdata", "test_panel.csv"),
271 | package = "cytofin"
272 | )
273 |
274 | input_data_path <-
275 | file.path(base_path, "raw_data")
276 |
277 | validation_data_path <-
278 | file.path(base_path, "validation_data")
279 |
280 | # define output path
281 | # --Change this line to wherever you want the output files saved!--
282 | output_data_path <- file.path(base_path, "homogenization_output")
283 |
284 | # call homogenization function
285 | cytofin_homogenize(
286 | metadata_path = metadata_path,
287 | panel_path = panel_path,
288 | input_data_path = input_data_path,
289 | output_data_path = output_data_path
290 | )
291 | ```
292 |
293 | This function call will save homogenized .fcs files to the directory
294 | located at `output_data_path`. These files will be different from the
295 | input .fcs files in the `input_data_path` directory in that they will
296 | only contain channels whose antigen names match the `antigen_pattern`
297 | column of the reference panel located at `panel_path`. All other
298 | channels will be removed, and the names of the channels with matches in
299 | `antigen_pattern` will be standardized to the names given in the
300 | `antigen_name` column of the reference panel.
301 |
302 | The input files for this homogenization run were as follows:
303 |
304 | ``` r
305 | list.files(input_data_path, pattern = ".fcs$")
306 | #> [1] "ALL05v2_Plate2_healthy basal1.fcs" "ALL05v2_Plate2_UPN94 das.fcs"
307 | #> [3] "ALL08_Plate8_Healthy03 basal.fcs" "ALL08_Plate8_UPN26 basal.fcs"
308 | #> [5] "CRLF2_Plate1_Healthy 04 BCR.fcs" "CRLF2_Plate1_UPN53 das + TSLP.fcs"
309 | #> [7] "MS_Plate5_Healthy BM.fcs" "MS_Plate5_SU978 Basal.fcs"
310 | #> [9] "SJ_Plate2_Healthy_BM.fcs" "SJ_Plate2_TB010950_Basal.fcs"
311 | ```
312 |
313 | …and the corresponding output files saved in the `output_data_path`
314 | directory are now as follows:
315 |
316 | ``` r
317 | list.files(output_data_path, pattern = ".fcs$")
318 | #> [1] "homogenized_ALL05v2_Plate2_healthy basal1.fcs"
319 | #> [2] "homogenized_ALL05v2_Plate2_UPN94 das.fcs"
320 | #> [3] "homogenized_ALL08_Plate8_Healthy03 basal.fcs"
321 | #> [4] "homogenized_ALL08_Plate8_UPN26 basal.fcs"
322 | #> [5] "homogenized_CRLF2_Plate1_Healthy 04 BCR.fcs"
323 | #> [6] "homogenized_CRLF2_Plate1_UPN53 das + TSLP.fcs"
324 | #> [7] "homogenized_MS_Plate5_Healthy BM.fcs"
325 | #> [8] "homogenized_MS_Plate5_SU978 Basal.fcs"
326 | #> [9] "homogenized_SJ_Plate2_Healthy_BM.fcs"
327 | #> [10] "homogenized_SJ_Plate2_TB010950_Basal.fcs"
328 | ```
329 |
330 | ### CyTOF batch normalization
331 |
332 | After dataset homogenization, **batch correction** (or **batch
333 | normalization**) can be performed across datasets.
334 |
335 | In short, `CytofIn` performs batch normalization through the use of
336 | user-identified **generalized anchors** - which are non-identical
337 | references assumed to have low variability across batches - that can be
338 | used to estimate batch effects from samples collated from heterogeneous
339 | sources. To batch normalize using healthy control samples (one per
340 | plate) as generalized anchors (which is ideal when such samples are
341 | available), use `cytofin_normalize`. To batch normalize using the
342 | antigen channels with the lowest variability across samples as
343 | generalized anchors (which is ideal when healthy samples are unavailable
344 | on all plates being analyzed), use `cytofin_normalize_nrs`.
345 |
346 | The use of both of these functions is detailed below.
347 |
348 | #### Batch normalization using external anchors (cytofin\_normalize)
349 |
350 | ##### Overview
351 |
352 | The `cytofin_normalize` uses user-identified external anchors on each
353 | CyTOF plate being integrated to correct batch effects on a
354 | plate-to-plate basis. One sample on each CyTOF barcoding plate should be
355 | chosen as that plate’s external anchor. In general, external anchors
356 | should be chosen based on which samples are the most biologically
357 | similar to one another from plate to plate. For example, if healthy,
358 | non-stimulated samples are included on each CyTOF plate being
359 | integrated, the only expected variability between these samples other
360 | than batch effects would be person-to-person variability. Thus, these
361 | samples are likely to be biologically similar to one another and are
362 | suitable to be chosen as external anchors. Alternatively, if a single
363 | patient or cell line was included on every CyTOF plate being integrated,
364 | the samples corresponding to that patient or cell line on each plate
365 | would also be suitable as external anchor choices.
366 |
367 | Once users have identified 1 external anchor per plate for `CytofIn`
368 | data integration, users must mark its row in the metadata table with a
369 | “1” in the `is_anchor` column (all other samples should be marked with
370 | “0”). `CytofIn` then uses these anchors to define a **universal mean**
371 | and **universal variance** that represent the central tendency and
372 | dispersion, respectively, of the target distribution to which all
373 | samples will be batch corrected. This correction will be performed with
374 | the user’s choice from one of five batch correction functions.
375 |
376 | In short, `CytofIn`’s batch normalization procedure using external
377 | anchors has two steps:
378 |
379 | 1. Preparation of external anchors
380 | 2. Application of a transformation function that performs the batch
381 | correction (of which `CytofIn` provides 5 options)
382 |
383 | We detail function calls for each of these steps below.
384 |
385 | ##### Step 1 - Anchor preparation
386 |
387 | The `cytofin_prep_anchors` function concatenates the identified anchor
388 | files and then calculates summary statistics that are used for batch
389 | correction in later steps of the pipeline. First, `CytofIn` calculates
390 | the mean and standard deviation of each channel in the homogenized
391 | dataset across all cells from samples identified as external anchors.
392 | These values represent the overall central tendency and dispersion,
393 | respectively, of each channel among the anchor samples on each CyTOF
394 | plate; thus, we call them the **universal means** and **universal
395 | variances** of the `CytofIn` integration. Accordingly, the universal
396 | mean and universal variance vectors will each have *g* elements, where
397 | *g* is the number of channels in the consensus antigen panel in the
398 | panel information table. The universal mean and universal variance
399 | vectors are used in the `meanshift`, `variance`, `z-score`, and
400 | `beadlike` methods of batch correction (see below).
401 |
402 | In addition, the mean of all of the elements of the universal mean
403 | vector (i.e. the mean of all channel means) and the mean of all of the
404 | elements of the universal variance vector (i.e. the mean of all channel
405 | variances) are calculated. These values represent the central tendency
406 | and dispersion of antigen measurements in general among the healthy
407 | control samples on each CyTOF plate and are thus no longer
408 | channel-specific. Thus, we call them the *bulk mean* and *bulk
409 | variance*, and they are used in the `meanshift_bulk` batch correction
410 | method implemented in `cytofin_normalize`.
411 |
412 | To calculate these values, we use the `cytofin_prep_anchors` function.
413 | `cytofin_prep_anchors` returns the universal mean vector, universal
414 | variance vector, bulk mean, and bulk variance as a `list()`. In
415 | addition, users are given an option to save these statistics as an .rds
416 | file in a specified directory in order to avoid performing redundant
417 | calculations in future analyses.
418 |
419 | Specifically, `cytofin_prep_anchors` takes 4 required arguments:
420 |
421 | - `metadata_path`: The file path to an .xlsx or .csv file
422 | containing a metadata table with information about each file to be
423 | analyzed. This file should be identical to that used for
424 | `cytofin_homogenize`.
425 | - `panel_path`: The file path to an .xlsx or .csv file
426 | containing information about the standardized antigen panel in the
427 | homogenized dataset. This file should be identical to that used for
428 | `cytofin_homogenize`.
429 | - `input_data_path`: A directory containing the input
430 | .FCS files from which to draw summary statistics
431 | - `output_path`: A directory where the output .rds and
432 | .FCS files will be saved. The default is “none”, in which case no
433 | output files will be stored (and the only effect of the function
434 | will be to return the calculated statistics as a `list()`).
435 |
436 | In addition, `cytofin_prep_anchors` also takes 2 optional arguments
437 | relating to the conventional arcsinh transformation performed on the raw
438 | ion counts of the input data. These optional arguments are as follows:
439 |
440 | - `shift_factor`: The scalar value `a` in the following equation used
441 | to transform CyTOF raw data ion counts using the hyperbolic arcsinh
442 | function: `new_x <- asinh(a + b * x)`. Defaults to 0.
443 |
444 | - `scale_factor`: The scalar value `b` in the following equation used
445 | to transform CyTOF raw data ion counts using the hyperbolic arcsinh
446 | function: `new_x <- asinh(a + b * x)`. Defaults to 0.2.
447 |
448 | Finally, here is an example function call of `cytofin_prep_anchors`:
449 |
450 | ``` r
451 | input_data_path <- file.path(base_path, "homogenization_output")
452 | output_path <- file.path(base_path, "anchor_prep_output")
453 |
454 | anchor_statistics <-
455 | cytofin_prep_anchors(
456 | metadata_path = metadata_path,
457 | panel_path = panel_path,
458 | input_data_path = input_data_path,
459 | output_path = output_path
460 | )
461 |
462 | print(anchor_statistics)
463 | #> $universal_var
464 | #> Time Event_length (Pd102)Di (Pd104)Di (Pd105)Di (Pd106)Di
465 | #> 1.28235792 0.16399756 6.78770451 0.89290897 5.74351522 4.00916670
466 | #> (Pd108)Di (Pd110)Di (In113)Di (In115)Di (La139)Di (Pr141)Di
467 | #> 6.47944462 6.14839951 3.14291787 3.69776978 0.31651260 0.20067263
468 | #> (Nd142)Di (Nd143)Di (Nd144)Di (Nd145)Di (Nd146)Di (Sm147)Di
469 | #> 0.88280840 0.50837979 0.18512779 0.27893442 0.79089548 1.30174061
470 | #> (Nd148)Di (Sm149)Di (Nd150)Di (Sm152)Di (Eu153)Di (Sm154)Di
471 | #> 1.53148051 0.24234410 0.19237185 0.78984151 3.36668746 0.64687396
472 | #> (Gd156)Di (Gd158)Di (Gd160)Di (Dy161)Di (Dy162)Di (Dy163)Di
473 | #> 0.62963342 0.21865740 2.88801028 0.07940630 0.12194444 0.07128214
474 | #> (Dy164)Di (Ho165)Di (Er166)Di (Er167)Di (Er168)Di (Er170)Di
475 | #> 0.44285804 1.04235848 0.28206380 4.31831331 3.59089444 3.35406088
476 | #> (Yb171)Di (Yb172)Di (Yb173)Di (Yb174)Di (Lu175)Di (Yb176)Di
477 | #> 1.95310084 0.67905696 0.13911985 6.12832312 1.77734024 0.53625671
478 | #> (Ir191)Di (Ir193)Di
479 | #> 3.21574811 3.27089639
480 | #>
481 | #> $universal_mean
482 | #> Time Event_length (Pd102)Di (Pd104)Di (Pd105)Di (Pd106)Di
483 | #> 14.50995327 2.30820954 3.48055714 1.06062913 4.08199057 4.77092034
484 | #> (Pd108)Di (Pd110)Di (In113)Di (In115)Di (La139)Di (Pr141)Di
485 | #> 2.69248853 3.31279576 1.34656332 2.31588156 0.35046633 0.19319399
486 | #> (Nd142)Di (Nd143)Di (Nd144)Di (Nd145)Di (Nd146)Di (Sm147)Di
487 | #> 0.57791130 0.34730008 0.20086489 0.34646560 0.61382685 0.56851774
488 | #> (Nd148)Di (Sm149)Di (Nd150)Di (Sm152)Di (Eu153)Di (Sm154)Di
489 | #> 1.13302732 0.15299272 0.19208744 0.43406391 2.13362865 0.45270859
490 | #> (Gd156)Di (Gd158)Di (Gd160)Di (Dy161)Di (Dy162)Di (Dy163)Di
491 | #> 0.34711746 0.17472376 1.30261426 0.11212254 0.13257570 0.07266354
492 | #> (Dy164)Di (Ho165)Di (Er166)Di (Er167)Di (Er168)Di (Er170)Di
493 | #> 0.22465161 0.48758658 0.28522175 2.63843957 2.43044297 0.80540655
494 | #> (Yb171)Di (Yb172)Di (Yb173)Di (Yb174)Di (Lu175)Di (Yb176)Di
495 | #> 1.30095098 0.65077576 0.15830507 2.43474419 1.14821570 0.54578885
496 | #> (Ir191)Di (Ir193)Di
497 | #> 3.80031272 4.48577210
498 | #>
499 | #> $bulk_var
500 | #> [1] 1.467075
501 | #>
502 | #> $bulk_mean
503 | #> [1] 0.969387
504 | ```
505 |
506 | As shown above, the returned value is a list with 4 items in it: the
507 | universal variance vector (`universal_var`), the universal mean vector
508 | (`universal_mean`), the bulk variance (`bulk_var`) and the bulk mean
509 | (`bulk_mean`). Note that the elements of `universal_var` and
510 | `universal_mean` are named with their corresponding metal names (not
511 | antigen names), as this interfaces a bit more conveniently with the
512 | `flowCore` functions that `CytofIn` uses under-the-hood.
513 |
514 | Importantly, you only need to use `cytofin_prep_anchors` if you plan to
515 | batch normalize your .fcs files using external anchors identified on
516 | each plate (using `cytofin_normalize`). If you plan to batch normalize
517 | your .fcs files using non-redundancy scores from each sample’s most
518 | stable channels (using `cytofin_normalize_nrs`), you do not need to run
519 | `cytofin_prep_anchors` first.
520 |
521 | ##### Step 2 - Batch normalization
522 |
523 | After the anchors’ summary statistics are computed, batch correction
524 | using external anchors can be performed using
525 | `cytofin_normalize`. This function can perform batch correction using 5
526 | different normalization functions (which we call “modes”).
527 | Specifically, the options are called the “meanshift”, “meanshift\_bulk”,
528 | “variance”, “z-score”, and “beadlike” normalization functions. Which of
529 | these is most applicable to a given analysis will differ from user to
530 | user. We recommend that users try several modes and then manually
531 | inspect/visualize the batch-corrected data in order to determine which
532 | method they prefer.
533 |
534 | To perform batch normalization using external anchors identified on each
535 | plate, use `cytofin_normalize`. This batch normalization strategy
536 | assumes that the anchors on each plate are relatively similar to one
537 | another, and it uses this similarity to adjust the marker expression
538 | measurements on each plate based on how much each plate’s anchor differs
539 | from the other anchors. The `cytofin_normalize` function takes several
540 | required arguments:
541 |
542 | - `metadata_path`: The path to an .xlsx or .csv file
543 | containing a metadata table with information about each file to be
544 | analyzed. This file should be identical to that used for
545 | `cytofin_homogenize`.
546 | - `panel_path`: The path to an .xlsx or .csv file
547 | containing information about the standardized antigen panel in the
548 | homogenized dataset. This file should be identical to that used for
549 | `cytofin_homogenize`.
550 | - `anchor_statistics`: Either a list of numeric values produced by the
551 | `cytofin_prep_anchors` function or a connection leading to an .rds
552 | object containing anchor statistics.
553 | - `input_data_path`: A directory containing the input
554 | .fcs files to be batch normalized. In most cases, this will be the
555 | directory to which the output .FCS files from `cytofin_homogenize`
556 | were written.
557 | - `output_data_path`: A directory where the output
558 | (i.e. batch normalized) .FCS files will be written.
559 | - `mode`: A string indicating which transformation function should be
560 | used for batch normalization (“meanshift”, “meanshift\_bulk”,
561 | “variance”, “z-score”, or “beadlike”).
562 |
563 | In addition to these required arguments, `cytofin_normalize` takes
564 | several optional arguments:
565 |
566 | - `input_prefix`: The string that was prepended to the name of the raw
567 | input .fcs files of `cytofin_homogenize` to create their
568 | corresponding output file names. Defaults to “homogenized\_”.
569 |
570 | - `output_prefix`: The string to be prepended to the name of each input
571 | .fcs file to create the name of the corresponding output file
572 | (post-normalization). Defaults to “normalized\_”.
573 |
574 | - `shift_factor` and `scale_factor`: The scalar values *a* and *b*,
575 | respectively, to be used in the hyperbolic arc-sine function used to
576 | transform CyTOF ion counts according to the following equation:
577 | `new_x <- asinh(a + b * x)`. `shift_factor` defaults to 0 and
578 | `scale_factor` defaults to 0.2, which are customary values used by
579 | most scientists in the CyTOF community.
580 |
581 | Using these arguments, a call to `cytofin_normalize` will perform the
582 | batch correction and save the output (i.e. batch normalized) .fcs files
583 | to the directory specified by `output_data_path`. An example function
584 | call is given here:
585 |
586 | ``` r
587 | output_data_path <-
588 | file.path(base_path, "normalization_results")
589 |
590 | norm_result <-
591 | cytofin_normalize(
592 | metadata_path = metadata_path,
593 | panel_path = panel_path,
594 | anchor_statistics = anchor_statistics,
595 | input_data_path = input_data_path,
596 | output_data_path = output_data_path,
597 | mode = "meanshift"
598 | )
599 | ```
600 |
601 | When this function is called, it has two effects. The first is to save
602 | the batch-normalized output .fcs files to the `output_data_path`
603 | directory. The second is to return a data.frame that stores mean and
604 | variance information about each input file (as well as its associated
605 | anchor) both before and after normalization. This data.frame can be
606 | passed directly into the `cytofin_make_plots` function to return 8
607 | diagnostic plots per sample illustrating the quality of the
608 | normalization:
609 |
610 | ``` r
611 | # we make only the plot for the first input .fcs file
612 | # for illustrative purposes
613 | cytofin_make_plots(
614 | normalization_result = norm_result,
615 | which_rows = 1,
616 | val_path = "none"
617 | )
618 | ```
619 |
620 |
621 |
622 | #### Batch normalization using internal anchors (cytofin\_normalize\_nrs)
623 |
624 | In the event that external anchors are not available, `CytofIn` can use
625 | “internal anchors” within each sample for batch normalization.
626 | Specifically, instead of defining a single external anchor for all the
627 | samples on a given plate like `cytofin_normalize`, the
628 | `cytofin_normalize_nrs` function identifies the most stable channels in
629 | the dataset overall and uses them as internal anchors that are used to
630 | batch normalize all other channels from sample-to-sample. A schematic
631 | diagram of how `cytofin_normalize_nrs` works is provided below:
632 |
633 | 
634 |
635 | In words, to identify the most stable channels in the combined dataset,
636 | `CytofIn` uses a PCA-based non-redundancy score (NRS) as described
637 | before (see [here](https://pubmed.ncbi.nlm.nih.gov/26095251/)). A
638 | minimum of 3 channels should be selected to establish an internal
639 | reference from which signals can be calibrated between CyTOF files.
640 |
641 | To do this, `cytofin_normalize_nrs` takes several of the same arguments
642 | as `cytofin_normalize`, defined as above: `metadata_path`, `panel_path`,
643 | `input_data_path`, `output_data_path`, `input_prefix`, `output_prefix`,
644 | `shift_factor`, and `scale_factor`. In addition, it takes the following
645 | optional arguments:
646 |
647 | - `nchannels`: An integer representing the number of “most stable”
648 | (i.e. with the lowest non-redundancy scores) channels that should be
649 | used for batch normalization. Defaults to 3.
650 |
651 | - `make_plot`: A boolean value representing if, in addition to its
652 | other effects, `cytofin_normalize_nrs` should return a plot
653 | illustrating the distribution of non-redundancy scores for each
654 | channel among all .fcs files being batch normalized. Defaults to
655 | FALSE.
656 |
657 | These arguments can be used in a function call as follows:
658 |
659 | ``` r
660 | # path to save the normalized .fcs files
661 | output_data_path <-
662 | file.path(base_path, "normalization_nrs_results")
663 |
664 | # call function
665 | norm_result_nrs <-
666 | cytofin_normalize_nrs(
667 | metadata_path = metadata_path,
668 | panel_path = panel_path,
669 | input_data_path = input_data_path,
670 | output_data_path = output_data_path,
671 | nchannels = 3,
672 | make_plot = FALSE
673 | )
674 | ```
675 |
676 | Just like `cytofin_normalize` above, `cytofin_normalize_nrs` has several
677 | effects. First, it writes batch-normalized .fcs files to
678 | `output_data_path` and makes a plot depicting sample-wise and
679 | channel-wise non-redundancy scores according to the value of
680 | `make_plot`. In addition, it returns a data.frame that can be passed
681 | into `cytofin_make_plots` to make diagnostic plots regarding the batch
682 | normalization procedure:
683 |
684 | ``` r
685 | # show only 1 set of plots for illustrative purposes
686 | cytofin_make_plots(
687 | normalization_result = norm_result_nrs,
688 | which_rows = 7,
689 | val_path = validation_data_path
690 | )
691 | ```
692 |
693 |
694 |
695 | ## Additional Information
696 |
697 | For questions about the `cytofin` R package, please email
698 | the package maintainers or open a GitHub issue
699 | [here](https://github.com/bennyyclo/Cytofin).
700 |
701 | ``` r
702 | # session information for rendering this README file
703 | sessionInfo()
704 | #> R version 4.0.3 (2020-10-10)
705 | #> Platform: x86_64-apple-darwin17.0 (64-bit)
706 | #> Running under: macOS Big Sur 10.16
707 | #>
708 | #> Matrix products: default
709 | #> BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
710 | #> LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
711 | #>
712 | #> locale:
713 | #> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
714 | #>
715 | #> attached base packages:
716 | #> [1] stats graphics grDevices utils datasets methods base
717 | #>
718 | #> other attached packages:
719 | #> [1] cytofin_0.0.0.9000
720 | #>
721 | #> loaded via a namespace (and not attached):
722 | #> [1] Rcpp_1.0.6 highr_0.9 compiler_4.0.3
723 | #> [4] pillar_1.6.0 cytolib_2.2.1 tools_4.0.3
724 | #> [7] digest_0.6.27 evaluate_0.14 lifecycle_1.0.0
725 | #> [10] tibble_3.1.0 pkgconfig_2.0.3 rlang_0.4.10
726 | #> [13] DBI_1.1.1 yaml_2.2.1 parallel_4.0.3
727 | #> [16] xfun_0.22 dplyr_1.0.5 stringr_1.4.0
728 | #> [19] knitr_1.32 hms_1.0.0 generics_0.1.0
729 | #> [22] S4Vectors_0.28.1 vctrs_0.3.7 stats4_4.0.3
730 | #> [25] tidyselect_1.1.0 glue_1.4.2 Biobase_2.50.0
731 | #> [28] R6_2.5.0 fansi_0.4.2 rmarkdown_2.7
732 | #> [31] readr_1.4.0 tidyr_1.1.3 RProtoBufLib_2.2.0
733 | #> [34] purrr_0.3.4 magrittr_2.0.1 matrixStats_0.58.0
734 | #> [37] htmltools_0.5.1.1 ellipsis_0.3.1 BiocGenerics_0.36.1
735 | #> [40] assertthat_0.2.1 flowCore_2.2.0 utf8_1.2.1
736 | #> [43] stringi_1.5.3 RcppParallel_5.1.2 crayon_1.4.1
737 | ```
738 |
--------------------------------------------------------------------------------