├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   ├── R-CMD-check.yaml
    │   ├── docker.yaml
    │   └── pkgdown.yaml
├── .gitignore
├── ADTnorm.Rproj
├── DESCRIPTION
├── LICENSE.md
├── NAMESPACE
├── R
    ├── ADTnorm.R
    ├── arcsinh_transform.R
    ├── clean_adt_name.R
    ├── data.R
    ├── detect_impute_outlier_valley.R
    ├── get_customize_landmark.R
    ├── get_neighbors.R
    ├── get_peak_midpoint.R
    ├── get_peak_mode.R
    ├── get_valley_location.R
    ├── globals.R
    ├── landmark_fill_na.R
    ├── load_landmarks.R
    ├── peak_alignment.R
    ├── plot_adt_density_each.R
    ├── plot_adt_density_with_peak_valley.R
    └── plot_adt_density_with_peak_valley_each.R
├── README.md
├── data
    ├── cell_x_adt.rda
    └── cell_x_feature.rda
├── inst
    └── Dockerfile
├── man
    ├── .DS_Store
    ├── ADTnorm.Rd
    ├── arcsinh_transform.Rd
    ├── cell_x_adt.Rd
    ├── cell_x_feature.Rd
    ├── clean_adt_name.Rd
    ├── detect_impute_outlier_valley.Rd
    ├── figures
    │   ├── ADTnorm.png
    │   ├── ArcsinhTransform_log10_CD4.png
    │   ├── PublicData_samplelevel_adtnorm.png
    │   ├── PublicData_samplelevel_raw.png
    │   ├── RawCount.png
    │   ├── ShinyR.png
    │   ├── lower_peak_thres.png
    │   ├── multi_sample_per_batch.png
    │   ├── peak_type.png
    │   ├── pipeline_202208.png
    │   ├── shoulder_valley.png
    │   ├── shoulder_valley_CD3.png
    │   ├── target_landmark_location.png
    │   └── valley_density_adjust.png
    ├── get_customize_landmark.Rd
    ├── get_neighbors.Rd
    ├── get_peak_midpoint.Rd
    ├── get_peak_mode.Rd
    ├── get_valley_location.Rd
    ├── landmark_fill_na.Rd
    ├── load_landmarks.Rd
    ├── peak_alignment.Rd
    ├── plot_adt_density_each.Rd
    ├── plot_adt_density_with_peak_valley.Rd
    └── plot_adt_density_with_peak_valley_each.Rd
├── manuscript
    ├── BenchmarkingAnalysis
    │   ├── ComparativeAnalysis
    │   │   ├── WNN_integrateRNA_ADT.R
    │   │   ├── auto_gating.R
    │   │   ├── create_TcellOnly_publicData.R
    │   │   ├── evaluate_norm.R
    │   │   ├── publicData_CITEseq_umap.R
    │   │   └── stain_quality.R
    │   └── ImplementMethods
    │   │   ├── analysis_compareMethods.R
    │   │   ├── arcsinh_centered_scaled_transform.R
    │   │   ├── arcsinh_centered_transform.R
    │   │   ├── arcsinh_clr_transform.R
    │   │   ├── arcsinh_transform.R
    │   │   ├── centered_arcsinh_transform.R
    │   │   ├── clr_transformation.R
    │   │   ├── cytofruv_transform.R
    │   │   ├── decontPro_transform.R
    │   │   ├── fdanorm_transform.R
    │   │   ├── gaussnorm_transform.R
    │   │   ├── get_peak_mode.R
    │   │   ├── get_valley.R
    │   │   ├── harmony_transform.R
    │   │   ├── landmark_fill_na.R
    │   │   ├── sciPENN_PublicDataset.py
    │   │   ├── totalVI_publicDataSet.ipynb
    │   │   └── warpset_transform.R
    ├── COVID19
    │   ├── ADTnorm_parallel.R
    │   ├── RNA_Mono_DE.R
    │   ├── analysis.R
    │   └── auto_gating.R
    ├── HematopoieticStudy
    │   ├── hema_progenitor_titration.R
    │   └── hema_titrated_final.R
    └── TitrationStudy
    │   └── analysis.R
├── tests
    ├── testthat.R
    └── testthat
    │   └── test-ADTnorm.R
└── vignettes
    ├── .RData
    └── ADTnorm-tutorial.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^ADTnorm\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.github$
4 | ^LICENSE\.md$
5 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |       R_KEEP_PKG_SOURCE: yes
17 |     steps:
18 |       - uses: actions/checkout@v2
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::rcmdcheck, any::XML
27 |           needs: check
28 | 
29 |       - uses: r-lib/actions/check-r-package@v2
30 | 


--------------------------------------------------------------------------------
/.github/workflows/docker.yaml:
--------------------------------------------------------------------------------
 1 | # https://github.com/docker/build-push-action
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 | 
 9 | name: docker
10 | 
11 | jobs:
12 |   docker:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Set up QEMU
16 |         uses: docker/setup-qemu-action@v1
17 |       - name: Set up Docker Buildx
18 |         uses: docker/setup-buildx-action@v1
19 |       - name: Login to GitHub Container Registry
20 |         uses: docker/login-action@v1
21 |         with:
22 |           registry: ghcr.io
23 |           username: ${{ github.repository_owner }}
24 |           password: ${{ secrets.GITHUB_TOKEN }}
25 |       - name: Build and push
26 |         id: docker_build
27 |         uses: docker/build-push-action@v2
28 |         with:
29 |           file: ./inst/Dockerfile
30 |           push: true
31 |           tags: |
32 |             ghcr.io/yezhengstat/adtnorm:latest
33 |       - name: Image digest
34 |         run: echo ${{ steps.docker_build.outputs.digest }}
35 | 


--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   workflow_dispatch:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 |     tags:
 7 |       -'*'
 8 | 
 9 | name: pkgdown
10 | 
11 | jobs:
12 |   pkgdown:
13 |     runs-on: macOS-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 |     steps:
17 |       - uses: actions/checkout@v2
18 | 
19 |       - uses: r-lib/actions/setup-r@v2
20 | 
21 |       - uses: r-lib/actions/setup-pandoc@v2
22 | 
23 |       - name: Query dependencies
24 |         run: |
25 |           install.packages('remotes')
26 |           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
27 |           writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
28 |         shell: Rscript {0}
29 | 
30 |       - name: Restore R package cache
31 |         uses: actions/cache@v2
32 |         with:
33 |           path: ${{ env.R_LIBS_USER }}
34 |           key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
35 |           restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
36 | 
37 |       - name: Install dependencies
38 |         run: |
39 |           remotes::install_deps(dependencies = TRUE)
40 |           install.packages("pkgdown", type = "binary")
41 |         shell: Rscript {0}
42 | 
43 |       - name: Install package
44 |         run: R CMD INSTALL .
45 | 
46 |       - name: Deploy package
47 |         run: |
48 |           git config --local user.email "actions@github.com"
49 |           git config --local user.name "GitHub Actions"
50 |           Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | test.R
3 | .Rhistory
4 | 


--------------------------------------------------------------------------------
/ADTnorm.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | LineEndingConversion: Posix
18 | 
19 | BuildType: Package
20 | PackageUseDevtools: Yes
21 | PackageInstallArgs: --no-multiarch --with-keep.source
22 | PackageRoxygenize: rd,collate,namespace
23 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: ADTnorm
 2 | Title: Robust Normalization and Integration of Single-cell Protein Expression across CITE-seq Datasets
 3 | Version: 1.0
 4 | Author: Ye Zheng <yzheng23@fredhutch.org>
 5 | Maintainer: Ye Zheng <yzheng23@fredhutch.org>
 6 | Authors@R: 
 7 |     c(person(given = "Ye",
 8 |            family = "Zheng",
 9 |            role = c("aut", "cre"),
10 |            email = "yzheng23@fredhutch.org",
11 |            comment = c(ORCID = "0000-0002-8806-2761")),
12 |     person(given = "Ju Yeong",
13 |            family = "Kim",
14 |            role = c("ctb"),
15 |            email = "jkim2345@fredhutch.org"),
16 |     person(given = "Daniel",
17 |            family = "Caron",
18 |            role = c("ctb"),
19 |            email = "dpc2136@cumc.columbia.edu"),
20 |     person(given = "Helen",
21 |            family = "Lindsay",
22 |            role = c("ctb"),
23 |            email = "Helen.Lindsay@chuv.ch"))
24 | Description: CITE-seq technology enables the direct measurement of protein expression, known as antibody-derived tags (ADT), in addition to RNA expression. The increase in the copy number of protein molecules leads to a more robust detection of protein features compared to RNA, providing a deep definition of cell types. However, due to added discrepancies of antibodies, such as the different types or concentrations of IgG antibodies, the batch effects of the ADT component of CITE-seq can dominate over biological variations, especially for the across-study integration. We present ADTnorm as a normalization and integration method designed explicitly for the ADT counts of CITE-seq data. Benchmarking with existing scaling and normalization methods, ADTnorm achieves a fast and accurate matching of the negative and positive peaks of the ADT counts across samples, efficiently removing technical variations across batches. Further quantitative evaluations confirm that ADTnorm achieves the best cell-type separation while maintaining the minimal batch effect. Therefore, ADTnorm facilitates the scalable ADT count integration of massive public CITE-seq datasets with distinguished experimental designs, which are essential for creating a corpus of well-annotated single-cell data with deep and standardized annotations.
25 | License: GPL (>= 3)
26 | Encoding: UTF-8
27 | LazyData: true
28 | LazyDataCompression: xz
29 | Depends:
30 |     R (>= 4.0.0)
31 | Imports:
32 |     grDevices,
33 |     stats,
34 |     utils,
35 |     magrittr,
36 |     flowCore,
37 |     flowStats,
38 |     EMDomics,
39 |     fda,
40 |     RColorBrewer,
41 |     ggpubr,
42 |     ggridges,
43 |     shiny,
44 |     DT,
45 |     dplyr,
46 |     tidyr,
47 |     ggplot2
48 | Suggests:
49 |     knitr,
50 |     rmarkdown,
51 |     markdown,
52 |     testthat (>= 3.0.0)
53 | Roxygen: list(markdown = TRUE)
54 | RoxygenNote: 7.2.3
55 | VignetteBuilder: knitr
56 | biocViews: SingleCell
57 | Config/testthat/edition: 3
58 | URL: https://yezhengSTAT.github.io/ADTnorm
59 | BugReports: https://github.com/yezhengSTAT/ADTnorm/issues
60 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(ADTnorm)
 4 | export(arcsinh_transform)
 5 | export(clean_adt_name)
 6 | export(detect_impute_outlier_valley)
 7 | export(get_customize_landmark)
 8 | export(get_neighbors)
 9 | export(get_peak_midpoint)
10 | export(get_peak_mode)
11 | export(get_valley_location)
12 | export(landmark_fill_na)
13 | export(load_landmarks)
14 | export(peak_alignment)
15 | export(plot_adt_density_each)
16 | export(plot_adt_density_with_peak_valley)
17 | export(plot_adt_density_with_peak_valley_each)
18 | import(dplyr)
19 | import(ggplot2)
20 | import(ggridges)
21 | import(shiny)
22 | import(tidyr)
23 | importFrom(magrittr,"%$%")
24 | importFrom(stats,quantile)
25 | 


--------------------------------------------------------------------------------
/R/arcsinh_transform.R:
--------------------------------------------------------------------------------
 1 | #' arcsinh transformation.
 2 | #'
 3 | #' This function transforms the input cell_x_adt matrix by arcsinh with co-factor 5. The definition of this function is x_new <- asinh(a + b * x) + c)
 4 | #' @param cell_x_adt Matrix where rows are cells and columns are ADT markers.
 5 | #' @param parameter_list Parameter list for a: positive double that corresponds to a shift about 0; b: positive double that corresponds to a scale factor; c: positive double. By default a = 1, b = 1/5 and c = 0.
 6 | #' @export
 7 | #' @examples
 8 | #' \dontrun{
 9 | #' arcsinh_transform(cell_x_adt)
10 | #' }
11 | 
12 | arcsinh_transform = function(cell_x_adt = NULL, parameter_list = NULL){
13 |     ## parameters
14 |     a = 1
15 |     b = 1/5
16 |     c = 0
17 |     if(!is.null(parameter_list)){
18 |         if("a" %in% names(parameter_list)){
19 |             a = parameter_list[["a"]]
20 |         }
21 |         if("b" %in% names(parameter_list)){
22 |             b = parameter_list[["b"]]
23 |         }
24 |         if("c" %in% names(parameter_list)){
25 |             c = parameter_list[["c"]]
26 |         }
27 |     }
28 |     ## transformation
29 |     asinhTrans = flowCore::arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c)
30 | 
31 |     ## output
32 |     out = asinhTrans(cell_x_adt)
33 |     return(out)
34 | }
35 | 


--------------------------------------------------------------------------------
/R/clean_adt_name.R:
--------------------------------------------------------------------------------
 1 | #' Clean ADT marker name
 2 | #'
 3 | #' This function enables the general cleaning of ADT marker names. Regardless, users should try cleaning and unifying their ADT marker name first. Please also ensure there is no "/" in the ADT name, such as "TCRγ/δ".
 4 | #' @param adt_name A vector of ADT marker name.
 5 | #' @export
 6 | #' @examples
 7 | #' \dontrun{
 8 | #' clean_adt_name(colnames(cell_x_adt))
 9 | #' }
10 | 
11 | 
12 | clean_adt_name <- function(adt_name) {
13 |     adt_rename <- adt_name %>%
14 |         gsub("CD19-CAR", "CARCD19", .) %>%
15 |         gsub("_PROT", "", .) %>%
16 |         gsub("_TotalSeqB", "", .) %>%
17 |         gsub("_control", "", .) %>%
18 |         gsub("-GA.*", "", .) %>%
19 |         gsub("-GC.*", "", .) %>%
20 |         gsub("-GT.*", "", .) %>%
21 |         gsub("-GG.*", "", .) %>%
22 |         gsub("-CA.*", "", .) %>%
23 |         gsub("-CC.*", "", .) %>%
24 |         gsub("-CT.*", "", .) %>%
25 |         gsub("-CG.*", "", .) %>%
26 |         gsub("-AA.*", "", .) %>%
27 |         gsub("-AC.*", "", .) %>%
28 |         gsub("-AT.*", "", .) %>%
29 |         gsub("-AG.*", "", .) %>%
30 |         gsub("-TA.*", "", .) %>%
31 |         gsub("-TC.*", "", .) %>%
32 |         gsub("-TG.*", "", .) %>%
33 |         gsub("-TT.*", "", .) %>%
34 |         gsub("_.*", "", .) %>%
35 |         gsub("ADT-", "", .)
36 | 
37 |     if (!("CD8" %in% adt_rename)) {
38 |         if ("CD8a" %in% adt_rename | "CD8A" %in% adt_rename) {
39 |             ind <- which(adt_rename == "CD8a" | adt_rename == "CD8A")
40 |             adt_rename[ind] <- "CD8"
41 |         } else if ("CD8b" %in% adt_rename | "CD8B" %in% adt_rename) {
42 |             ind <- which(adt_rename == "CD8B" | adt_rename == "CD8b")
43 |             adt_rename[ind] <- "CD8"
44 |         }
45 |     }
46 |     adt_rename <- replace(adt_rename, adt_rename == "MouseIgG1kappaisotype", "IgG1")
47 |     adt_rename <- replace(adt_rename, adt_rename == "MouseIgG2akappaisotype", "IgG2a")
48 |     adt_rename <- replace(adt_rename, adt_rename == "Mouse IgG2bkIsotype", "IgG2b")
49 |     adt_rename <- replace(adt_rename, adt_rename == "RatIgG2bkIsotype", "IgG2b-Rat")
50 | 
51 |     adt_rename <- replace(adt_rename, adt_rename == "Mouse-IgG1", "IgG1")
52 |     adt_rename <- replace(adt_rename, adt_rename == "Mouse-IgG2a", "IgG2a")
53 |     adt_rename <- replace(adt_rename, adt_rename == "Mouse-IgG2b", "IgG2b")
54 |     adt_rename <- replace(adt_rename, adt_rename == "Rat-IgG2b", "IgG2b-Rat")
55 | 
56 |     adt_rename <- adt_rename %>% gsub("HLA-DR", "HLADR", .) %>% gsub("HLA\\.DR", "HLADR", .) %>% gsub("PD-1", "PD1", .) %>% gsub(" ", "", .) %>% gsub("\\s*\\([^\\)]+\\)", "", .) %>% gsub("CD279", "PD1", .) %>% gsub("CD274", "PDL1", .) %>% gsub("\\.", "", .) %>% gsub("-", "", .) %>% gsub("/", "",.)
57 | 
58 |     return(adt_rename)
59 | }
60 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' A matrix of raw count for the cell by ADT markers
 2 | #'
 3 | #' A dataset containing 422682 cells and 9 ADT markers for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets.
 4 | #'
 5 | #' @format A data frame with 422682 rows and 9 variables:
 6 | #' \describe{
 7 | #'   \item{CD3}{CD3 ADT marker raw count across each cell}
 8 | #'   \item{CD4}{CD4 ADT marker raw count across each cell}
 9 | #'   \item{CD8}{CD8 ADT marker raw count across each cell}
10 | #'   \item{CD14}{CD14 ADT marker raw count across each cell}
11 | #'   \item{CD19}{CD19 ADT marker raw count across each cell}
12 | #'   \item{CD25}{CD25 ADT marker raw count across each cell}
13 | #'   \item{CD45RA}{CD45RA ADT marker raw count across each cell}
14 | #'   \item{CD56}{CD56 ADT marker raw count across each cell}
15 | #'   \item{CD127}{CD127 ADT marker raw count across each cell}
16 | #' }
17 | #' @source See detailed description in the manuscript
18 | "cell_x_adt"
19 | 
20 | #' A matrix of raw count for the cell by features
21 | #'
22 | #' A dataset containing 422682 cells and 7 feature categories for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets.
23 | #'
24 | #' @format A data frame with 422682 rows and 7 variables:
25 | #' \describe{
26 | #'   \item{sample}{Sample name. In this demo data, the sample name is the same as the study_name, assuming that one study is one sample.}
27 | #'   \item{batch}{Batch ID. In this demo data, the batch ID is the same as the study_name.}
28 | #'   \item{sample_status}{Sample status, i.e., Healthy, MALTtumor, HIV Vaccine, Lupus, B-ALL, AML.}
29 | #'   \item{study_name}{Name of the data set/study.}
30 | #'   \item{ADTseqDepth}{Total UMI per cell.}
31 | #'   \item{cell_type_l1}{Broad level of cell type annotation using manual gating.}
32 | #'   \item{cell_type_l2}{Fine level of cell type annotation using manual gating.}
33 | #' }
34 | #' @source See detailed description in the manuscript
35 | "cell_x_feature"


--------------------------------------------------------------------------------
/R/detect_impute_outlier_valley.R:
--------------------------------------------------------------------------------
 1 | #' Identify the valley outliers and impute by valley by closet neighbors.
 2 | #'
 3 | #' This function identifies the valley(s) that tend to be outliers compared to other valley locations and tries to find the closest samples with similar density distribution to impute the valley. If no neighbor sample is detected, the valley will remain as original.
 4 | #' @param valley_location_res Matrix of valley landmark locations with rows being samples and columns being the valleys.
 5 | #' @param adt_marker_select The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker.
 6 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.
 7 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.
 8 | #' @param scale Scale level to defining outlier. A larger scale value corresponds to more severe outliers.
 9 | #' @param method Outlier detection methods, choose from "MAD" (Median Absolute Deviation) or "IQR" (InterQuartile Range). The default is MAD.
10 | #' @param nearest_neighbor_n Number of top nearest neighbor samples to detect.
11 | #' @param nearest_neighbor_threshold Threshold to call neighbor samples.
12 | #' @examples
13 | #' \dontrun{
14 | #' detect_impute_outlier_valley(valley_location_res, cell_x_feature)
15 | #' }
16 | # require(EMDomics)
17 | # require(dplyr)
18 | #' @export
19 | detect_impute_outlier_valley <- function(valley_location_res, adt_marker_select, cell_x_adt, cell_x_feature, scale = 3, method = "MAD", nearest_neighbor_n = 3, nearest_neighbor_threshold = 0.75){
20 | 
21 |     if(!(method %in% c("MAD", "IQR"))){
22 |         stop("Please choose the outlier valley detection method to be 'MAD' or 'IQR'.")
23 |     }
24 |     ## get batch information
25 |     valley_df <- valley_location_res %>% data.frame %>% mutate(sample = rownames(valley_location_res))
26 |     valley_df <- left_join(valley_df, cell_x_feature %>% select(sample, batch) %>% unique, by = "sample")
27 | 
28 |     ## within each batch find the valley outlier and impute by the nearest neighbor samples' valley
29 |     for(batch_each in cell_x_feature$batch %>% unique){
30 | 
31 |         ## get the sample id within each batch
32 |         sample_select <- which(valley_df$batch == batch_each)
33 | 
34 |         if(length(sample_select) > 2){ ## more than two sample per batch
35 |             ## for each valley
36 |             for(c in 1:ncol(valley_location_res)){
37 | 
38 |                 ## choose outlier detection method
39 |                 if(method == "MAD"){
40 |                     row_index <- sample_select[which(abs(valley_location_res[sample_select, c] - stats::median(valley_location_res[sample_select, c], na.rm = TRUE)) > stats::mad(valley_location_res[sample_select, c], na.rm = TRUE) * scale)]
41 | 
42 |                 }else if(method == "IQR"){
43 |                     row_index <- sample_select[which(stats::quantile(valley_location_res[sample_select, c], 0.75, na.rm = TRUE) + scale * stats::IQR(valley_location_res[sample_select, c], na.rm = TRUE) < valley_location_res[sample_select, c])]
44 |                 }else{
45 |                     return("Please select method from MAD or IQR")
46 |                 }
47 | 
48 |                 ## for each detected outlier sample, find the nearest neighbors
49 | 
50 |                 if(length(row_index) > 0){
51 |                     print(paste0("Outlier valley for sample: ", valley_df$sample[row_index], " Valley: ", c))
52 |                     for(target_sample in valley_df$sample[row_index]){
53 |                         ## neighbors at most 3 and earth mover distance <= 0.75 by default
54 |                         target_neighbors <- get_neighbors(target_sample, adt_marker_select, cell_x_adt, cell_x_feature, nearest_neighbor_n = nearest_neighbor_n, nearest_neighbor_threshold = nearest_neighbor_threshold) #valley_df$sample[row_index],
55 |                         print(paste0("Outlier sample ", target_sample, " nearest neighbors: ", target_neighbors))
56 | 
57 |                         ## if there is qualified neighbors to impute
58 |                         ## otherwise, this is a unique sample marker distribution. Leave original valley value.
59 |                         if(length(target_neighbors) > 0){
60 |                             valley_location_res[target_sample, c] <- valley_location_res[target_neighbors, c] %>% stats::median
61 |                         }
62 |                     }
63 |                 }
64 |             }
65 |         }
66 | 
67 |       }
68 |     return(valley_location_res)
69 | }
70 | 


--------------------------------------------------------------------------------
/R/get_customize_landmark.R:
--------------------------------------------------------------------------------
  1 | #' Prompt Shiny browser to manually customize peaks and valleys locations.
  2 | #'
  3 | #' This function will launch a shiny app allowing the user to set the location of peaks and valleys manually. The function will output the landmark positions that the user has set.
  4 | #' @param cell_x_adt_sample Matrix of ADT counts of the selected marker, with columns of sample and batch information for each row of cells.
  5 | #' @param landmark_pos Matrix of landmark location including peaks and valleys.
  6 | #' @param bw Bandwidth for the density plot.
  7 | #' @param adt_marker_select_name The ADT marker needed to be manually processed to set the landmarks.
  8 | #' @param brewer_palettes Set the color scheme of the color brewer. The default is "Set1".
  9 | #' @examples
 10 | #' \dontrun{
 11 | #' get_customize_landmark(
 12 | #' cell_x_adt_sample, 
 13 | #' landmark_pos, 
 14 | #' bw, 
 15 | #' adt_marker_select_name, 
 16 | #' brewer_palettes = "Set1"
 17 | #' )}
 18 | #' @export
 19 | #' @import ggplot2 ggridges shiny
 20 | get_customize_landmark = function(cell_x_adt_sample, landmark_pos, bw, adt_marker_select_name, brewer_palettes = "Set1"){
 21 | 
 22 |   max_value = ceiling(max(cell_x_adt_sample[, 1], na.rm = TRUE)) + 2
 23 |   # cell_x_adt_sample_filter = cell_x_adt_sample %>% dplyr::filter(!is.na(adt))
 24 |   sample_num = levels(cell_x_adt_sample$sample) %>% length
 25 |   plot_height = paste0(sample_num * 50, "px")
 26 |   cell_x_adt_sample$sample_ordered = factor(cell_x_adt_sample$sample, levels = levels(cell_x_adt_sample$sample))
 27 |   ## Create an example UI
 28 |   ui <- create_ui(landmark_pos, max_value, bw, plot_height)
 29 |   server <- create_server(landmark_pos, cell_x_adt_sample, bw, adt_marker_select_name, brewer_palettes, max_value)
 30 | 
 31 |   ## Return user input, vals can change
 32 |   res <- shiny::runApp(shinyApp(ui, server))
 33 |   return(res)
 34 | 
 35 | }
 36 | 
 37 | flat_table <- function(tab) {
 38 |   tmp <- lapply(1:nrow(tab), function(i) {
 39 |     data.frame(
 40 |       peaky = i,
 41 |       y = rownames(tab)[i],
 42 |       x = sapply(1:ncol(tab), function(j) {
 43 |         tab[i,j]
 44 |       }),
 45 |       name = colnames(tab),
 46 |       type = sub("\\d+$", "", colnames(tab))
 47 |     )
 48 |   })
 49 |   do.call(rbind, tmp)
 50 | }
 51 | 
 52 | create_ui <- function(landmark_pos, max_value, bw, plot_height) {
 53 |   ui <- fluidPage(
 54 |     sidebarLayout(
 55 |       sidebarPanel(
 56 |         selectInput(inputId = "landmark_select",
 57 |                     label = "Select the sample to tune:",
 58 |                     choices = rev(rownames(landmark_pos))),
 59 |         uiOutput(outputId = "landmark_slider_ui")
 60 |       ),
 61 |       mainPanel(
 62 |         sliderInput(inputId = "bandwidth_slider",
 63 |                     label = "Bandwidth for Density Visualization Below:",
 64 |                     min = 0,
 65 |                     max = 3,
 66 |                     value = bw,
 67 |                     step = 0.05, width = '1000px'),
 68 |         plotOutput("plot", height = plot_height, width = '1000px'),
 69 |         DT::dataTableOutput("table"),
 70 |         shiny::actionButton("done", "Step 2: Record User Input and Exit")
 71 |       )
 72 |     ))
 73 |   return(ui)
 74 | }
 75 | 
 76 | 
 77 | create_server <- function(landmark_pos, cell_x_adt_sample, bw = 0.1, adt_marker_select_name, brewer_palettes = "Set1", max_value) {
 78 |   
 79 |   fillColor <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(cell_x_adt_sample$batch)))
 80 | 
 81 |   
 82 |   server <- function(input, output, session) {
 83 |     vals <- shiny::reactiveValues(
 84 |       landmark_pos = landmark_pos,
 85 |       slider_values = flat_table(landmark_pos),
 86 |       fig = NULL
 87 |     )
 88 |     
 89 |     output$table <- DT::renderDataTable(vals$landmark_pos %>% DT::datatable() %>% DT::formatRound(columns = colnames(landmark_pos), digits = 3))
 90 | 
 91 |       output$plot <- renderPlot({
 92 |         bw_value <- input$bandwidth_slider
 93 |         # no_mis_idx = !is.na(cell_x_adt_sample$adt)
 94 |         
 95 |         vals$fig <- ggplot(cell_x_adt_sample, aes_string(x = "adt", y = "sample_ordered")) + 
 96 |           ggridges::geom_density_ridges2(aes(fill = factor(batch)), bandwidth = bw_value) +
 97 |           theme_bw(base_size = 20) +
 98 |           ggpubr::rotate_x_text(angle = 90) +
 99 |           ggpubr::rremove("legend") +
100 |           xlab(paste0(adt_marker_select_name, " ADT Counts")) +
101 |           ylab("Sample") +
102 |           scale_fill_manual(values = fillColor) +
103 |           scale_x_continuous(breaks = seq(0, max_value, 0.5)) +
104 |           # geom_point(data = vals$slider_values, aes_string(x = "x", y = "y", shape = "type"), size = 5)
105 |           geom_segment(
106 |               data = vals$slider_values,
107 |               aes(x = x, xend = x, y = peaky, yend = peaky+0.5, color = type),
108 |               size = 1
109 |             ) +
110 |             scale_color_manual(values = c("peak" = "black", "valley" = "grey"))
111 | 
112 |         vals$fig
113 |       })
114 | 
115 |     
116 |     output$landmark_slider_ui <- renderUI({
117 |       if (!is.null(input$landmark_select)) {
118 |         fluidRow(
119 |           column(12,
120 |                  h2(input$landmark_select),
121 |                  lapply(1:ncol(landmark_pos), function(j) {
122 |                    i <- which(rownames(landmark_pos) == input$landmark_select)
123 |                    val <- vals$landmark_pos[i, j]
124 |                    na_val <- is.na(val)
125 |                    val <- ifelse(na_val, 0, val)
126 |                    fluidRow(
127 |                      checkboxInput(
128 |                        inputId = paste0("na_checkbox", j),
129 |                        label = "Set as NA",
130 |                        value = na_val
131 |                      ),
132 |                      sliderInput(
133 |                        inputId = paste0(colnames(landmark_pos)[j]),
134 |                        label = colnames(landmark_pos)[j],
135 |                        min = 0,
136 |                        max = max_value,
137 |                        value = val,
138 |                        step = 0.0001
139 |                      )
140 |                    )
141 |                  })),
142 |           shiny::actionButton("applied", "Step 1: Set Customized Location")
143 |         )
144 |       }
145 |     })
146 |     
147 |     observeEvent(input$applied, {
148 |       i <- which(rownames(landmark_pos) == input$landmark_select)
149 |       for (j in 1:ncol(landmark_pos)) {
150 |         if (input[[paste0("na_checkbox", j)]]) {
151 |           vals$landmark_pos[i, j] <- NA
152 |         } else {
153 |           vals$landmark_pos[i, j] <- input[[paste0(colnames(landmark_pos)[j])]]
154 |         }
155 |       }
156 |       output$table <- DT::renderDataTable(vals$landmark_pos %>% DT::datatable() %>% DT::formatRound(columns = colnames(landmark_pos), digits = 3))
157 |       
158 |       slider_values <- flat_table(vals$landmark_pos)
159 |       
160 |         output$plot <- renderPlot({
161 |           bw_value <- input$bandwidth_slider
162 |           
163 |           vals$fig <- ggplot(cell_x_adt_sample, aes_string(x = "adt", y = "sample_ordered")) + 
164 |             ggridges::geom_density_ridges2(aes(fill = factor(batch)), bandwidth = bw_value) +
165 |             theme_bw(base_size = 20) +
166 |             ggpubr::rotate_x_text(angle = 90) +
167 |             ggpubr::rremove("legend") +
168 |             xlab(paste0(adt_marker_select_name, " ADT Counts")) +
169 |             ylab("Sample") +
170 |             scale_fill_manual(values = fillColor) +
171 |             scale_x_continuous(breaks = seq(0, max_value, 0.5)) + 
172 |             # geom_point(data = slider_values, aes_string(x = "x", y = "y", shape = "type"), size = 5) 
173 |             geom_segment(
174 |               data = slider_values,
175 |               aes(x = x, xend = x, y = peaky, yend = peaky+0.5, color = type),
176 |               size = 1
177 |             ) +
178 |             scale_color_manual(values = c("peak" = "black", "valley" = "grey"))
179 |           vals$fig
180 |           })
181 |     })
182 |     
183 |     shiny::observeEvent(input$done, {
184 |       shiny::stopApp(vals$landmark_pos)
185 |     })
186 |   }
187 |   
188 |   return(server)
189 | 
190 | }
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 


--------------------------------------------------------------------------------
/R/get_neighbors.R:
--------------------------------------------------------------------------------
 1 | #' Find the closest neighbors to impute outlier valleys.
 2 | #'
 3 | #' This function identifies the valley that tends to be outliers compared to other valley locations and tries to find the closest samples that have similar density distribution to input the valley. If no neighbor sample is detected, the valley will remain as original.
 4 | #' @param target_sample The target sample whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample.
 5 | #' @param adt_marker_select The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker.
 6 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.
 7 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.
 8 | #' @param nearest_neighbor_n Number of top nearest neighbor samples to detect.
 9 | #' @param nearest_neighbor_threshold Threshold to call neighbor samples.
10 | #' @export
11 | #' @examples
12 | #' \dontrun{
13 | #' get_neighbors(target_sample, adt_marker_select, cell_x_adt, cell_x_feature)
14 | #' }
15 | # require(EMDomics)
16 | # require(dplyr)
17 | get_neighbors <- function(target_sample, adt_marker_select, cell_x_adt, cell_x_feature, nearest_neighbor_n = 3, nearest_neighbor_threshold = NULL){
18 | 
19 |     knn_res <- c()
20 |     target_cell_ind <- which(cell_x_feature$sample == target_sample)
21 |     sample_list <- setdiff(cell_x_feature$sample %>% unique, target_sample)
22 |     for(sample in sample_list){
23 |          cell_ind <- which(cell_x_feature$sample == sample)
24 |          exp_data <- c(cell_x_adt[target_cell_ind, adt_marker_select], cell_x_adt[cell_ind, adt_marker_select])
25 |          names(exp_data) <- rownames(cell_x_adt)[c(target_cell_ind, cell_ind)]
26 |          labels <- c(rep("target", length(target_cell_ind)), rep("sample", length(cell_ind)))
27 |          names(labels) <- rownames(cell_x_adt)[c(target_cell_ind, cell_ind)]
28 | 
29 |          knn_res <- c(knn_res, EMDomics::calculate_emd_gene(exp_data, labels, names(exp_data)))
30 |     }
31 |     names(knn_res) <- sample_list
32 |     if(is.null(nearest_neighbor_threshold)){
33 |         return(knn_res %>% sort %>% utils::head(nearest_neighbor_n) %>% names)
34 |     }else{
35 |         return(knn_res[knn_res <= nearest_neighbor_threshold] %>% sort %>% utils::head(nearest_neighbor_n) %>% names)
36 |     }
37 | 
38 | 
39 | }
40 | 


--------------------------------------------------------------------------------
/R/get_valley_location.R:
--------------------------------------------------------------------------------
  1 | #' Get the valley landmark locations.
  2 | #'
  3 | #' This function detects the valley locations either between every two peak landmarks or cut at the right heavy tails. If the specified positive uni-peak, the valley location will be set at the left side of the uni-peak.
  4 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.
  5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.
  6 | #' @param adt_marker_select Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.
  7 | #' @param peak_mode_res The peak landmark results coming out of `get_peak_mode` or `get_peak_midpoint` function.
  8 | #' @param shoulder_valley Indictor to specify whether a shoulder valley is expected in case of the heavy right tail where the population of cells should be considered as positive population.
  9 | #' @param positive_peak A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.
 10 | #' @param multi_sample_per_batch Set it to TRUE to discard the positive peak that only appears in one sample per batch (sample number is >=3 per batch).
 11 | #' @param adjust Parameter for `density` function: bandwidth used is actually adjust*bw. This makes it easy to specify values like 'half the default' bandwidth.
 12 | #' @param min_fc Mimimal fold change between the highest peak density height and candidate valley density height. The default is 20.
 13 | #' @param shoulder_valley_slope The slope on the ADT marker density distribution to call shoulder valley.
 14 | #' @param neg_candidate_thres The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95% quantile of IgG samples is large.
 15 | #' @param lower_peak_thres The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid suspicious positive peaks. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19.
 16 | #' @param arcsinh_transform_flag The flag indicates if the input is raw count and arcsinh transformation is implemented.
 17 | #' @examples
 18 | #' \dontrun{
 19 | #' get_valley_location(cell_x_adt, cell_x_feature, peak_mode_res)
 20 | #' }
 21 | #' @export
 22 | #' @importFrom magrittr %$%
 23 | get_valley_location = function(cell_x_adt = NULL, cell_x_feature = NULL, adt_marker_select = NULL, peak_mode_res = NULL, shoulder_valley = TRUE, positive_peak = NULL, multi_sample_per_batch = FALSE, adjust = 1.5, min_fc = 20, shoulder_valley_slope = -1, lower_peak_thres = 0.01, neg_candidate_thres = asinh(10/5 + 1), arcsinh_transform_flag = TRUE) {
 24 | 
 25 |     peak_landmark_list = peak_mode_res
 26 | 
 27 |     ## tag sample if it is the only sample within each batch that have different number of peaks.
 28 |     # batch_num = cell_x_feature$batch %>% unique %>% length
 29 |     tag_row = vector("list", length = nrow(peak_landmark_list))
 30 |     names(tag_row) = rownames(peak_landmark_list)
 31 | 
 32 |     if(multi_sample_per_batch && ncol(peak_landmark_list) > 1){ ## multiple sample per batch remove the positive peak that only appear in one sample
 33 |         for(batch_each in unique(cell_x_feature$batch)){
 34 |             sample_index = cell_x_feature %>% dplyr::filter(batch == batch_each) %$% sample %>% unique
 35 |             ## if there is only one second peak, remove it
 36 |             if(length(sample_index) > 2){
 37 |                 col_index = which(colSums(!is.na(peak_landmark_list[sample_index, ])) == 1) ## which col has only 1 peak per batch
 38 |                 # common_peak_col = which(colSums(!is.na(peak_landmark_list[sample_index, ])) != 1) #ncol(peak_landmark_list[sample_index, ]) - length(col_index)
 39 |                 for(c in col_index){ ## across those columns that only have 1 peak per batch
 40 |                     row_index = sample_index[which(!is.na(peak_landmark_list[sample_index, c]))] ## get the sample name
 41 |                     tag_row[[row_index]] = c(tag_row[[row_index]], c) #common_peak_n #non-zero value, record the peak number that is common across samples in this batch
 42 |                 }
 43 |             }
 44 |         }
 45 |     }
 46 | 
 47 |     valley_location_list = matrix(NA, nrow = nrow(peak_landmark_list), ncol = max(1, ncol(peak_landmark_list) - 1))
 48 |     rownames(valley_location_list) = cell_x_feature$sample %>% levels()
 49 |     for (sample_name in cell_x_feature$sample %>% levels()) {
 50 |         # cell_ind = which(cell_x_feature$sample == sample_name)
 51 |         cell_ind_tmp = which(cell_x_feature$sample == sample_name)
 52 |         cell_notNA = which(!is.na(cell_x_adt[cell_ind_tmp, adt_marker_select]))
 53 |         cell_ind = cell_ind_tmp[cell_notNA]
 54 |         if(length(cell_ind) > 0){
 55 |             peak_landmark = peak_landmark_list[sample_name, ]
 56 |             if(arcsinh_transform_flag){
 57 |                 zero_prop = sum(cell_x_adt[cell_ind, adt_marker_select] < 2) / length(cell_x_adt[cell_ind, adt_marker_select])
 58 |             }else{
 59 |                 zero_prop = sum(cell_x_adt[cell_ind, adt_marker_select] < neg_candidate_thres) / length(cell_x_adt[cell_ind, adt_marker_select])
 60 |             }
 61 | 
 62 |             ## check if user define single peak to be positive peak
 63 |             pos_marker_index = which(paste0("tmpName", positive_peak$ADT_index) == adt_marker_select)
 64 |             pos_sample_index = which(positive_peak$sample == sample_name)
 65 |             if (length(intersect(pos_marker_index, pos_sample_index)) > 0) {
 66 |                 lower_valley = TRUE
 67 |             } else {
 68 |                 lower_valley = FALSE
 69 |             }
 70 | 
 71 | 
 72 |             density_res = stats::density(
 73 |                 cell_x_adt[which(cell_x_feature$sample == sample_name), adt_marker_select],
 74 |                 adjust = adjust, na.rm = TRUE
 75 |             )
 76 |             x = density_res$x
 77 |             y = density_res$y
 78 | 
 79 |             sign_diff = sign(diff(y))
 80 |             diff_sign_diff = diff(sign_diff)
 81 |             peak = which(diff_sign_diff == -2) + 1
 82 |             valley = which(diff_sign_diff == 2) + 1
 83 | 
 84 |             x_valley = x[valley]
 85 |             y_valley = y[valley]
 86 |             x_peak = x[peak]
 87 |             real_peak = peak_landmark[!is.na(peak_landmark)] # peak
 88 |             np = length(real_peak)
 89 | 
 90 |             ## if this sample is tagged, choose the highest peak
 91 |             if (!is.null(tag_row[[sample_name]])) {## there are peaks need to be removed
 92 |                 peak_landmark_y = c()
 93 |                 for (peak_landmark_each in peak_landmark) {
 94 |                     peak_landmark_y = c(peak_landmark_y, y[which.min(abs(x - peak_landmark_each))]) ## get the density value for the peak location
 95 |                 }
 96 |                 common_peak_n = length(peak_landmark_y) - length(tag_row[[sample_name]])
 97 |                 real_peak = real_peak[sort(order(peak_landmark_y, decreasing = T)[1:common_peak_n])] ## get the highest peak
 98 |                 np = length(real_peak) ## update number of peak i.e, np
 99 |                 peak_landmark_list[sample_name, ] = NA
100 |                 peak_landmark_list[sample_name, (1:ncol(peak_landmark_list))[-tag_row[[sample_name]]]] = real_peak ## remove the outlier peak
101 |             }
102 | 
103 | 
104 |             if (np > 1) { ## two peaks or more
105 |                 real_valley = c()
106 |                 for (i in 1:(np - 1)) {
107 |                     tmp_valley = x_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])]
108 |                     tmp_real_valley = tmp_valley[which.min(y_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])])]
109 |                     if(length(tmp_real_valley) == 0){ ## no minimal point between two peak, ensure tmp_real_valley is not empty
110 |                         tmp_real_valley = (real_peak[i] + real_peak[i+1]) / 2
111 |                     }
112 |                     if(i == 1 && shoulder_valley){
113 |                         shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope)
114 |                         first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50
115 |                         x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]]
116 |                         real_valley = c(real_valley, min(x_shoulder, tmp_real_valley, na.rm = T))
117 |                     }else{
118 |                         if(length(tmp_valley) == 0){ ## there is no local minimum that fall within two peaks
119 |                             real_valley = c(real_valley, (real_peak[i] + real_peak[i+1]) / 2) ## use the midpoint of two peak location --- maybe shoulder peak instead?
120 |                         }else{
121 |                             real_valley = c(real_valley, tmp_real_valley) #c(real_valley, tmp_valley[which.min(y_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])])]) ## local minimum based on the density
122 |                         }
123 | 
124 |                     }
125 |                 }
126 |             } else if(np == 1) { ## one peak
127 |                 if (lower_valley == FALSE) { ## one peak is negative peak
128 |                     real_valley =  x_valley[x_valley > real_peak[1] + 0.1][1]  #x[which(y < max(y) / min_fc)[which(y < max(y) / min_fc) > max(which(y == max(y)), which(x > real_peak[1]) %>% min())] %>% min()]
129 |                     if(shoulder_valley){
130 |                         ## if one peak & go for shoulder threshold
131 |                         shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope)
132 |                         first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50
133 |                         x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]]
134 |                         if(is.na(x_shoulder) & is.na(real_valley)){
135 |                             stop("No valley is detected. Please consider increasing 'valley_density_adjust'.")
136 |                         }else{
137 |                             real_valley = min(x_shoulder, real_valley, na.rm = T)
138 |                         }
139 |                         
140 | 
141 |                     }else{
142 |                         ## check if no valley is detected due to shoulder peak
143 |                         # if(length(y_valley[x_valley > real_peak[1]]) == 0 || (y_valley[x_valley >  real_peak[1] + 0.1][1] < 0.05)){
144 |                         #     ## if one peak consider the shoulder point
145 |                         #     shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope)
146 |                         #     first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50
147 |                         #     x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]]
148 |                         #     real_valley = min(x_shoulder, real_valley, na.rm = T)
149 |                         # }
150 |                         if (length(y_valley[!is.na(x_valley) & x_valley > real_peak[1]]) == 0 || (y_valley[!is.na(x_valley) & x_valley > real_peak[1] + 0.1][1] < 0.05)) {
151 |                             shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope)
152 |                             first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50
153 |                             x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]]
154 |                             real_valley = min(x_shoulder, real_valley, na.rm = TRUE)
155 |                         }
156 |                     }
157 |                     if (zero_prop > 0.8) {
158 |                         real_valley = max(neg_candidate_thres, real_valley)
159 |                     }
160 |                 } else { ## one peak is positive peak
161 |                     real_valley = x[which((y < max(y) / min_fc) | (y < lower_peak_thres))[which((y < max(y) / min_fc) | (y < lower_peak_thres))< min(which(y == max(y)), which(x < real_peak[1]) %>% max())] %>% max()]
162 |                     # peak_landmark_list[sample_name, ] = asinh(0/5 + 1)
163 |                     # peak_landmark_list[sample_name, ncol(peak_landmark_list)] = real_peak[1]
164 |                     # real_valley = min(real_valley, min(cell_x_adt[cell_ind, adt_marker_select])) #min(real_valley, asinh(0/5 + 1))
165 |                 }
166 |             } else { ## no peak
167 |                 real_valley = NA
168 |             }
169 |             ## check if no valley is detected due to shoulder peak
170 |             if (length(real_valley) == 0 || all(is.na(real_valley))) {
171 |                 if(length(real_peak) >= 2){ ## midpoint of two peak
172 |                     real_valley = (real_peak[1] + real_peak[2]) / 2
173 |                 }else{## shoulder valley
174 |                     shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope)
175 |                     first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50
176 |                     x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]]
177 |                     if(!is.na(x_shoulder)){
178 |                         real_valley = min(x_shoulder, real_valley, na.rm = T)
179 |                     }
180 |                 }
181 | 
182 |             }
183 |             valley_location_list[sample_name, 1:length(real_valley)] = real_valley
184 |         }
185 | 
186 | 
187 |     }
188 | 
189 |     ## update peak_landmark_list if needed
190 |     if(any(colSums(!is.na(peak_landmark_list)) == 0)){ ## remove column that only has NA values
191 |         col_index = which(colSums(!is.na(peak_landmark_list)) == 0)
192 |         peak_landmark_update = matrix(NA, nrow(peak_landmark_list), ncol(peak_landmark_list) - length(col_index))
193 |         i = 1
194 |         for (c in 1:ncol(peak_landmark_list)) {
195 |             if (!(c %in% col_index)) {
196 |             peak_landmark_update[, i] = peak_landmark_list[, c]
197 |             i = i + 1
198 |             }
199 |         }
200 |         rownames(peak_landmark_update) = rownames(peak_landmark_list)
201 |         peak_landmark_list = peak_landmark_update
202 |     }
203 |     return(list(valley_location_list = valley_location_list, peak_landmark_list = peak_landmark_list))
204 | }
205 | 


--------------------------------------------------------------------------------
/R/globals.R:
--------------------------------------------------------------------------------
1 | utils::globalVariables(c("peakx", "peaky", "peaks", "batch", "counts", "."))
2 | 


--------------------------------------------------------------------------------
/R/landmark_fill_na.R:
--------------------------------------------------------------------------------
 1 | #' Merge locations of peak and valley landmarks
 2 | #'
 3 | #' This function merges the peak and valley landmarks locations and fills in NA if the landmark is not detected.
 4 | #' @param peak_landmark_list Matrix of peak landmark detection results. Rows are samples, and column(s) are the peak locations.
 5 | #' @param valley_landmark_list Matrix of valley landmark detection results. Rows are samples, and column(s) are the valley locations.
 6 | #' @param landmark_align_type Algin the peak and valleys using one of the "negPeak", "negPeak_valley", "negPeak_valley_posPeak", and "valley" alignment modes.
 7 | #' @param midpoint_type Fill in the missing first valley by the midpoint of two positive peaks ("midpoint") or impute by other valleys ("valley").
 8 | #' @param neg_candidate_thres The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95% quantile of IgG samples is large.
 9 | #' @export
10 | #' @examples
11 | #' \dontrun{
12 | #' landmark_fill_na(
13 | #'   peak_landmark_list = peak_landmark_list,
14 | #'   valley_landmark_list = valley_landmark_list,
15 | #'   landmark_align_type = "negPeak_valley_posPeak"
16 | #' )
17 | #' }
18 | landmark_fill_na <-  function(peak_landmark_list = NULL, valley_landmark_list = NULL, landmark_align_type = NULL, midpoint_type = "valley", neg_candidate_thres = asinh(10/5 + 1)){
19 |     if(!landmark_align_type %in% c("negPeak", "negPeak_valley", "negPeak_valley_posPeak", "valley")){
20 |       return("Please provide one of the landmark_align_type from: negPeak, negPeak_valley, negPeak_valley_posPeak, valley")
21 |     }
22 |     if(landmark_align_type == "valley"){
23 |       ## only use the first valley to align
24 |       landmark_matrix <- valley_landmark_list[, 1] %>% t %>% t
25 |       landmark_matrix[is.na(landmark_matrix), 1] <- neg_candidate_thres #2 ## fill in na by background level 2 after arcinsh_b5_a1 transformation
26 |     }else{
27 |       ## involve negative peaks in landmark alignment
28 |       if(ncol(peak_landmark_list) == 1){
29 |         ## only have negative peaks
30 |         landmark_matrix <- cbind(
31 |           peak_landmark_list[, 1],
32 |           valley_landmark_list[, 1]
33 |         )
34 |         ## fill in na
35 |         landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- stats::median(landmark_matrix[!is.na(landmark_matrix[, 1]), 1])
36 |         alter_pos = stats::median(landmark_matrix[!is.na(landmark_matrix[, 2]), 2])
37 |         for(tmpIndex in which(is.na(landmark_matrix[, 2]))){
38 |           if(neg_candidate_thres > landmark_matrix[tmpIndex, 1]){
39 |             landmark_matrix[tmpIndex, 2] <- neg_candidate_thres
40 |           }else{
41 |             landmark_matrix[tmpIndex, 2] <- alter_pos
42 |           }
43 |         }
44 |         # landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- neg_candidate_thres
45 |       }else{
46 |         ## have positive peaks
47 |         if(ncol(peak_landmark_list) > 2 && midpoint_type == "midpoint"){
48 |           landmark_matrix <- cbind(
49 |             peak_landmark_list[, 1],
50 |             rowMeans(valley_landmark_list, na.rm = TRUE),
51 |             peak_landmark_list[, ncol(peak_landmark_list)]
52 |           )
53 |         }else{
54 |           landmark_matrix <- cbind(
55 |             peak_landmark_list[, 1],
56 |             valley_landmark_list[, 1],
57 |             peak_landmark_list[, ncol(peak_landmark_list)]
58 |           )
59 |         }
60 | 
61 |         ## fill in na
62 |         ## fill in valley first
63 |         landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- neg_candidate_thres
64 |         ## due to user_define_peak where unique peak is deemed positive.
65 |         ## fill in either 0.5 or half of the first valley
66 |         landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- landmark_matrix[is.na(landmark_matrix[, 1]), 2]/2 #0.5
67 |         ## fill in the last positive peak: add on the valley using the median distance from the last positive peak to the first valley
68 |         landmark_matrix[is.na(landmark_matrix[, 3]), 3] <- landmark_matrix[is.na(landmark_matrix[, 3]), 2] + stats::median(landmark_matrix[!is.na(landmark_matrix[, 3]), 3] - landmark_matrix[!is.na(landmark_matrix[, 3]), 2])
69 | 
70 |       }
71 |     }
72 | 
73 |     ## only provide negative peak location
74 |     if (landmark_align_type == "negPeak") {
75 |       return(landmark_matrix[, 1] %>% t %>% t)
76 |     }
77 | 
78 |     ## only provide negative peak and valley location
79 |     if (landmark_align_type == "negPeak_valley") {
80 |       return(landmark_matrix[, 1:2])
81 |     }
82 | 
83 |     ## provide negative peak, first valley and last postiive peak location
84 |     return(landmark_matrix)
85 | }
86 | 


--------------------------------------------------------------------------------
/R/load_landmarks.R:
--------------------------------------------------------------------------------
 1 | #' Load saved landmarks into a list of matrices, as required for input into "override_landmark".
 2 | #'
 3 | #' @param dir Directory where landmark location .rds files are stored
 4 | #' @param append_rds Whether to append "/RDS" to the provided directory path. The default is TRUE, which aligns with ADTnorm's default save location.
 5 | #' @export
 6 | #' @examples
 7 | #' \dontrun{
 8 | #' override_landmark = load_landmarks(dir = save_outpath)
 9 | #' }
10 | 
11 | load_landmarks = function(dir, append_rds = TRUE){
12 |     if(append_rds)
13 |         dir <- paste(dir,'RDS',sep='/')
14 |     
15 |     override_landmark <- list()
16 |     for(file in list.files(dir,pattern='^peak_valley_locations_.*.rds',full.names=TRUE)){
17 |         file_items <- utils::tail(unlist(strsplit(file,'/')),1)
18 |         file_items <- unlist(strsplit(file_items,'_'))
19 |         marker <- file_items[4]
20 |         override_landmark[[marker]] <- readRDS(file)
21 |     }
22 |     return(override_landmark)
23 | }


--------------------------------------------------------------------------------
/R/peak_alignment.R:
--------------------------------------------------------------------------------
  1 | #' Align the peak and valley landmarks by the warpset function
  2 | #'
  3 | #' This function monotonously transforms the ADT marker counts to align the landmarks detected in previous steps. By aligning the landmarks, ADTnorm removes the batch effect and allows integration across batches/studies.
  4 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.
  5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.
  6 | #' @param landmark_matrix Matrix of peak and valley landmarks after filling in NA using the `landmark_fill_na` function.
  7 | #' @param target_landmark Leave it as NULL to align the landmark to the mean location across samples. Denote it by a vector of the same length as the column number of the landmark to align the negative peak, valley, and positive peak(s) to the specified fixed location.
  8 | #' @export
  9 | #' @examples
 10 | #' \dontrun{
 11 | #' peak_alignment(cell_x_adt, cell_x_feature, landmark_matrix)
 12 | #' }
 13 | # require(dplyr)
 14 | # require(flowStats)
 15 | # require(fda)
 16 | peak_alignment = function(cell_x_adt, cell_x_feature = NULL, landmark_matrix = NULL, target_landmark = NULL) {
 17 |   ## get parameters
 18 |   grouping = NULL
 19 |   monwrd = TRUE
 20 |   subsample = NULL
 21 |   peakNr = NULL
 22 |   clipRange = 0.01
 23 |   nbreaks = 11 #11
 24 |   bwFac = 2
 25 |   warpFuns = FALSE
 26 |   chunksinze = 10
 27 |   newNcFile = NULL
 28 |   z = NULL
 29 |   nb = 1001
 30 | 
 31 |   exp_data = cell_x_adt
 32 |   cell_x_adt_norm = cell_x_adt
 33 |   samples = levels(cell_x_feature$sample) ## sampleNames(exp_data)
 34 | 
 35 |   ## set up fda parameters
 36 |   extend = 0.15
 37 |   from = min(c(min(cell_x_adt, na.rm = TRUE),  target_landmark[1], min(landmark_matrix))) - diff(range(cell_x_adt, na.rm = TRUE)) * extend
 38 |   to = max(c(max(cell_x_adt, na.rm = TRUE), target_landmark[length(target_landmark)], max(landmark_matrix))) + diff(range(cell_x_adt, na.rm = TRUE)) * extend
 39 |   
 40 |   lower_bound = min(cell_x_adt, na.rm = TRUE) #- diff(range(cell_x_adt, na.rm = TRUE)) * extend
 41 |   upper_bound = max(cell_x_adt, na.rm = TRUE) #+ diff(range(cell_x_adt, na.rm = TRUE)) * extend
 42 |   
 43 |   wbasis = fda::create.bspline.basis(
 44 |     rangeval = c(from, to),
 45 |     norder = 4, breaks = seq(from, to, len = nbreaks)
 46 |   )
 47 |   Wfd0 = fda::fd(matrix(0, wbasis$nbasis, 1), wbasis)
 48 |   WfdPar = fda::fdPar(Wfd0, 1, 1e-5)
 49 | 
 50 | 
 51 |   density_y = c()
 52 |   for(sample in samples){
 53 |     cell_ind_tmp = which(cell_x_feature$sample == sample)
 54 |     cell_ind = cell_ind_tmp[which(!is.na(cell_x_adt[cell_ind_tmp]))]
 55 |     if(length(cell_ind) > 0){
 56 |       density_y = cbind(density_y, stats::density(cell_x_adt[cell_ind], from = from, to = to, n = nb, na.rm = TRUE)$y)
 57 |     }else{
 58 |       density_y = cbind(density_y, rep(NA, nb))
 59 |     }
 60 | 
 61 |   }
 62 |   colnames(density_y) = samples
 63 | 
 64 |   arg_vals = seq(from, to, len = nb)
 65 |   fdobj = fda::Data2fd(arg_vals, density_y, wbasis)
 66 | 
 67 |   if (ncol(landmark_matrix) == 1) { ## only one peak no valley: offset
 68 |     offsets = landmark_matrix - stats::median(landmark_matrix, na.rm = TRUE)
 69 |     names(offsets) = samples
 70 |     funs = funsBack = vector("list", nrow(landmark_matrix))
 71 |     names(funs) = samples
 72 |     names(funsBack) = samples
 73 |     for (j in seq_along(funs)) {
 74 |       funs[[samples[j]]] = function(x) x - z
 75 |       e1 = new.env(hash = TRUE)
 76 |       e1$z = offsets[samples[j]]
 77 |       environment(funs[[samples[j]]]) = e1
 78 |       funsBack[[samples[j]]] = function(x) x + z
 79 |       e2 = new.env(hash = TRUE)
 80 |       e2$z = offsets[samples[j]]
 81 |       environment(funsBack[[samples[j]]]) = e2
 82 |     }
 83 |   } else { ## more than one landmark: warping
 84 |     ## if any valley is beyond the upper bound of range, replace by the upper bound
 85 |     if(any(landmark_matrix[, 2] > upper_bound)){
 86 |       landmark_matrix[which(landmark_matrix[, 2] > upper_bound), 2] = max(cell_x_adt, na.rm = TRUE) 
 87 |       print(paste0("Warning: some valley landmarks are larger the upper bound of the range. They are replaced by the maximum value of cell_x_adt. Please consider reduce 'neg_candidate_thres' value."))
 88 |     }
 89 |     if(any(landmark_matrix[, 1] < lower_bound)){
 90 |       landmark_matrix[which(landmark_matrix[, 1] < lower_bound), 1] = min(cell_x_adt, na.rm = TRUE) 
 91 |       print("Warning: some valley landmarks are smaller than the lower bound of the range. They are replaced by the minimum value of cell_x_adt.")
 92 |     }
 93 |     args = list("unregfd" = fdobj, "fdobj"=fdobj, "ximarks"=landmark_matrix, "WfdPar"=WfdPar, "monwrd"=monwrd)
 94 |     if(!is.null(target_landmark)){
 95 |       args[['x0marks']] = target_landmark
 96 |     }else{
 97 |       args[['x0marks']] = colMeans(landmark_matrix, na.rm = TRUE)
 98 |     }
 99 |     args_run = args[intersect(names(formals(fda::landmarkreg)), names(args))]
100 |     regDens = do.call(fda::landmarkreg, args_run, quote = TRUE)
101 | 
102 |     # if(is.null(target_landmark)){
103 |     #   regDens = fda::landmarkreg(fdobj, landmark_matrix, WfdPar = WfdPar, monwrd = monwrd)
104 |     # }else{
105 |     #   regDens = fda::landmarkreg(fdobj, landmark_matrix, x0marks = target_landmark, WfdPar = WfdPar, monwrd = monwrd)
106 |     # }
107 |     # if(is.null(target_landmark)){
108 |     #   regDens = fda::landmarkreg(fdobj, landmark_matrix, WfdPar = WfdPar)
109 |     # }else{
110 |     #   regDens = fda::landmarkreg(fdobj, landmark_matrix, x0marks = target_landmark, WfdPar = WfdPar)
111 |     # }
112 |     warpfdobj = regDens$warpfd
113 |     warpedX = fda::eval.fd(warpfdobj, arg_vals)
114 |     warpedX[1, ] = utils::head(arg_vals, 1)
115 |     warpedX[nrow(warpedX), ] = utils::tail(arg_vals, 1)
116 |     funs = apply(warpedX, 2, stats::approxfun, arg_vals)
117 |     funsBack = apply(warpedX, 2, function(a, b) stats::approxfun(b, a), arg_vals)
118 |   }
119 | 
120 | 
121 |   names(funs) = names(funsBack) = samples
122 | 
123 |   warped_landmark_matrix = landmark_matrix
124 |   leftBoard = rightBoard = vector("list", length(funs))
125 |   newRange = c(Inf, -Inf)
126 | 
127 |   ## transform the raw data using the warping functions
128 |   for (i in seq_along(funs)) {
129 |     # cell_index = which(cell_x_feature$sample == samples[i])
130 |     cell_ind_tmp = which(cell_x_feature$sample == samples[i])
131 |     cell_index = cell_ind_tmp[which(!is.na(cell_x_adt[cell_ind_tmp]))]
132 |     if(length(cell_index) > 0){
133 |       thisDat = t(t(cell_x_adt[cell_index]))
134 |       newDat = as.matrix(funs[[i]](thisDat))
135 |       newDat[is.na(newDat)] = thisDat[is.na(newDat)]
136 |       cell_x_adt_norm[cell_index] = newDat
137 |       warped_landmark_matrix[i, ] = funs[[i]](landmark_matrix[i, ])
138 |     }else{
139 |       warped_landmark_matrix[i, ] = NA
140 |     }
141 | 
142 |   }
143 |   return(list(cell_x_adt_norm = cell_x_adt_norm, landmark_matrix_norm = warped_landmark_matrix))
144 | }
145 | 


--------------------------------------------------------------------------------
/R/plot_adt_density_each.R:
--------------------------------------------------------------------------------
 1 | #' Plot the expression density profile for ONE ADT marker
 2 | #'
 3 | #' This function plots the ADT expression density profile for only one ADT marker. Each track is a sample. Color by batch
 4 | #' @param adt_count Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format.
 5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.
 6 | #' @param brewer_palettes Set the color scheme of the color brewer.
 7 | #' @param parameter_list Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.
 8 | #' @export
 9 | #' @examples
10 | #' \dontrun{
11 | #' plot_adt_density_each(
12 | #'  cell_x_adt,
13 | #'  cell_x_feature,
14 | #'  brewer_palettes = "Set1",
15 | #'  parameter_list = list(bw = 0.1, run_label = "ADTnorm")
16 | #' )
17 | #' }
18 | # require(ggplot2)
19 | # require(RColorBrewer)
20 | # require(tidyr)
21 | # require(ggridges)
22 | # require(ggpubr)
23 | plot_adt_density_each = function(adt_count, cell_x_feature, brewer_palettes, parameter_list = NULL) {
24 |     if (is.null(parameter_list)) {
25 |         return("parameter_list is NULL!")
26 |     }
27 |     parameter_list_name = names(parameter_list)
28 | 
29 |     run_label = ""
30 |     bw = 1
31 |     if (!is.null(parameter_list)) {
32 |         if ("run_label" %in% parameter_list_name) {
33 |             run_label = parameter_list[["run_label"]]
34 |         }
35 |         if("bw" %in% parameter_list_name){
36 |             bw = parameter_list$bw
37 |         }
38 |     }
39 | 
40 |     # peak_landmark_list = parameter_list$peak_landmark_list
41 |     # valley_landmark_list = parameter_list$valley_landmark_list
42 |     # brewer_palettes = parameter_list$brewer_palettes
43 | 
44 |     # If there is no batch, add a dummy variable
45 |     if (! "batch" %in% colnames(cell_x_feature)){ cell_x_feature$batch <- 1 }
46 |     
47 |     tmpProfile = data.frame(counts = adt_count) %>%
48 |         mutate(
49 |             sample = rep(cell_x_feature$sample, 1),
50 |             batch = rep(cell_x_feature$batch, 1)
51 |         )
52 | 
53 |     # peak_location = list()
54 |     # valley_location = list()
55 |     # for (i in 1:ncol(peak_landmark_list)) {
56 |     #     peak_location[[i]] = data.frame(
57 |     #         # ADT = adt_marker_select,
58 |     #         sample = cell_x_feature$sample %>% levels(),
59 |     #         peakx = peak_landmark_list[, i],
60 |     #         peaky = 0.5,
61 |     #         peaks = 1:length(levels(cell_x_feature$sample))
62 |     #     )
63 |     #     if (i <= ncol(valley_landmark_list)) {
64 |     #         valley_location[[i]] = data.frame(
65 |     #             # ADT = adt_marker_select,
66 |     #             sample = cell_x_feature$sample %>% levels(),
67 |     #             peakx = valley_landmark_list[, i],
68 |     #             peaky = 0.5,
69 |     #             peaks = 1:length(levels(cell_x_feature$sample))
70 |     #         )
71 |     #     }
72 |     # }
73 |     fillColor = grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(tmpProfile$batch)))
74 |     resPlot = ggplot(tmpProfile, aes(x = counts, y = sample)) +
75 |         ggridges::geom_density_ridges(aes(fill = factor(batch)), bandwidth = bw) +
76 |         # geom_segment(data = peak_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) +
77 |         # geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") +
78 |         # facet_wrap(~ factor(ADT), scales = "free_x") +
79 |         theme_bw(base_size = 20) +
80 |         xlab(run_label) +
81 |         ylab("") +
82 |         ggpubr::rotate_x_text(angle = 90) +
83 |         ggpubr::rremove("legend") +
84 |         scale_fill_manual(values = fillColor) +
85 |         ggpubr::rremove("legend.title")
86 | 
87 | 
88 |     return(resPlot)
89 | }
90 | 


--------------------------------------------------------------------------------
/R/plot_adt_density_with_peak_valley.R:
--------------------------------------------------------------------------------
  1 | #' Plot ADT marker expression density profile with identified peak and valley locations
  2 | #'
  3 | #' This function plots the ADT expression density profile with identified peak and valley locations. Each panel is an ADT marker, and each track is a sample. Color by batch
  4 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.
  5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.
  6 | #' @param adt_marker_select The target ADT marker(s) that the denstiy plot is about. Leave it NULL will generate figures for all the ADT markers available in cell_x_adt.
  7 | #' @param peak_landmark_list Matrix of peak landmark locations with rows being samples and columns being the peaks.
  8 | #' @param valley_landmark_list Matrix of valley landmark locations with rows being samples and columns being the valleys.
  9 | #' @param brewer_palettes Set the color scheme of the color brewer.
 10 | #' @param parameter_list Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.
 11 | #' @export
 12 | #' @examples
 13 | #' \dontrun{
 14 | #' plot_adt_density_with_peak_valley(
 15 | #'   cell_x_adt,
 16 | #'   cell_x_feature,
 17 | #'   adt_marker_select = c("CD3", "CD4", "CD8", "CD19"),
 18 | #'   peak_landmark_list = peak_mode_norm_res,
 19 | #'   valley_landmark_list = valley_location_norm_res,
 20 | #'   brewer_palettes = "Set1",
 21 | #'   parameter_list = list(bw = 0.1, run_label = "ADTnorm")
 22 | #' )
 23 | #' }
 24 | # require(ggplot2)
 25 | # require(RColorBrewer)
 26 | # require(tidyr)
 27 | # require(ggridges)
 28 | # require(ggpubr)
 29 | plot_adt_density_with_peak_valley = function(cell_x_adt, cell_x_feature, adt_marker_select = NULL, peak_landmark_list, valley_landmark_list = NULL, brewer_palettes = "Set1", parameter_list = NULL) {
 30 |     if (is.null(parameter_list)) {
 31 |         return("parameter_list is NULL!")
 32 |     }
 33 |     parameter_list_name = names(parameter_list)
 34 | 
 35 |     run_label = ""
 36 |     bw = 1
 37 |     if (!is.null(parameter_list)) {
 38 |         if ("run_label" %in% parameter_list_name) {
 39 |             run_label = parameter_list[["run_label"]]
 40 |         }
 41 |         if("bw" %in% parameter_list_name){
 42 |             bw = parameter_list$bw
 43 |         }
 44 |     }
 45 | 
 46 |     # peak_landmark_list = parameter_list$peak_landmark_list
 47 |     # valley_landmark_list = parameter_list$valley_landmark_list
 48 |     # brewer_palettes = parameter_list$brewer_palettes
 49 |     if(is.null(adt_marker_select)){
 50 |         adt_marker_select = colnames(cell_x_adt)
 51 |     }
 52 |     
 53 |     # If there is no batch, add a dummy variable
 54 |     if (! "batch" %in% colnames(cell_x_feature)){ cell_x_feature$batch <- 1 }
 55 |     
 56 |     tmpProfile = cell_x_adt %>% data.frame %>%
 57 |         dplyr::select(all_of(adt_marker_select)) %>%
 58 |         data.frame() %>%
 59 |         tidyr::gather(key = "ADT", value = "counts") %>%
 60 |         mutate(
 61 |             sample = rep(cell_x_feature$sample, length(adt_marker_select)),
 62 |             batch = rep(cell_x_feature$batch, length(adt_marker_select))
 63 |         )
 64 | 
 65 |     peak_location = list()
 66 |     valley_location = list()
 67 |     for (i in 1:ncol(peak_landmark_list)) {
 68 |         peak_location[[i]] = data.frame(
 69 |             ADT = adt_marker_select,
 70 |             sample = cell_x_feature$sample %>% levels(),
 71 |             peakx = peak_landmark_list[, i],
 72 |             peaky = 0.5,
 73 |             peaks = 1:length(levels(cell_x_feature$sample))
 74 |         )
 75 |         if ((!is.null(valley_landmark_list)) && i <= ncol(valley_landmark_list)) {
 76 |             valley_location[[i]] = data.frame(
 77 |                 ADT = adt_marker_select,
 78 |                 sample = cell_x_feature$sample %>% levels(),
 79 |                 peakx = valley_landmark_list[, i],
 80 |                 peaky = 0.5,
 81 |                 peaks = 1:length(levels(cell_x_feature$sample))
 82 |             )
 83 |         }
 84 |     }
 85 |     fillColor = grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(tmpProfile$batch)))
 86 | 
 87 |     resPlot = ggplot(tmpProfile, aes(x = counts, y = sample)) +
 88 |         ggridges::geom_density_ridges(aes(fill = factor(batch)), bandwidth = bw) +
 89 |         geom_segment(data = peak_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) +
 90 |         # geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") +
 91 |         facet_wrap(~ factor(ADT), scales = "free_x") +
 92 |         theme_bw(base_size = 20) +
 93 |         xlab(run_label) +
 94 |         ylab("") +
 95 |         ggpubr::rotate_x_text(angle = 90) +
 96 |         ggpubr::rremove("legend") +
 97 |         scale_fill_manual(values = fillColor) +
 98 |         ggpubr::rremove("legend.title")
 99 | 
100 |     if(is.null(valley_landmark_list)){
101 |         resPlot = resPlot + geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey")
102 |     }
103 | 
104 |     if (ncol(peak_landmark_list) >= 2) {
105 |         resPlot = resPlot +
106 |             geom_segment(data = peak_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1)
107 |     }
108 | 
109 |     if (ncol(peak_landmark_list) >= 3) {
110 |         resPlot = resPlot +
111 |             geom_segment(data = peak_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) +
112 |             geom_segment(data = valley_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey")
113 |     }
114 |     if (ncol(peak_landmark_list) >= 4) {
115 |         resPlot = resPlot +
116 |             geom_segment(data = peak_location[[4]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) +
117 |             geom_segment(data = valley_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey")
118 |     }
119 | 
120 |     return(resPlot)
121 | }
122 | 


--------------------------------------------------------------------------------
/R/plot_adt_density_with_peak_valley_each.R:
--------------------------------------------------------------------------------
  1 | #' Plot the expression density profile for ONE ADT marker with identified peak and valley locations
  2 | #'
  3 | #' This function plots the ADT expression density profile with identified peak and valley locations for only one ADT marker. Each track is a sample. Color by batch
  4 | #' @param adt_count Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format.
  5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.
  6 | #' @param peak_landmark_list Matrix of peak landmark locations with rows being samples and columns being the peaks.
  7 | #' @param valley_landmark_list Matrix of valley landmark locations with rows being samples and columns being the valleys.
  8 | #' @param brewer_palettes Set the color scheme of the color brewer.
  9 | #' @param parameter_list Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.
 10 | #' @export
 11 | #' @examples
 12 | #' \dontrun{
 13 | #' plot_adt_density_with_peak_valley_each(
 14 | #'   cell_x_adt,
 15 | #'   cell_x_feature,
 16 | #'   peak_landmark_list = peak_mode_norm_res,
 17 | #'   valley_landmark_list = valley_location_norm_res,
 18 | #'   brewer_palettes = "Set1",
 19 | #'   parameter_list = list(bw = 0.1, run_label = "ADTnorm")
 20 | #' )
 21 | #' }
 22 | # require(ggplot2)
 23 | # require(RColorBrewer)
 24 | # require(tidyr)
 25 | # require(ggridges)
 26 | # require(ggpubr)
 27 | plot_adt_density_with_peak_valley_each = function(adt_count, cell_x_feature, peak_landmark_list, valley_landmark_list = NULL, brewer_palettes = "Set1", parameter_list = NULL) {
 28 |     if (is.null(parameter_list)) {
 29 |         return("parameter_list is NULL!")
 30 |     }
 31 |     parameter_list_name = names(parameter_list)
 32 | 
 33 |     run_label = ""
 34 |     bw = 1
 35 |     if (!is.null(parameter_list)) {
 36 |         if ("run_label" %in% parameter_list_name) {
 37 |             run_label = parameter_list[["run_label"]]
 38 |         }
 39 |         if("bw" %in% parameter_list_name){
 40 |             bw = parameter_list$bw
 41 |         }
 42 |     }
 43 | 
 44 |     # peak_landmark_list = parameter_list$peak_landmark_list
 45 |     # valley_landmark_list = parameter_list$valley_landmark_list
 46 |     # brewer_palettes = parameter_list$brewer_palettes
 47 |   
 48 |     # If there is no batch, add a dummy variable
 49 |     if (! "batch" %in% colnames(cell_x_feature)){ cell_x_feature$batch <- 1 }
 50 |       
 51 |     tmpProfile = data.frame(counts = adt_count) %>%
 52 |         mutate(
 53 |             sample = rep(cell_x_feature$sample, 1),
 54 |             batch = rep(cell_x_feature$batch, 1)
 55 |         ) #%>% dplyr::filter(!is.na(counts))
 56 | 
 57 |     peak_location = list()
 58 |     valley_location = list()
 59 |     for (i in 1:ncol(peak_landmark_list)) {
 60 |         peak_location[[i]] = data.frame(
 61 |             # ADT = adt_marker_select,
 62 |             sample = cell_x_feature$sample %>% levels(),
 63 |             peakx = peak_landmark_list[, i],
 64 |             peaky = 0.5,
 65 |             peaks = 1:length(levels(cell_x_feature$sample))
 66 |         )
 67 |         if ((!is.null(ncol(valley_landmark_list))) && i <= ncol(valley_landmark_list)) {
 68 |             valley_location[[i]] = data.frame(
 69 |                 # ADT = adt_marker_select,
 70 |                 sample = cell_x_feature$sample %>% levels(),
 71 |                 peakx = valley_landmark_list[, i],
 72 |                 peaky = 0.5,
 73 |                 peaks = 1:length(levels(cell_x_feature$sample))
 74 |             )
 75 |         }
 76 |     }
 77 |     fillColor = grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(tmpProfile$batch)))
 78 |     resPlot = ggplot(tmpProfile, aes(x = counts, y = sample)) +
 79 |         ggridges::geom_density_ridges(aes(fill = factor(batch)), bandwidth = bw) +
 80 |         geom_segment(data = peak_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) +
 81 |         # geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") +
 82 |         # facet_wrap(~ factor(ADT), scales = "free_x") +
 83 |         theme_bw(base_size = 20) +
 84 |         xlab(run_label) +
 85 |         ylab("") +
 86 |         ggpubr::rotate_x_text(angle = 90) +
 87 |         ggpubr::rremove("legend") +
 88 |         scale_fill_manual(values = fillColor) +
 89 |         ggpubr::rremove("legend.title")
 90 | 
 91 |     if(!is.null(valley_landmark_list)){
 92 |         resPlot = resPlot + 
 93 |         geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") 
 94 |     }
 95 | 
 96 |     if (ncol(peak_landmark_list) >= 2) {
 97 |         resPlot = resPlot +
 98 |             geom_segment(data = peak_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1)
 99 |     }
100 | 
101 |     if (ncol(peak_landmark_list) >= 3) {
102 |         resPlot = resPlot +
103 |             geom_segment(data = peak_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) +
104 |             geom_segment(data = valley_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey")
105 |     }
106 |     if (ncol(peak_landmark_list) >= 4) {
107 |         resPlot = resPlot +
108 |             geom_segment(data = peak_location[[4]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) +
109 |             geom_segment(data = valley_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey")
110 |     }
111 | 
112 |     return(resPlot)
113 | }
114 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ADTnorm
  2 | 
  3 | <!-- badges: start -->
  4 | [![R-CMD-check](https://github.com/yezhengSTAT/ADTnorm/workflows/R-CMD-check/badge.svg)](https://github.com/yezhengSTAT/ADTnorm/actions)
  5 | [![docker](https://github.com/yezhengSTAT/ADTnorm/workflows/docker/badge.svg)](https://github.com/yezhengSTAT/ADTnorm/pkgs/container/adtnorm)
  6 | <!-- badges: end -->
  7 | 
  8 | ## What is `ADTnorm`?
  9 | 
 10 | CITE-seq enables paired measurement of surface protein and mRNA expression in single cells using antibodies conjugated to oligonucleotide tags. Due to the high copy number of surface protein molecules, sequencing antibody-derived tags (ADTs) allows for robust protein detection, improving cell-type identification. However, variability in antibody staining leads to batch effects in the ADT expression, obscuring biological variation, reducing interpretability, and obstructing cross-study analyses. Here, we present ADTnorm, a normalization and integration method designed explicitly for ADT abundance. Benchmarking against 14 existing scaling and normalization methods, we show that ADTnorm accurately aligns populations with negative- and positive-expression of surface protein markers across 13 public datasets, effectively removing technical variation across batches and improving cell-type separation. ADTnorm enables efficient integration of public CITE-seq datasets, each with unique experimental designs, paving the way for atlas-level analyses. Beyond normalization, ADTnorm aids in automated threshold-gating as well as assessment of antibody staining quality for titration optimization and antibody panel selection. 
 11 | 
 12 | This repository is the ADTnorm R package. We also provide a [Python wrapper](https://github.com/donnafarberlab/ADTnormPy) by [Daniel P. Caron](https://github.com/Daniel-Caron).
 13 | 
 14 | Manuscript: [Zheng et al. ADTnorm: Robust Integration of Single-cell Protein Measurement across CITE-seq Datasets. BioRxiv. 2024](https://www.biorxiv.org/content/10.1101/2022.04.29.489989v2)
 15 | 
 16 | 
 17 | ## ADT Normalization Pipeline
 18 | 
 19 | <img src="./man/figures/pipeline_202208.png" alt="ADTnorm" width="600px">
 20 | 
 21 | 
 22 | ## Installation
 23 | 
 24 | ``` R
 25 | # install.packages("remotes")
 26 | remotes::install_github("yezhengSTAT/ADTnorm", build_vignettes = FALSE)
 27 | ```
 28 | 
 29 | ### Using Docker
 30 | 
 31 | There are many dependencies in `ADTnorm`, so it takes a long time to install them all. Instead, you can use the Docker image of `ADTnorm`.
 32 | 
 33 | ``` sh
 34 | docker pull ghcr.io/yezhengstat/adtnorm:latest
 35 | docker run \
 36 |   -it \
 37 |   --user rstudio \
 38 |   --volume <yourDataDirectory>:/home/rstudio/data \
 39 |   yezhengstat/adtnorm:latest \
 40 |   R
 41 | ```
 42 | 
 43 | Replace `<yourDataDirectory>` with the local directory path (absolute path) where you have the input data and would like to store the output files. For more information on using docker containers, please read [this documentation](https://github.com/Bioconductor/bioconductor_docker/blob/master/README.md#using-the-containers) by Bioconductor.
 44 | 
 45 | 
 46 | ## Input Data
 47 | 
 48 | The 13 public datasets used in the [manuscript](https://www.biorxiv.org/content/10.1101/2022.04.29.489989v2) are also included in the R package as a demo data set. They can be loaded by
 49 | 
 50 | ```{r loaddata, eval = FALSE}
 51 | data(cell_x_adt)
 52 | data(cell_x_feature) 
 53 | ```
 54 | 
 55 | - ```cell_x_adt``` contains raw counts for ADT markers in each cell.  It is a data frame with 422682 cells (rows) and 9 ADT markers (columns): CD3, CD4, CD8, CD14, CD19, CD25, CD45RA, CD56, CD127.
 56 | 
 57 | ```
 58 |   CD3  CD4 CD8 CD14 CD19 CD25 CD45RA CD56 CD127
 59 | 1  18  138  13  491    3    9    110   17     7
 60 | 2  30  119  19  472    3    5    125  248     8
 61 | 3  18  207  10 1289    8   15   5268   26    12
 62 | 4  18   11  17   20    5   15   4743  491    16
 63 | 5   5   14  14   19    4   16   4108  458    17
 64 | 6  21 1014  29 2428    7   52    227   29    15
 65 | ```
 66 | 
 67 | - ```cell_x_feature``` is a data frame with 422682 cells (rows) and 7  feature variables (columns):
 68 |   
 69 |     - sample: Sample name used in original data of each study.
 70 |     
 71 |     - batch: Batch information provided from each study.
 72 |     
 73 |     - sample_status: Sample status, i.e., Healthy, MALTtumor, HIV Vaccine, Lupus, B-ALL, AML.
 74 | 
 75 |     - study_name: Name of the data set/study.
 76 | 
 77 |     - ADTseqDepth: Total UMI per cell.
 78 |     
 79 |     - cell_type_l1: Broad level of cell type annotation using manual gating.
 80 |     
 81 |     - cell_type_l2: Fine level of cell type annotation using manual gating.
 82 | 
 83 | 
 84 | ```
 85 |                 sample               batch sample_status   study_name
 86 | 1 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1       healthy 10X_pbmc_10k
 87 | 2 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1       healthy 10X_pbmc_10k
 88 | 3 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1       healthy 10X_pbmc_10k
 89 | 4 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1       healthy 10X_pbmc_10k
 90 | 5 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1       healthy 10X_pbmc_10k
 91 | 6 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1       healthy 10X_pbmc_10k
 92 |   ADTseqDepth cell_type_l1       cell_type_l2
 93 | 1         981    monocytes classical monocyte
 94 | 2        1475    monocytes classical monocyte
 95 | 3        7149    monocytes classical monocyte
 96 | 4        6831           NK           CD16+ NK
 97 | 5        6839           NK           CD16+ NK
 98 | 6        4720    monocytes classical monocyte
 99 | ```
100 | 
101 | ## Usage
102 | 
103 | 
104 | **For more detailed and typical parameter tuning examples, please visit [tutorial website](https://yezhengstat.github.io/ADTnorm/articles/ADTnorm-tutorial.html). We will illustrate using the demo data.**
105 | 
106 | 
107 | ### Case 1. Consider one study as a sample and normalize across studies.
108 | ```R
109 | library(ADTnorm)
110 | save_outpath <- "/path/to/output/location"
111 | run_name <- "ADTnorm_demoRun"
112 | data(cell_x_adt)
113 | data(cell_x_feature) 
114 | 
115 | cell_x_feature$sample = factor(cell_x_feature$study_name) ## consider each study as one sample
116 | cell_x_feature$batch = factor(cell_x_feature$study_name) ## consider each study as a batch
117 | 
118 | cell_x_adt_norm <- ADTnorm(
119 |   cell_x_adt = cell_x_adt, 
120 |   cell_x_feature = cell_x_feature,
121 |   save_outpath = save_outpath, 
122 |   study_name = run_name, 
123 |   marker_to_process = c("CD3", "CD4", "CD8", "CD45RA"), 
124 |   trimodal_marker = c("CD4", "CD45RA"), 
125 |   positive_peak = list(ADT = "CD3", sample = "buus_2021_T"),
126 |   save_fig = TRUE
127 | )
128 | ```
129 | 
130 | ### Case 2. Consider each healthy donor/patient per time point/condition/response/etc as one sample and normalize across the individual sample. 
131 | ``` R
132 | library(ADTnorm)
133 | save_outpath <- "/path/to/output/location"
134 | run_name <- "ADTnorm_demoRun"
135 | data(cell_x_adt)
136 | data(cell_x_feature) 
137 | 
138 | cell_x_feature$batch = factor(cell_x_feature$study_name) ## consider each study as a batch
139 | 
140 | cell_x_adt_norm <- ADTnorm(
141 |   cell_x_adt = cell_x_adt, 
142 |   cell_x_feature = cell_x_feature,
143 |   save_outpath = save_outpath, 
144 |   study_name = run_name, 
145 |   marker_to_process = c("CD3", "CD4", "CD8", "CD45RA"), 
146 |   trimodal_marker = c("CD4", "CD45RA"), 
147 |   positive_peak = list(ADT = "CD3", sample = "buus_2021_T"),
148 |   save_fig = TRUE
149 | )
150 | ```
151 | 
152 | 
153 | Basic parameters introduction. The full parameter explanation for the ```ADTnorm``` function can be found at [Reference - ADTnorm](https://yezhengstat.github.io/ADTnorm/reference/ADTnorm.html). 
154 | 
155 | ```
156 | cell_x_adt:         Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.
157 | 
158 | cell_x_feature:     Matrix of cells (rows) by cell features (columns) such as sample, batch, and cell type-related information. Please note "sample" column is mandatory and should be the smallest unit to group the cells. At this resolution, ADTnorm will identify peaks and valleys to implement normalization. Please ensure the samples have different names across batches/conditions/studies. "batch" column is optional. It can be batches/conditions/studies/etc, that group the samples based on whether the samples are collected from the same batch run or experiment. This column is needed if the ```multi_sample_per_batch``` parameter is turned on to remove outlier positive peaks per batch or ```detect_outlier_valley``` for detecting and imputing outlier valleys per batch. If the "batch" column is not provided, it will be set as the same as the "sample" column. In the intermediate density plots that ADTnorm provides, density plots will be colored by the "batch" column.
159 | 
160 | save_outpath:       The path to save the results.
161 | 
162 | study_name:         Name of this run.
163 | 
164 | marker_to_process:  Markers to normalize. Leave empty to process all the ADT markers in the cell_x_adt matrix.
165 | 
166 | bimodal_marker:     Specify ADT markers that are likely to have two peaks based on researchers' prior knowledge or preliminary observation of the particular data to be processed. Leaving it as default, ADTnorm will try to find the bimodal peak in all markers that are not listed in `trimodal_marker.`
167 | 
168 | trimodal_marker:    Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation of the particular data to be processed.
169 | 
170 | positive_peak:      A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells, the only CD3 peak should be aligned to the positive peaks of other samples.
171 | 
172 | save_fig:  Save the density plot figure for checking the peak and valley location detection.
173 | ```
174 | 
175 | **For more detailed and typical parameter tuning examples, please visit [tutorial website](https://yezhengstat.github.io/ADTnorm/articles/ADTnorm-tutorial.html). We will illustrate using the demo data.**
176 | 
177 | 
178 | ## Results
179 | 
180 | ```ADTnorm``` function will generate a matrix of rows of the same number as input ```cell_x_adt``` row number and columns are ADT markers specified in ```marker_to_process```. The value in the matrix is normalized value by ADTnorm. In the `save_outpath` specified by the users, there will be two subfolders, `figures` and `RDS`, containing the intermediate object and density plot of detected peak and valley landmarks before and after ADTnorm. Those figures can be used to check whether certain ADT markers need further parameter tuning.  
181 | 
182 | ### Case 1. Consider one study as a sample and normalize across studies.
183 | 
184 | #### Arcsinh Transformation on Raw Counts 
185 | 
186 | <img src="./man/figures/RawCount.png" alt="Arcsinh Transformation" width="700px">
187 | 
188 | #### ADTnorm Counts
189 | 
190 | <img src="./man/figures/ADTnorm.png" alt="ADTnorm Normalization" width="700px">
191 | 
192 | ### Case 2. Consider each healthy donor/patient per time point/condition/response/etc as one sample and normalize across the individual sample. 
193 | 
194 | 
195 | #### Arcsinh Transformation on Raw Counts 
196 | 
197 | Color-coded by studies as batches.
198 | 
199 | <img src="./man/figures/PublicData_samplelevel_raw.png" alt="Arcsinh Transformation" width="1000px">
200 | 
201 | #### ADTnorm Counts
202 | 
203 | <img src="./man/figures/PublicData_samplelevel_adtnorm.png" alt="ADTnorm Normalization" width="1000px">
204 | 
205 | ## Manual Adjustment of Landmark Locations by R Shiny
206 | 
207 | ```customize_landmark```: By setting it to TRUE, ADTnorm will trigger the interactive landmark tuning function and pop out a shiny application for the user's manual setting of peak and valley locations. The procedure for adjusting the landmarks (peaks and valleys) is below.
208 | 
209 | <img src="./man/figures/ShinyR.png" alt="ShinyR" width="1000px">
210 | 
211 | Please note:
212 | 
213 | - We recommend using this function after initial rounds of ADTnorm normalization with a few parameter tuning attempts. It is better to narrow down a few ADT markers that need manual tuning and provide the list to ```marker_to_process``` as the interactive function will pop out for every marker being processed. 
214 | 
215 | - If zigzag discrete negative peaks are observed, users can first increase the "Bandwidth for Density Visualization" at the top of the right panel to smooth out the discrete negative peaks before setting the landmarks.
216 | 
217 | - Currently, the shiny browser support setting any landmark (peaks or valleys) to NA as missing. However, it does not support inserting new landmark(s). For example, if the marker density distribution shows a triple peak pattern but ADTnorm only detects two peaks across all the samples. Shiny browser does not allow manual insertion of a new peak and valley, but the user can tune the other parameters to push ADTnorm to detect three peaks: specify the target marker as ```trimodal_marker```, reducing the ```bw_smallest_tri``` or setting smaller bandwidth value and specify for the target ADT marker through ```bw_smallest_adjustments```.
218 | 
219 | **For more detailed and typical parameter tuning examples, please visit [tutorial website](https://yezhengstat.github.io/ADTnorm/articles/ADTnorm-tutorial.html). We will illustrate using the demo data.**
220 | 
221 | ## Contact for questions, discussions, or potential collaborations
222 | 
223 | [Ye Zheng](https://yezhengstat.github.io/)
224 | 
225 | Email: yzheng23@fredhutch.org
226 | 
227 | Twitter: @yezhengSTAT
228 | 


--------------------------------------------------------------------------------
/data/cell_x_adt.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/data/cell_x_adt.rda


--------------------------------------------------------------------------------
/data/cell_x_feature.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/data/cell_x_feature.rda


--------------------------------------------------------------------------------
/inst/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM bioconductor/bioconductor_docker:latest
2 | ADD . ~/ADTnorm/
3 | WORKDIR ~/
4 | RUN R -q -e 'install.packages("devtools")'
5 | RUN R -q -e 'devtools::install("./ADTnorm", dependencies = TRUE)'
6 | WORKDIR /home/rstudio
7 | 


--------------------------------------------------------------------------------
/man/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/.DS_Store


--------------------------------------------------------------------------------
/man/ADTnorm.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/ADTnorm.R
  3 | \name{ADTnorm}
  4 | \alias{ADTnorm}
  5 | \title{ADTnorm normalization to remove the technical variations across samples for each ADT marker.}
  6 | \usage{
  7 | ADTnorm(
  8 |   cell_x_adt = NULL,
  9 |   cell_x_feature = NULL,
 10 |   save_outpath = NULL,
 11 |   study_name = "ADTnorm",
 12 |   marker_to_process = NULL,
 13 |   exclude_zeroes = FALSE,
 14 |   bimodal_marker = NULL,
 15 |   trimodal_marker = NULL,
 16 |   positive_peak = NULL,
 17 |   bw_smallest_bi = 1.1,
 18 |   bw_smallest_tri = 0.8,
 19 |   bw_smallest_adjustments = list(CD3 = 0.8, CD4 = 0.8, CD8 = 0.8),
 20 |   quantile_clip = 1,
 21 |   peak_type = "mode",
 22 |   multi_sample_per_batch = FALSE,
 23 |   shoulder_valley = TRUE,
 24 |   shoulder_valley_slope = -0.5,
 25 |   valley_density_adjust = 3,
 26 |   landmark_align_type = "negPeak_valley_posPeak",
 27 |   midpoint_type = "valley",
 28 |   neg_candidate_thres = NULL,
 29 |   lower_peak_thres = 0.001,
 30 |   brewer_palettes = "Set1",
 31 |   save_landmark = FALSE,
 32 |   save_fig = TRUE,
 33 |   detect_outlier_valley = FALSE,
 34 |   target_landmark_location = NULL,
 35 |   clean_adt_name = FALSE,
 36 |   customize_landmark = FALSE,
 37 |   override_landmark = NULL,
 38 |   verbose = FALSE
 39 | )
 40 | }
 41 | \arguments{
 42 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. By default, ADTnorm expects raw counts as input data and arcsinh transformation to be performed by ADTnorm internally. If ADTnorm detects that the input count matrix is a non-integer matrix, it will skip the arcsinh transformation. Therefore, users also need to tune the parameters to fit their input transformation.}
 43 | 
 44 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as sample, batch, or other cell-type related information. Please ensure that the cell_x_feature matrix at least contains a sample column with the exact "sample" column name. Please note that "sample" should be the smallest unit to group the cells. At this resolution, ADTnorm will identify peaks and valleys to implement normalization. Please ensure the samples have different names across batches/conditions/studies. "batch" column is optional. It can be batches/conditions/studies etc, that group the samples based on whether the samples are collected from the same batch run or experiment. This column is needed if \code{multi_sample_per_batch} parameter is turned on to remove outlier positive peaks per batch or \code{detect_outlier_valley} for detecting and imputing outlier valleys per batch. If "batch" column is not provided, it will be set as the same as "sample" column. In the intermediate density plots that ADTnorm provides, density plots will be colored by the "batch" column.}
 45 | 
 46 | \item{save_outpath}{The path to save the results.}
 47 | 
 48 | \item{study_name}{Name of this run.}
 49 | 
 50 | \item{marker_to_process}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.}
 51 | 
 52 | \item{exclude_zeroes}{Indicator to consider zeros as NA, i.e., missing values. Recommend TRUE if zeroes in the data represent dropout, likely for large ADT panels, big datasets, or under-sequenced data. The default is FALSE.}
 53 | 
 54 | \item{bimodal_marker}{Specify ADT markers that are likely to have two peaks based on researchers' prior knowledge or preliminary observation of particular data to be processed. Leaving it as default, ADTnorm will try to find the bimodal peak in all markers that are not listed in \code{trimodal_marker.}}
 55 | 
 56 | \item{trimodal_marker}{Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation on particular data to be processed.}
 57 | 
 58 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.}
 59 | 
 60 | \item{bw_smallest_bi}{The smallest bandwidth parameter value for bi-modal peaks. Recommend 1.1.}
 61 | 
 62 | \item{bw_smallest_tri}{The smallest bandwidth parameter value for tri-modal peaks. Recommend the same value for CD4, such as 0.5.}
 63 | 
 64 | \item{bw_smallest_adjustments}{A named list of floats, with names matching marker names, specifying the smallest bandwidth parameter value. The default value is bw_smallest_adjustments = list(CD3 = 0.8, CD4 = 0.8, CD8 = 0.8). Recommend 0.5 or 0.8 for the common multi-modal marker.}
 65 | 
 66 | \item{quantile_clip}{Implement an upper quantile clipping to avoid warping function errors caused by outlier measurements of extremely high expression. Provide the quantile threshold to remove outlier points above such a quantile. The default is 1, meaning no filtering. 0.99 means 99th quantile and points above 99th quantile will be discard.}
 67 | 
 68 | \item{peak_type}{The type of peak to be detected. Select from "midpoint" for setting the peak landmark to the midpoint of the peak region being detected or "mode" for setting the peak landmark to the mode location of the peak. "midpoint" can generally be more robust across samples and less impacted by the bandwidth. "mode" can be more accurate in determining the peak location if the bandwidth is generally ideal for the target marker. The default is "mode".}
 69 | 
 70 | \item{multi_sample_per_batch}{Set it to TRUE to discard the positive peak that only appears in one sample per batch (sample number is >=3 per batch).}
 71 | 
 72 | \item{shoulder_valley}{Indicator to specify whether a shoulder valley is expected in case of the heavy right tail where the population of cells should be considered as a positive population. The default is TRUE.}
 73 | 
 74 | \item{shoulder_valley_slope}{The slope on the ADT marker density distribution to call shoulder valley. Default is -0.5}
 75 | 
 76 | \item{valley_density_adjust}{Parameter for \code{density} function: bandwidth used is adjust*bw. This makes it easy to specify values like 'half the default' bandwidth. The default is 3.}
 77 | 
 78 | \item{landmark_align_type}{Algin the peak and valleys using one of the "negPeak", "negPeak_valley", "negPeak_valley_posPeak", and "valley" alignment modes. The default is "negPeak_valley_posPeak".}
 79 | 
 80 | \item{midpoint_type}{Fill in the missing first valley by the midpoint of two positive peaks ("midpoint") or impute by other valleys ("valley"). The default is "valley".}
 81 | 
 82 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large. The default is asinh(8/5+1) for raw count input. This filtering will be disabled if the input is not raw count data.}
 83 | 
 84 | \item{lower_peak_thres}{The minimal ADT marker density height of calling it a real peak. Set it to 0.01 to avoid a suspicious positive peak. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19. The default is 0.001.}
 85 | 
 86 | \item{brewer_palettes}{Set the color scheme of the color brewer. The default is "Set1".}
 87 | 
 88 | \item{save_landmark}{Save the peak and valley locations in rds format. The default is FALSE.}
 89 | 
 90 | \item{save_fig}{Save the density plot figure for checking the peak and valley location detection. We highly recommend checking the intermediate peak and valley locations identification on those density plots to visually check if the detection is accurate and if manual tuning is needed. The default is TRUE.}
 91 | 
 92 | \item{detect_outlier_valley}{Detect the outlier valley within each batch of samples and impute by the neighbor samples' valley location. For outlier detection methods, choose from "MAD" (Median Absolute Deviation) or "IQR" (InterQuartile Range). Recommend trying "MAD" first if needed. The default is FALSE.}
 93 | 
 94 | \item{target_landmark_location}{Align the landmarks to a fixed location or, by default, align to the mean across samples for each landmark. The default value is NULL. Setting it to "fixed" will align the negative peak to 1 and the right-most positive peak to 5. Users can also assign a two-element vector indicating the location of the negative and most positive peaks to be aligned.}
 95 | 
 96 | \item{clean_adt_name}{Clean the ADT marker name. The default is FALSE.}
 97 | 
 98 | \item{customize_landmark}{By setting it to be TRUE, ADTnorm will trigger the interactive landmark tuning function and pop out a shiny application for the user's manual setting of the peaks and valleys location. We recommend using this function after initial rounds of ADTnorm normalization with a few parameter tuning attempts. It is better to narrow down a few ADT markers that need manual tuning and provide the list to marker_to_process, as the interactive function will pop out for every marker being processed. The default is FALSE.}
 99 | 
100 | \item{override_landmark}{Override the peak and valley locations if prior information is available or the user wants to manually adjust the peak and valley locations for certain markers. Input is in the format of list, i.e., list(CD3 = list(peak_landmark_list = customized_peak_landmark_list, valley_landmark_list = customized_valley_landmark_list), CD4 = list(peak_landmark_list = customized_peak_landmark_list, valley_landmark_list = customized_valley_landmark_list)). "customized_peak_landmark_list" and "customized_valley_landmark_list" are matrices of customized landmark locations with matching sample names as the rownames. The default is NULL.}
101 | 
102 | \item{verbose}{Set the verbosity of the function. The default is FALSE.}
103 | }
104 | \value{
105 | A data frame list containing normalized count for the ADT markers specified to be normalized. To output the peak and valley locations before and after ADTnorm normalization, please set \code{save_landmark} to TRUE, and the landmarks will be saved as an rds file in the "save_outpath" directory.
106 | }
107 | \description{
108 | This function removes the technical variations such as batch effect, sequencing depth biases, antibody selection differences and antibody concentration differences, etc. The normalized samples are ready for integration across studies.
109 | }
110 | \examples{
111 | \dontrun{
112 | ADTnorm(
113 |   cell_x_adt = cell_x_adt,
114 |   cell_x_feature = cell_x_feature,
115 |   save_outpath = save_outpath,
116 |   study_name = study_name,
117 |   marker_to_process = c("CD3", "CD4", "CD8")
118 |  )
119 | }
120 | }
121 | 


--------------------------------------------------------------------------------
/man/arcsinh_transform.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/arcsinh_transform.R
 3 | \name{arcsinh_transform}
 4 | \alias{arcsinh_transform}
 5 | \title{arcsinh transformation.}
 6 | \usage{
 7 | arcsinh_transform(cell_x_adt = NULL, parameter_list = NULL)
 8 | }
 9 | \arguments{
10 | \item{cell_x_adt}{Matrix where rows are cells and columns are ADT markers.}
11 | 
12 | \item{parameter_list}{Parameter list for a: positive double that corresponds to a shift about 0; b: positive double that corresponds to a scale factor; c: positive double. By default a = 1, b = 1/5 and c = 0.}
13 | }
14 | \description{
15 | This function transforms the input cell_x_adt matrix by arcsinh with co-factor 5. The definition of this function is x_new <- asinh(a + b * x) + c)
16 | }
17 | \examples{
18 | \dontrun{
19 | arcsinh_transform(cell_x_adt)
20 | }
21 | }
22 | 


--------------------------------------------------------------------------------
/man/cell_x_adt.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{cell_x_adt}
 5 | \alias{cell_x_adt}
 6 | \title{A matrix of raw count for the cell by ADT markers}
 7 | \format{
 8 | A data frame with 422682 rows and 9 variables:
 9 | \describe{
10 | \item{CD3}{CD3 ADT marker raw count across each cell}
11 | \item{CD4}{CD4 ADT marker raw count across each cell}
12 | \item{CD8}{CD8 ADT marker raw count across each cell}
13 | \item{CD14}{CD14 ADT marker raw count across each cell}
14 | \item{CD19}{CD19 ADT marker raw count across each cell}
15 | \item{CD25}{CD25 ADT marker raw count across each cell}
16 | \item{CD45RA}{CD45RA ADT marker raw count across each cell}
17 | \item{CD56}{CD56 ADT marker raw count across each cell}
18 | \item{CD127}{CD127 ADT marker raw count across each cell}
19 | }
20 | }
21 | \source{
22 | See detailed description in the manuscript
23 | }
24 | \usage{
25 | cell_x_adt
26 | }
27 | \description{
28 | A dataset containing 422682 cells and 9 ADT markers for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets.
29 | }
30 | \keyword{datasets}
31 | 


--------------------------------------------------------------------------------
/man/cell_x_feature.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{cell_x_feature}
 5 | \alias{cell_x_feature}
 6 | \title{A matrix of raw count for the cell by features}
 7 | \format{
 8 | A data frame with 422682 rows and 7 variables:
 9 | \describe{
10 | \item{sample}{Sample name. In this demo data, the sample name is the same as the study_name, assuming that one study is one sample.}
11 | \item{batch}{Batch ID. In this demo data, the batch ID is the same as the study_name.}
12 | \item{sample_status}{Sample status, i.e., Healthy, MALTtumor, HIV Vaccine, Lupus, B-ALL, AML.}
13 | \item{study_name}{Name of the data set/study.}
14 | \item{ADTseqDepth}{Total UMI per cell.}
15 | \item{cell_type_l1}{Broad level of cell type annotation using manual gating.}
16 | \item{cell_type_l2}{Fine level of cell type annotation using manual gating.}
17 | }
18 | }
19 | \source{
20 | See detailed description in the manuscript
21 | }
22 | \usage{
23 | cell_x_feature
24 | }
25 | \description{
26 | A dataset containing 422682 cells and 7 feature categories for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets.
27 | }
28 | \keyword{datasets}
29 | 


--------------------------------------------------------------------------------
/man/clean_adt_name.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/clean_adt_name.R
 3 | \name{clean_adt_name}
 4 | \alias{clean_adt_name}
 5 | \title{Clean ADT marker name}
 6 | \usage{
 7 | clean_adt_name(adt_name)
 8 | }
 9 | \arguments{
10 | \item{adt_name}{A vector of ADT marker name.}
11 | }
12 | \description{
13 | This function enables the general cleaning of ADT marker names. Regardless, users should try cleaning and unifying their ADT marker name first. Please also ensure there is no "/" in the ADT name, such as "TCRγ/δ".
14 | }
15 | \examples{
16 | \dontrun{
17 | clean_adt_name(colnames(cell_x_adt))
18 | }
19 | }
20 | 


--------------------------------------------------------------------------------
/man/detect_impute_outlier_valley.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/detect_impute_outlier_valley.R
 3 | \name{detect_impute_outlier_valley}
 4 | \alias{detect_impute_outlier_valley}
 5 | \title{Identify the valley outliers and impute by valley by closet neighbors.}
 6 | \usage{
 7 | detect_impute_outlier_valley(
 8 |   valley_location_res,
 9 |   adt_marker_select,
10 |   cell_x_adt,
11 |   cell_x_feature,
12 |   scale = 3,
13 |   method = "MAD",
14 |   nearest_neighbor_n = 3,
15 |   nearest_neighbor_threshold = 0.75
16 | )
17 | }
18 | \arguments{
19 | \item{valley_location_res}{Matrix of valley landmark locations with rows being samples and columns being the valleys.}
20 | 
21 | \item{adt_marker_select}{The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker.}
22 | 
23 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.}
24 | 
25 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
26 | 
27 | \item{scale}{Scale level to defining outlier. A larger scale value corresponds to more severe outliers.}
28 | 
29 | \item{method}{Outlier detection methods, choose from "MAD" (Median Absolute Deviation) or "IQR" (InterQuartile Range). The default is MAD.}
30 | 
31 | \item{nearest_neighbor_n}{Number of top nearest neighbor samples to detect.}
32 | 
33 | \item{nearest_neighbor_threshold}{Threshold to call neighbor samples.}
34 | }
35 | \description{
36 | This function identifies the valley(s) that tend to be outliers compared to other valley locations and tries to find the closest samples with similar density distribution to impute the valley. If no neighbor sample is detected, the valley will remain as original.
37 | }
38 | \examples{
39 | \dontrun{
40 | detect_impute_outlier_valley(valley_location_res, cell_x_feature)
41 | }
42 | }
43 | 


--------------------------------------------------------------------------------
/man/figures/ADTnorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/ADTnorm.png


--------------------------------------------------------------------------------
/man/figures/ArcsinhTransform_log10_CD4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/ArcsinhTransform_log10_CD4.png


--------------------------------------------------------------------------------
/man/figures/PublicData_samplelevel_adtnorm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/PublicData_samplelevel_adtnorm.png


--------------------------------------------------------------------------------
/man/figures/PublicData_samplelevel_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/PublicData_samplelevel_raw.png


--------------------------------------------------------------------------------
/man/figures/RawCount.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/RawCount.png


--------------------------------------------------------------------------------
/man/figures/ShinyR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/ShinyR.png


--------------------------------------------------------------------------------
/man/figures/lower_peak_thres.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/lower_peak_thres.png


--------------------------------------------------------------------------------
/man/figures/multi_sample_per_batch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/multi_sample_per_batch.png


--------------------------------------------------------------------------------
/man/figures/peak_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/peak_type.png


--------------------------------------------------------------------------------
/man/figures/pipeline_202208.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/pipeline_202208.png


--------------------------------------------------------------------------------
/man/figures/shoulder_valley.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/shoulder_valley.png


--------------------------------------------------------------------------------
/man/figures/shoulder_valley_CD3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/shoulder_valley_CD3.png


--------------------------------------------------------------------------------
/man/figures/target_landmark_location.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/target_landmark_location.png


--------------------------------------------------------------------------------
/man/figures/valley_density_adjust.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/valley_density_adjust.png


--------------------------------------------------------------------------------
/man/get_customize_landmark.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_customize_landmark.R
 3 | \name{get_customize_landmark}
 4 | \alias{get_customize_landmark}
 5 | \title{Prompt Shiny browser to manually customize peaks and valleys locations.}
 6 | \usage{
 7 | get_customize_landmark(
 8 |   cell_x_adt_sample,
 9 |   landmark_pos,
10 |   bw,
11 |   adt_marker_select_name,
12 |   brewer_palettes = "Set1"
13 | )
14 | }
15 | \arguments{
16 | \item{cell_x_adt_sample}{Matrix of ADT counts of the selected marker, with columns of sample and batch information for each row of cells.}
17 | 
18 | \item{landmark_pos}{Matrix of landmark location including peaks and valleys.}
19 | 
20 | \item{bw}{Bandwidth for the density plot.}
21 | 
22 | \item{adt_marker_select_name}{The ADT marker needed to be manually processed to set the landmarks.}
23 | 
24 | \item{brewer_palettes}{Set the color scheme of the color brewer. The default is "Set1".}
25 | }
26 | \description{
27 | This function will launch a shiny app allowing the user to set the location of peaks and valleys manually. The function will output the landmark positions that the user has set.
28 | }
29 | \examples{
30 | \dontrun{
31 | get_customize_landmark(
32 | cell_x_adt_sample, 
33 | landmark_pos, 
34 | bw, 
35 | adt_marker_select_name, 
36 | brewer_palettes = "Set1"
37 | )}
38 | }
39 | 


--------------------------------------------------------------------------------
/man/get_neighbors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_neighbors.R
 3 | \name{get_neighbors}
 4 | \alias{get_neighbors}
 5 | \title{Find the closest neighbors to impute outlier valleys.}
 6 | \usage{
 7 | get_neighbors(
 8 |   target_sample,
 9 |   adt_marker_select,
10 |   cell_x_adt,
11 |   cell_x_feature,
12 |   nearest_neighbor_n = 3,
13 |   nearest_neighbor_threshold = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{target_sample}{The target sample whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample.}
18 | 
19 | \item{adt_marker_select}{The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker.}
20 | 
21 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.}
22 | 
23 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
24 | 
25 | \item{nearest_neighbor_n}{Number of top nearest neighbor samples to detect.}
26 | 
27 | \item{nearest_neighbor_threshold}{Threshold to call neighbor samples.}
28 | }
29 | \description{
30 | This function identifies the valley that tends to be outliers compared to other valley locations and tries to find the closest samples that have similar density distribution to input the valley. If no neighbor sample is detected, the valley will remain as original.
31 | }
32 | \examples{
33 | \dontrun{
34 | get_neighbors(target_sample, adt_marker_select, cell_x_adt, cell_x_feature)
35 | }
36 | }
37 | 


--------------------------------------------------------------------------------
/man/get_peak_midpoint.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_peak_midpoint.R
 3 | \name{get_peak_midpoint}
 4 | \alias{get_peak_midpoint}
 5 | \title{Get the peak landmarks location using the peak region midpoint}
 6 | \usage{
 7 | get_peak_midpoint(
 8 |   cell_x_adt = NULL,
 9 |   cell_x_feature = NULL,
10 |   adt_marker_select = NULL,
11 |   adt_marker_index = NULL,
12 |   bwFac_smallest = 1.1,
13 |   bimodal_marker_index = NULL,
14 |   trimodal_marker_index = NULL,
15 |   positive_peak = NULL,
16 |   neg_candidate_thres = asinh(10/5 + 1),
17 |   lower_peak_thres = 0.001,
18 |   arcsinh_transform_flag = TRUE
19 | )
20 | }
21 | \arguments{
22 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.}
23 | 
24 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
25 | 
26 | \item{adt_marker_select}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.}
27 | 
28 | \item{adt_marker_index}{Index of the ADT markers that will be normalized. Leaving empty if adt_marker_select is not specified.}
29 | 
30 | \item{bwFac_smallest}{The smallest bandwidth parameter value. Recommend 1.1 for general bi-modal ADT markers except for CD3, CD4, and CD8.}
31 | 
32 | \item{bimodal_marker_index}{Index of the ADT markers that tend to have two peaks based on researchers' prior knowledge or preliminary observation on particular data to be processed.}
33 | 
34 | \item{trimodal_marker_index}{Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation on particular data to be processed.}
35 | 
36 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.}
37 | 
38 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.}
39 | 
40 | \item{lower_peak_thres}{The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid a suspicious positive peak. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19.}
41 | 
42 | \item{arcsinh_transform_flag}{The flag indicates if the input is raw count and arcsinh transformation is implemented.}
43 | }
44 | \description{
45 | This function detects the peak landmark locations for each sample per ADT markers based on the midpoint of the peak region. Using the peak midpoint instead of the peak mode can be more stable across samples and less affected by the bandwidth.
46 | }
47 | \examples{
48 | \dontrun{
49 | get_peak_midpoint(
50 |   cell_x_adt = cell_x_adt,
51 |   cell_x_feature = cell_x_feature,
52 |   adt_marker_select = "CD3",
53 |   neg_candidate_thres = asinh(6/5+1)
54 | )
55 | }
56 | }
57 | 


--------------------------------------------------------------------------------
/man/get_peak_mode.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_peak_mode.R
 3 | \name{get_peak_mode}
 4 | \alias{get_peak_mode}
 5 | \title{Get the peak landmarks location using the peak mode}
 6 | \usage{
 7 | get_peak_mode(
 8 |   cell_x_adt = NULL,
 9 |   cell_x_feature = NULL,
10 |   adt_marker_select = NULL,
11 |   adt_marker_index = NULL,
12 |   bwFac_smallest = 1.1,
13 |   bimodal_marker_index = NULL,
14 |   trimodal_marker_index = NULL,
15 |   positive_peak = NULL,
16 |   neg_candidate_thres = asinh(10/5 + 1),
17 |   lower_peak_thres = 0.001,
18 |   arcsinh_transform_flag = TRUE
19 | )
20 | }
21 | \arguments{
22 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.}
23 | 
24 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
25 | 
26 | \item{adt_marker_select}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.}
27 | 
28 | \item{adt_marker_index}{Index of the ADT markers that will be normalized. Leaving empty if adt_marker_select is not specified.}
29 | 
30 | \item{bwFac_smallest}{The smallest bandwidth parameter value. Recommend 1.1 for general bi-modal ADT markers except for CD3, CD4, and CD8.}
31 | 
32 | \item{bimodal_marker_index}{Index of the ADT markers that tend to have two peaks based on researchers' prior knowledge or preliminary observation on particular data to be processed.}
33 | 
34 | \item{trimodal_marker_index}{Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation on particular data to be processed.}
35 | 
36 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.}
37 | 
38 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.}
39 | 
40 | \item{lower_peak_thres}{The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid suspicious positive peaks. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19.}
41 | 
42 | \item{arcsinh_transform_flag}{The flag indicates if the input is raw count and arcsinh transformation is implemented.}
43 | }
44 | \description{
45 | This function detects the peak mode locations for each sample per ADT markers. Using the peak mode instead of the peak region midpoint can be more accurate in determining the peak location if the bandwidth is generally proper and the local peak density is not too discrete.
46 | }
47 | \examples{
48 | \dontrun{
49 | get_peak_mode(
50 |   cell_x_adt = cell_x_adt,
51 |   cell_x_feature = cell_x_feature,
52 |   adt_marker_select = "CD3",
53 |   neg_candidate_thres = asinh(6/5+1)
54 | )
55 | }
56 | }
57 | 


--------------------------------------------------------------------------------
/man/get_valley_location.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_valley_location.R
 3 | \name{get_valley_location}
 4 | \alias{get_valley_location}
 5 | \title{Get the valley landmark locations.}
 6 | \usage{
 7 | get_valley_location(
 8 |   cell_x_adt = NULL,
 9 |   cell_x_feature = NULL,
10 |   adt_marker_select = NULL,
11 |   peak_mode_res = NULL,
12 |   shoulder_valley = TRUE,
13 |   positive_peak = NULL,
14 |   multi_sample_per_batch = FALSE,
15 |   adjust = 1.5,
16 |   min_fc = 20,
17 |   shoulder_valley_slope = -1,
18 |   lower_peak_thres = 0.01,
19 |   neg_candidate_thres = asinh(10/5 + 1),
20 |   arcsinh_transform_flag = TRUE
21 | )
22 | }
23 | \arguments{
24 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.}
25 | 
26 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
27 | 
28 | \item{adt_marker_select}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.}
29 | 
30 | \item{peak_mode_res}{The peak landmark results coming out of \code{get_peak_mode} or \code{get_peak_midpoint} function.}
31 | 
32 | \item{shoulder_valley}{Indictor to specify whether a shoulder valley is expected in case of the heavy right tail where the population of cells should be considered as positive population.}
33 | 
34 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.}
35 | 
36 | \item{multi_sample_per_batch}{Set it to TRUE to discard the positive peak that only appears in one sample per batch (sample number is >=3 per batch).}
37 | 
38 | \item{adjust}{Parameter for \code{density} function: bandwidth used is actually adjust*bw. This makes it easy to specify values like 'half the default' bandwidth.}
39 | 
40 | \item{min_fc}{Mimimal fold change between the highest peak density height and candidate valley density height. The default is 20.}
41 | 
42 | \item{shoulder_valley_slope}{The slope on the ADT marker density distribution to call shoulder valley.}
43 | 
44 | \item{lower_peak_thres}{The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid suspicious positive peaks. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19.}
45 | 
46 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.}
47 | 
48 | \item{arcsinh_transform_flag}{The flag indicates if the input is raw count and arcsinh transformation is implemented.}
49 | }
50 | \description{
51 | This function detects the valley locations either between every two peak landmarks or cut at the right heavy tails. If the specified positive uni-peak, the valley location will be set at the left side of the uni-peak.
52 | }
53 | \examples{
54 | \dontrun{
55 | get_valley_location(cell_x_adt, cell_x_feature, peak_mode_res)
56 | }
57 | }
58 | 


--------------------------------------------------------------------------------
/man/landmark_fill_na.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/landmark_fill_na.R
 3 | \name{landmark_fill_na}
 4 | \alias{landmark_fill_na}
 5 | \title{Merge locations of peak and valley landmarks}
 6 | \usage{
 7 | landmark_fill_na(
 8 |   peak_landmark_list = NULL,
 9 |   valley_landmark_list = NULL,
10 |   landmark_align_type = NULL,
11 |   midpoint_type = "valley",
12 |   neg_candidate_thres = asinh(10/5 + 1)
13 | )
14 | }
15 | \arguments{
16 | \item{peak_landmark_list}{Matrix of peak landmark detection results. Rows are samples, and column(s) are the peak locations.}
17 | 
18 | \item{valley_landmark_list}{Matrix of valley landmark detection results. Rows are samples, and column(s) are the valley locations.}
19 | 
20 | \item{landmark_align_type}{Algin the peak and valleys using one of the "negPeak", "negPeak_valley", "negPeak_valley_posPeak", and "valley" alignment modes.}
21 | 
22 | \item{midpoint_type}{Fill in the missing first valley by the midpoint of two positive peaks ("midpoint") or impute by other valleys ("valley").}
23 | 
24 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.}
25 | }
26 | \description{
27 | This function merges the peak and valley landmarks locations and fills in NA if the landmark is not detected.
28 | }
29 | \examples{
30 | \dontrun{
31 | landmark_fill_na(
32 |   peak_landmark_list = peak_landmark_list,
33 |   valley_landmark_list = valley_landmark_list,
34 |   landmark_align_type = "negPeak_valley_posPeak"
35 | )
36 | }
37 | }
38 | 


--------------------------------------------------------------------------------
/man/load_landmarks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/load_landmarks.R
 3 | \name{load_landmarks}
 4 | \alias{load_landmarks}
 5 | \title{Load saved landmarks into a list of matrices, as required for input into "override_landmark".}
 6 | \usage{
 7 | load_landmarks(dir, append_rds = TRUE)
 8 | }
 9 | \arguments{
10 | \item{dir}{Directory where landmark location .rds files are stored}
11 | 
12 | \item{append_rds}{Whether to append "/RDS" to the provided directory path. The default is TRUE, which aligns with ADTnorm's default save location.}
13 | }
14 | \description{
15 | Load saved landmarks into a list of matrices, as required for input into "override_landmark".
16 | }
17 | \examples{
18 | \dontrun{
19 | override_landmark = load_landmarks(dir = save_outpath)
20 | }
21 | }
22 | 


--------------------------------------------------------------------------------
/man/peak_alignment.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/peak_alignment.R
 3 | \name{peak_alignment}
 4 | \alias{peak_alignment}
 5 | \title{Align the peak and valley landmarks by the warpset function}
 6 | \usage{
 7 | peak_alignment(
 8 |   cell_x_adt,
 9 |   cell_x_feature = NULL,
10 |   landmark_matrix = NULL,
11 |   target_landmark = NULL
12 | )
13 | }
14 | \arguments{
15 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.}
16 | 
17 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
18 | 
19 | \item{landmark_matrix}{Matrix of peak and valley landmarks after filling in NA using the \code{landmark_fill_na} function.}
20 | 
21 | \item{target_landmark}{Leave it as NULL to align the landmark to the mean location across samples. Denote it by a vector of the same length as the column number of the landmark to align the negative peak, valley, and positive peak(s) to the specified fixed location.}
22 | }
23 | \description{
24 | This function monotonously transforms the ADT marker counts to align the landmarks detected in previous steps. By aligning the landmarks, ADTnorm removes the batch effect and allows integration across batches/studies.
25 | }
26 | \examples{
27 | \dontrun{
28 | peak_alignment(cell_x_adt, cell_x_feature, landmark_matrix)
29 | }
30 | }
31 | 


--------------------------------------------------------------------------------
/man/plot_adt_density_each.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_adt_density_each.R
 3 | \name{plot_adt_density_each}
 4 | \alias{plot_adt_density_each}
 5 | \title{Plot the expression density profile for ONE ADT marker}
 6 | \usage{
 7 | plot_adt_density_each(
 8 |   adt_count,
 9 |   cell_x_feature,
10 |   brewer_palettes,
11 |   parameter_list = NULL
12 | )
13 | }
14 | \arguments{
15 | \item{adt_count}{Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format.}
16 | 
17 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
18 | 
19 | \item{brewer_palettes}{Set the color scheme of the color brewer.}
20 | 
21 | \item{parameter_list}{Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.}
22 | }
23 | \description{
24 | This function plots the ADT expression density profile for only one ADT marker. Each track is a sample. Color by batch
25 | }
26 | \examples{
27 | \dontrun{
28 | plot_adt_density_each(
29 |  cell_x_adt,
30 |  cell_x_feature,
31 |  brewer_palettes = "Set1",
32 |  parameter_list = list(bw = 0.1, run_label = "ADTnorm")
33 | )
34 | }
35 | }
36 | 


--------------------------------------------------------------------------------
/man/plot_adt_density_with_peak_valley.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_adt_density_with_peak_valley.R
 3 | \name{plot_adt_density_with_peak_valley}
 4 | \alias{plot_adt_density_with_peak_valley}
 5 | \title{Plot ADT marker expression density profile with identified peak and valley locations}
 6 | \usage{
 7 | plot_adt_density_with_peak_valley(
 8 |   cell_x_adt,
 9 |   cell_x_feature,
10 |   adt_marker_select = NULL,
11 |   peak_landmark_list,
12 |   valley_landmark_list,
13 |   brewer_palettes = "Set1",
14 |   parameter_list = NULL
15 | )
16 | }
17 | \arguments{
18 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.}
19 | 
20 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
21 | 
22 | \item{adt_marker_select}{The target ADT marker(s) that the denstiy plot is about. Leave it NULL will generate figures for all the ADT markers available in cell_x_adt.}
23 | 
24 | \item{peak_landmark_list}{Matrix of peak landmark locations with rows being samples and columns being the peaks.}
25 | 
26 | \item{valley_landmark_list}{Matrix of valley landmark locations with rows being samples and columns being the valleys.}
27 | 
28 | \item{brewer_palettes}{Set the color scheme of the color brewer.}
29 | 
30 | \item{parameter_list}{Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.}
31 | }
32 | \description{
33 | This function plots the ADT expression density profile with identified peak and valley locations. Each panel is an ADT marker, and each track is a sample. Color by batch
34 | }
35 | \examples{
36 | \dontrun{
37 | plot_adt_density_with_peak_valley(
38 |   cell_x_adt,
39 |   cell_x_feature,
40 |   adt_marker_select = c("CD3", "CD4", "CD8", "CD19"),
41 |   peak_landmark_list = peak_mode_norm_res,
42 |   valley_landmark_list = valley_location_norm_res,
43 |   brewer_palettes = "Set1",
44 |   parameter_list = list(bw = 0.1, run_label = "ADTnorm")
45 | )
46 | }
47 | }
48 | 


--------------------------------------------------------------------------------
/man/plot_adt_density_with_peak_valley_each.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_adt_density_with_peak_valley_each.R
 3 | \name{plot_adt_density_with_peak_valley_each}
 4 | \alias{plot_adt_density_with_peak_valley_each}
 5 | \title{Plot the expression density profile for ONE ADT marker with identified peak and valley locations}
 6 | \usage{
 7 | plot_adt_density_with_peak_valley_each(
 8 |   adt_count,
 9 |   cell_x_feature,
10 |   peak_landmark_list,
11 |   valley_landmark_list,
12 |   brewer_palettes = "Set1",
13 |   parameter_list = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{adt_count}{Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format.}
18 | 
19 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.}
20 | 
21 | \item{peak_landmark_list}{Matrix of peak landmark locations with rows being samples and columns being the peaks.}
22 | 
23 | \item{valley_landmark_list}{Matrix of valley landmark locations with rows being samples and columns being the valleys.}
24 | 
25 | \item{brewer_palettes}{Set the color scheme of the color brewer.}
26 | 
27 | \item{parameter_list}{Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.}
28 | }
29 | \description{
30 | This function plots the ADT expression density profile with identified peak and valley locations for only one ADT marker. Each track is a sample. Color by batch
31 | }
32 | \examples{
33 | \dontrun{
34 | plot_adt_density_with_peak_valley_each(
35 |   cell_x_adt,
36 |   cell_x_feature,
37 |   peak_landmark_list = peak_mode_norm_res,
38 |   valley_landmark_list = valley_location_norm_res,
39 |   brewer_palettes = "Set1",
40 |   parameter_list = list(bw = 0.1, run_label = "ADTnorm")
41 | )
42 | }
43 | }
44 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ComparativeAnalysis/WNN_integrateRNA_ADT.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | args = commandArgs(trailingOnly=TRUE)
  3 | 
  4 | library(Seurat)
  5 | library(dplyr)
  6 | library(ggplot2)
  7 | library(httpgd)
  8 | 
  9 | method <- args[1] #"Arcsinh_b5"
 10 | run_name = args[2] #"publicData_CITEseq"
 11 | # run_name = "public13Dataset_CITEseq"
 12 | 
 13 | master_path = "./"
 14 | in_path = "/publicData_CITEseq/data/"
 15 | out_path = paste0(master_path, "manuscript/results/", run_name)
 16 | fig_path = paste0(out_path, "/Figures/")
 17 | 
 18 | ## load data directly
 19 | marker_list = c("CD3", "CD4", "CD8",  "CD14", "CD19", "CD25", "CD45RA", "CD56", "CD127")
 20 | adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", run_name, ".rds"))
 21 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds"))
 22 | 
 23 | ## RPCA integration methods
 24 | # creates a Seurat object based on the CITE-seq data
 25 | rna_data = readRDS(file = './results/publicData_CITEseq/RDS/rna_data_common_RawCount_public13Dataset_CITEseq.rds')
 26 | citeseq_obj = CreateSeuratObject(counts = t(rna_data))
 27 | citeseq_obj = AddMetaData(citeseq_obj, metadata = adt_feature)
 28 | 
 29 | cite_list = SplitObject(citeseq_obj, split.by = "study_name")
 30 | cite_list <- lapply(X = cite_list, FUN = function(x) {
 31 |     x <- NormalizeData(x)
 32 |     x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 5000) ##
 33 | })
 34 | 
 35 | # select features that are repeatedly variable across datasets for integration run PCA on each
 36 | # dataset using these features
 37 | features <- SelectIntegrationFeatures(object.list = cite_list)
 38 | cite_list <- lapply(X = cite_list, FUN = function(x) {
 39 |     x <- ScaleData(x, features = features, verbose = FALSE)
 40 |     x <- RunPCA(x, features = features, verbose = FALSE)
 41 | })
 42 | immune.anchors <- FindIntegrationAnchors(object.list = cite_list, anchor.features = features, reduction = "rpca") ##
 43 | # this command creates an 'integrated' data assay
 44 | immune.combined <- IntegrateData(anchorset = immune.anchors)
 45 | # specify that we will perform downstream analysis on the corrected data note that the
 46 | # original unmodified data still resides in the 'RNA' assay
 47 | DefaultAssay(immune.combined) <- "integrated"
 48 | 
 49 | # Run the standard workflow for visualization and clustering
 50 | immune.combined <- ScaleData(immune.combined, verbose = FALSE)
 51 | immune.combined <- RunPCA(immune.combined, npcs = 100, verbose = FALSE)
 52 | immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:50)
 53 | saveRDS(immune.combined, file = paste0(out_path, "/RDS/citeseq_obj_RPCA_", run_name, ".rds"))
 54 | 
 55 | citeNorm = readRDS(file = paste0(out_path, "/RDS/citeseq_obj_RPCA_", run_name, ".rds"))
 56 | 
 57 | adt_data_adtnorm = readRDS(file = paste0(out_path, "/RDS/adt_data_norm_", method, "_", run_name, ".rds"))
 58 | adt_data_adtnorm[is.na(adt_data_adtnorm)] = 0
 59 | 
 60 | rownames(adt_data_adtnorm) = colnames(citeNorm)
 61 | adt_assay <- CreateAssay5Object(counts = t(adt_data_adtnorm), data = t(adt_data_adtnorm))
 62 | citeNorm[["ADT"]] <- adt_assay
 63 | DefaultAssay(citeNorm) <- 'ADT'
 64 | VariableFeatures(citeNorm) <- rownames(citeNorm[["ADT"]])
 65 | ## set scale.data
 66 | if(method %in% c("fastMNN_study", "fastMNN_sample", "logCPM", "decontPro")){
 67 |     citeNorm = ScaleData(citeNorm)
 68 | }else{
 69 |     citeNorm <- SetAssayData(citeNorm, assay = "ADT", slot = "scale.data", new.data = t(adt_data_adtnorm))
 70 | }
 71 | citeNorm <- RunPCA(citeNorm, reduction.name = 'apca')
 72 | 
 73 | DefaultAssay(citeNorm) <- 'integrated'
 74 | citeNorm <- FindMultiModalNeighbors(
 75 |   citeNorm, reduction.list = list("pca", "apca"), 
 76 |   dims.list = list(1:15, 1:8), modality.weight.name = "RNA.weight", k.nn = 30
 77 | )
 78 | 
 79 | citeNorm <- RunUMAP(citeNorm, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_", min.dist = 0.01)
 80 | citeNorm <- FindClusters(citeNorm, graph.name = "wsnn", algorithm = 1, resolution = 0.15, verbose = FALSE)
 81 | saveRDS(citeNorm, file = paste0(out_path, "/RDS/citeseq_obj_RPCA_citeNorm_WNN_15_8_withCluster_noScalePCA_nneighbor20_mindist0.01_", run_name, "_", method, ".rds"))
 82 | 
 83 | ans_umap = Embeddings(citeNorm, reduction = "wnn.umap") %>% data.frame
 84 | colnames(ans_umap) = c("X1", "X2")
 85 | adt_feature$seurat_clusters = citeNorm@meta.data$seurat_clusters
 86 | 
 87 | 
 88 | pdf(paste0("./manuscript/results/public13Dataset_CITEseq/Figures/WNN/citeseq_obj_RPCA_citeNorm_WNN_15_8_withCluster_noScalePCA_nneighbor20_mindist0.01_", run_name, "_", method, ".pdf"), width = 15, height = 11)
 89 | 
 90 | reindex1 = which(adt_feature$cell_type_l1 == "undefined")
 91 | reindex2 = which(adt_feature$cell_type_l1 != "undefined")
 92 | reindex = c(reindex1, reindex2)
 93 | 
 94 | # print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
 95 | # target_feature = "seurat_clusters", 
 96 | # color_design = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$seurat_clusters %>% unique)),
 97 | # method_label = method
 98 | # )) #+ scale_color_brewer(palette = "Dark2")
 99 | # )
100 | 
101 | 
102 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
103 | target_feature = "sample", 
104 | color_design = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$sample %>% unique)),
105 | method_label = method
106 | )) #+ scale_color_brewer(palette = "Dark2")
107 | )
108 | 
109 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
110 | target_feature = "study_name", 
111 | color_design = colorRampPalette(brewer.pal(8, "Dark2"))(length(adt_feature$study_name %>% unique)),
112 | method_label = method
113 | )))
114 | 
115 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
116 | target_feature = "sample_status", 
117 | color_design = colorRampPalette(brewer.pal(8, "Set1"))(length(adt_feature$sample_status %>% unique)),
118 | method_label = method
119 | )))
120 | 
121 | adt_feature$cell_type_l1 = factor(adt_feature$cell_type_l1, levels = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined"))
122 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
123 | target_feature = "cell_type_l1", 
124 | color_design = c("#B00000", "#3F918B", "#896191", "#FF980A", "#226fa7", "#F781BF", "grey"), 
125 | #color_design = c(colorRampPalette(brewer.pal(8, "Set1"))(6), "grey"),
126 | color_break = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined"),
127 | method_label = method
128 | )))
129 | 
130 | adt_feature$cell_type_l2 = factor(adt_feature$cell_type_l2, levels = c("naive B", "memory B", 
131 |     "naive CD4", "memory CD4", "Treg", 
132 |     "naive CD8", "memory CD8", 
133 |     "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte",
134 |     "CD16- NK", "CD16+ NK", 
135 |     "myeloid DC", "plasmacytoid DC", "undefined"))
136 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
137 | target_feature = "cell_type_l2",
138 | color_design = c(
139 |     "#B00000", "#FF3380", 
140 |     "#9ef954", "#3F918B", "#03500e",
141 |     "#896191", "#350154",  
142 |     "#FF980A", "#A78300", "#DB6400",
143 |     "#1000a0", "#226fa7",
144 |     "#F781BF", "#7B0054", "grey"),
145 | #color_design = c(colorRampPalette(brewer.pal(8, "Dark2"))(14), "grey"),
146 | color_break = c(
147 |     "naive B", "memory B", 
148 |     "naive CD4", "memory CD4", "Treg", 
149 |     "naive CD8", "memory CD8", 
150 |     "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte",
151 |     "CD16- NK", "CD16+ NK", 
152 |     "myeloid DC", "plasmacytoid DC", "undefined"),
153 | method_label = method
154 | )))
155 | 
156 | dev.off()
157 | 
158 | pdf(paste0("./manuscript/results/public13Dataset_CITEseq/Figures/WNN/citeseq_obj_RPCA_citeNorm_WNN_15_8_withCluster_noScalePCA_nneighbor20_mindist0.01_", run_name, "_", method, "_weights.pdf"), width = 18, height = 9)
159 | VlnPlot(citeNorm, features = "ADT.weight", group.by = 'cell_type_l1', sort = TRUE, pt.size = 0) +
160 | scale_fill_manual(values = c("#B00000", "#3F918B", "#896191", "#FF980A", "#226fa7", "#F781BF", "grey"), 
161 | breaks = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined"))
162 | VlnPlot(citeNorm, features = "ADT.weight", group.by = 'cell_type_l2', sort = TRUE, pt.size = 0) +
163 | scale_fill_manual(values = c(
164 |     "#B00000", "#FF3380", 
165 |     "#9ef954", "#3F918B", "#03500e",
166 |     "#896191", "#350154",  
167 |     "#FF980A", "#A78300", "#DB6400",
168 |     "#1000a0", "#226fa7",
169 |     "#F781BF", "#7B0054", "grey"),
170 | breaks = c(
171 |     "naive B", "memory B", 
172 |     "naive CD4", "memory CD4", "Treg", 
173 |     "naive CD8", "memory CD8", 
174 |     "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte",
175 |     "CD16- NK", "CD16+ NK", 
176 |     "myeloid DC", "plasmacytoid DC", "undefined"))
177 | VlnPlot(citeNorm, features = "ADT.weight", group.by = 'seurat_clusters', sort = TRUE, pt.size = 0) +
178 | scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$seurat_clusters %>% unique)))
179 | dev.off()
180 | 
181 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ComparativeAnalysis/evaluate_norm.R:
--------------------------------------------------------------------------------
  1 | ## evaluation using ARI, silhouette score on cell types and batches, LISI on batches
  2 | 
  3 | library(data.table)
  4 | library(gridExtra)
  5 | library(mclust)
  6 | library(cluster)
  7 | library(stringr)
  8 | library(harmony)
  9 | library(Seurat)
 10 | library(pdfCluster)
 11 | library(dplyr)
 12 | library(umap)
 13 | # library(lisi)
 14 | 
 15 | mlgFindNeighbors = function(latent){
 16 |   rownames(latent) = paste0("cell", 1:nrow(latent))
 17 |   NN_graph = FindNeighbors(latent, verbose = FALSE, nn.method = "rann") ## "annoy"
 18 |   G = NN_graph$snn
 19 |   return(G)
 20 | }
 21 | mlgARI = function(embed, cluster_label){ ## louvain clustering using resolution 0.8
 22 |     cluster = cluster_label
 23 |     G = mlgFindNeighbors(embed)
 24 |     mlgc = FindClusters(G, resolution = 0.8, random.seed = 20220525, verbose = FALSE)[[1]] #0.8
 25 |     mlgARI = adjustedRandIndex(mlgc, cluster)
 26 |     return(mlgARI)
 27 | }
 28 | # evaluationLISI = function(embed, summary){ ## run it using conda environment
 29 | #     if ("batch" %in% colnames(summary)) {
 30 | #         return(compute_lisi(embed, summary, 'batch')$batch)
 31 | #     } else {
 32 | #         return(rep(NA, nrow(summary)))
 33 | #     }
 34 | # }
 35 | 
 36 | 
 37 | ## latent can be PCA
 38 | ## latent_type = "PCA" or "UMAP"
 39 | ## evaluation_method = "ARI", "Si"
 40 | ## label_type = "cell_type" or "batch"
 41 | ## cluster_label = c(......) cell types of batches of each cell
 42 | evaluation_norm = function(latent = NULL, latent_type = NULL, evaluation_method = NULL, cluster_label = NULL){
 43 |     if(evaluation_method == "ARI"){
 44 |         return(mlgARI(latent, cluster_label))
 45 |     }
 46 | 
 47 |     if(evaluation_method == "Si"){
 48 |         if(latent_type %in% c("UMAP", "TSNE")){
 49 |             clusterNum = factor(cluster_label, levels = unique(cluster_label), label = 1:length(unique(cluster_label))) %>% as.numeric
 50 |             score = silhouette(clusterNum, dist(as.matrix(latent), method = "manhattan"))[,3] #
 51 |             return(score)
 52 |         }else{
 53 |             if(latent_type == "PCA"){
 54 |                 embed = umap(latent)$layout
 55 |                 clusterNum = factor(cluster_label, levels = unique(cluster_label), label = 1:length(unique(cluster_label))) %>% as.numeric
 56 |                 score = silhouette(clusterNum, dist(as.matrix(embed)))[,3]
 57 |                 return(score)
 58 |             }
 59 |         }
 60 |         
 61 |     }
 62 | 
 63 |     # if(evaluation_method == "LISI"){
 64 |     #     return(evaluationLISI(latent, data.frame(batch = cluster_label)))
 65 |     # }
 66 | }
 67 | 
 68 | 
 69 | args = commandArgs(trailingOnly=TRUE)
 70 | 
 71 | method = args[1]
 72 | evaluation_type = args[2]
 73 | study_name = args[3] #"public13Dataset_CITEseq"
 74 | out_path = "./manuscript/results/public13Dataset_CITEseq/" 
 75 | 
 76 | # adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", study_name, ".rds"))
 77 | if(method %in% c("totalVI_sample_GPU", "totalVI_study_GPU", "sciPENN_sample_GPU", "sciPENN_study_GPU")){
 78 |     adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", method, "_", study_name, ".rds"))
 79 | 
 80 | }else{
 81 |     adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", study_name, ".rds"))
 82 | }
 83 | 
 84 | tmp = readRDS( file = paste0(out_path, "/RDS/adt_", method, "_", study_name, "_pca_umap.rds"))
 85 | ans_pca = tmp[[1]]
 86 | ans_umap = tmp[[2]]
 87 | dim(ans_pca)
 88 | dim(ans_umap)
 89 | 
 90 | seed_list = c(
 91 |     20220524, 20230817, 20200502, 20230801, 20190826, 20220525, 20230818, 20200503, 20230829, 20190827, 
 92 |     20137910, 20349283, 24918503, 38298493, 150, 2940, 10, 302934245, 3428, 9123784
 93 | )
 94 | 
 95 | # subsample_index = sample(1:nrow(ans_umap), 50000, replace = FALSE)
 96 | generate_index = function(seed){
 97 |     set.seed(seed)
 98 |     subsample_index = c()
 99 |     for(study_each in unique(adt_feature$study_name)){
100 |         study_index = which(adt_feature$study_name == study_each)
101 |         if(length(study_index) < 5000){
102 |             subsample_index = c(subsample_index, study_index)
103 |         }else(
104 |             subsample_index = c(subsample_index, sample(study_index, 5000, replace = FALSE))
105 |         )
106 |     }
107 |     print(length(subsample_index))
108 | 
109 |     return(subsample_index)
110 | }
111 | 
112 | 
113 | ## Silhouette score
114 | if(evaluation_type == "Si"){
115 |     print("Silhouette score...")
116 |     si_score_sample = c()
117 |     si_score_study = c()
118 |     si_score_broadCT = c()
119 |     si_score_refineCT = c()
120 | 
121 |     for (seed in seed_list){
122 |         subsample_index = generate_index(seed)
123 |         # latent = ans_pca[subsample_index, 1:min(ncol(ans_pca), 50)] #ans_umap[subsample_index, ]
124 |         latent = ans_umap[subsample_index, ] #ans_umap[subsample_index, ]
125 |     
126 |         cluster_label = adt_feature$sample[subsample_index] %>% as.character
127 |         si_score_sample = evaluation_norm(latent, "UMAP", "Si", cluster_label) %>% rbind(si_score_sample, .)
128 |         
129 |         cluster_label = adt_feature$study_name[subsample_index] %>% as.character
130 |         si_score_study = evaluation_norm(latent, "UMAP", "Si", cluster_label) %>% rbind(si_score_study, .)
131 | 
132 |         cluster_label = adt_feature$cell_type_l2[subsample_index] %>% as.character
133 |         cell_select = which(cluster_label != "undefined")
134 |         cluster_label = cluster_label[cell_select]
135 |         si_score_refineCT = evaluation_norm(latent[cell_select, ], "UMAP", "Si", cluster_label) %>% rbind(si_score_refineCT, .)
136 | 
137 |         cluster_label = adt_feature$cell_type_l1[subsample_index] %>% as.character
138 |         cell_select = which(cluster_label != "undefined")
139 |         cluster_label = cluster_label[cell_select]
140 |         si_score_broadCT = evaluation_norm(latent[cell_select, ], "UMAP", "Si", cluster_label) %>% rbind(si_score_broadCT, .)
141 |         saveRDS(list(si_score_sample, si_score_study, si_score_refineCT, si_score_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_umap_si.rds"))
142 | 
143 |         
144 |     }
145 |     print(rowMeans(si_score_sample))
146 |     print(rowMeans(si_score_study))
147 |     print(rowMeans(si_score_broadCT))
148 |     print(rowMeans(si_score_refineCT))
149 | 
150 |     # saveRDS(list(si_score_sample, si_score_study, si_score_refineCT, si_score_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_umap_si.rds"))
151 | 
152 | }
153 | 
154 | ## ARI
155 | if(evaluation_type == "ARI"){
156 |     print("ARI score")
157 |     
158 |     ari_sample = c()
159 |     ari_study = c()
160 |     ari_refineCT = c()
161 |     ari_broadCT = c()
162 | 
163 |     for(seed in seed_list){
164 |         print(seed)
165 |         subsample_index = generate_index(seed)
166 |         latent = ans_pca[subsample_index, 1:min(ncol(ans_pca), 50)]
167 | 
168 |         cluster_label = adt_feature$sample[subsample_index] %>% as.character
169 |         ari_sample = c(ari_sample, evaluation_norm(latent, "UMAP", "ARI", cluster_label))
170 |         
171 |         cluster_label = adt_feature$study_name[subsample_index] %>% as.character
172 |         ari_study = c(ari_study, evaluation_norm(latent, "UMAP", "ARI", cluster_label))
173 |         
174 |         cluster_label = adt_feature$cell_type_l2[subsample_index] %>% as.character
175 |         ari_refineCT = c(ari_refineCT, evaluation_norm(latent, "UMAP", "ARI", cluster_label))
176 |         
177 |         cluster_label = adt_feature$cell_type_l1[subsample_index] %>% as.character
178 |         ari_broadCT = c(ari_broadCT, evaluation_norm(latent, "UMAP", "ARI", cluster_label))
179 |         saveRDS(list(ari_sample, ari_study, ari_refineCT, ari_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_ari.rds"))
180 |         
181 |     }
182 |     print(ari_sample)
183 |     print(ari_study)
184 |     print(ari_refineCT)
185 |     print(ari_broadCT)
186 | 
187 |     saveRDS(list(ari_sample, ari_study, ari_refineCT, ari_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_ari.rds"))
188 | }
189 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ComparativeAnalysis/publicData_CITEseq_umap.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | args = commandArgs(trailingOnly=TRUE)
 3 | 
 4 | library(dplyr)
 5 | library(umap)
 6 | library(viridis)
 7 | 
 8 | 
 9 | method <- args[1] #"Arcsinh_b5"
10 | run_name = args[2] #"publicData_CITEseq"
11 | out_path = "./manuscript/results/public13Dataset_CITEseq" 
12 | # adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", run_name, ".rds"))
13 | # adt_feature = readRDS(file = paste0("./results/publicData_CITEseq/RDS/adt_feature_", run_name, ".rds"))
14 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds"))
15 | 
16 | ans = readRDS(file = paste0(out_path, "/RDS/adt_data_norm_", method, "_", run_name, ".rds"))
17 | ans[is.na(ans)] = 0
18 | ans_pca = prcomp(ans)$x 
19 | print("Start UMAP")
20 | ans_umap = umap(ans_pca[, 1:min(ncol(ans), 50)])$layout %>% data.frame
21 | saveRDS(list(ans_pca, ans_umap), file = paste0(out_path, "/RDS/adt_", method, "_", run_name, "_pca_umap.rds"))
22 | 
23 | # tmp = readRDS(file = paste0(out_path, "/RDS/adt_", method, "_", run_name, "_pca_umap.rds"))
24 | # ans_umap = tmp[[2]]
25 | 
26 | 
27 | print("Start plotting")
28 | pdf(paste0(out_path, "/Figures/adt_", method, "_", run_name, "_umap.pdf"), width = 15, height = 15)
29 | reindex1 = which(adt_feature$cell_type_l1 == "undefined")
30 | reindex2 = which(adt_feature$cell_type_l1 != "undefined")
31 | reindex = c(reindex1, reindex2)
32 | 
33 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
34 | target_feature = "sample", 
35 | color_design = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$sample %>% unique)),
36 | method_label = method
37 | )) #+ scale_color_brewer(palette = "Dark2")
38 | )
39 | 
40 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
41 | target_feature = "study_name", 
42 | color_design = colorRampPalette(brewer.pal(8, "Dark2"))(length(adt_feature$study_name %>% unique)),
43 | method_label = method
44 | )))
45 | 
46 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
47 | target_feature = "sample_status", 
48 | color_design = colorRampPalette(brewer.pal(8, "Set1"))(length(adt_feature$sample_status %>% unique)),
49 | method_label = method
50 | )))
51 | 
52 | adt_feature$cell_type_l1 = factor(adt_feature$cell_type_l1, levels = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined"))
53 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
54 | target_feature = "cell_type_l1", 
55 | color_design = c("#B00000", "#3F918B", "#896191", "#FF980A", "#226fa7", "#F781BF", "grey"), 
56 | #color_design = c(colorRampPalette(brewer.pal(8, "Set1"))(6), "grey"),
57 | color_break = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined"),
58 | method_label = method
59 | )))
60 | 
61 | adt_feature$cell_type_l2 = factor(adt_feature$cell_type_l2, levels = c("naive B", "memory B", 
62 |     "naive CD4", "memory CD4", "Treg", 
63 |     "naive CD8", "memory CD8", 
64 |     "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte",
65 |     "CD16- NK", "CD16+ NK", 
66 |     "myeloid DC", "plasmacytoid DC", "undefined"))
67 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list(
68 | target_feature = "cell_type_l2",
69 | color_design = c(
70 |     "#B00000", "#FF3380", 
71 |     "#9ef954", "#3F918B", "#03500e",
72 |     "#896191", "#350154",  
73 |     "#FF980A", "#A78300", "#DB6400",
74 |     "#1000a0", "#226fa7",
75 |     "#F781BF", "#7B0054", "grey"),
76 | #color_design = c(colorRampPalette(brewer.pal(8, "Dark2"))(14), "grey"),
77 | color_break = c(
78 |     "naive B", "memory B", 
79 |     "naive CD4", "memory CD4", "Treg", 
80 |     "naive CD8", "memory CD8", 
81 |     "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte",
82 |     "CD16- NK", "CD16+ NK", 
83 |     "myeloid DC", "plasmacytoid DC", "undefined"),
84 | method_label = method
85 | )))
86 | dev.off()
87 |  
88 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ComparativeAnalysis/stain_quality.R:
--------------------------------------------------------------------------------
  1 | library(ggplot2)
  2 | library(dplyr)
  3 | library(RColorBrewer)
  4 | library(httpgd)
  5 | library(ggpubr)
  6 | library(splines)
  7 | 
  8 | run_name = "public13Dataset_CITEseq"
  9 | 
 10 | master_path = "./"
 11 | in_path = "./publicData_CITEseq/data/"
 12 | out_path = paste0(master_path, "manuscript/results/", run_name)
 13 | fcs_path = paste0(out_path, "/FCS/")
 14 | fig_path = paste0(out_path, "/Figures/")
 15 | 
 16 | marker_list = c("CD3", "CD4", "CD8",  "CD14", "CD19", "CD25", "CD45RA", "CD56", "CD127")
 17 | adt_data_full = readRDS(file = paste0(out_path, "/RDS/adt_data_full_RawCount_", run_name, ".rds"))
 18 | adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", run_name, ".rds"))
 19 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds"))
 20 | 
 21 | cell_x_adt = arcsinh_transform(cell_x_adt = adt_data) 
 22 | cell_x_feature = adt_feature
 23 | 
 24 | 
 25 | area_under_curve = function(densityObj, threshold, peak_num){
 26 | 
 27 |     if(peak_num > 1){
 28 |         index <- which(densityObj$x >= threshold)
 29 |         auc <- sum(densityObj$y[index] * diff(c(densityObj$x[index], max(densityObj$x))))
 30 |         return(auc)
 31 | 
 32 |     }else{
 33 |         index <- which(densityObj$x >= threshold)
 34 |         auc <- sum(densityObj$y[index] * diff(c(densityObj$x[index], max(densityObj$x))))
 35 |         return(min(auc, 1-auc))
 36 |     }
 37 | 
 38 | }
 39 | 
 40 | valley_deep_scaler = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, adjust = 3, peak_num = NA){
 41 |     ind = which(cell_x_feature$sample == each_sample)
 42 |     # mean_diff = (peak_info[each_sample, ncol(peak_info)] - peak_info[each_sample, 1])
 43 |     adt_tmp = cell_x_adt[ind, adt_marker_select]
 44 | 
 45 |     density_res = stats::density(
 46 |         adt_tmp,
 47 |         adjust = adjust , na.rm = TRUE
 48 |     )
 49 |     x = density_res$x
 50 |     y = density_res$y
 51 |     if(peak_num >1){
 52 |         x_peakR = which(abs(x - peak_info[each_sample, ncol(peak_info)]) == min(abs(x - peak_info[each_sample, ncol(peak_info)])))
 53 |         y_peakR = y[x_peakR]
 54 |     }else{
 55 |         y_peakR = 0
 56 |     }
 57 |     
 58 |     # x_peakL = which(abs(x - peak_info[each_sample, 1]) == min(abs(x - peak_info[each_sample, 1])))
 59 |     if(peak_num <=2){
 60 |         x_valley = which(abs(x - valley_info[each_sample, 1]) == min(abs(x - valley_info[each_sample, 1])))
 61 |     }else{
 62 |         x_valley = which(abs(x - valley_info[each_sample, ncol(valley_info)]) == min(abs(x - valley_info[each_sample, ncol(valley_info)])))
 63 |     }
 64 |     # y_peakL = y[x_peakL]
 65 |     y_valley = y[x_valley]
 66 |     auc_scaler = area_under_curve(density_res, threshold = valley_info[each_sample, 1], peak_num = peak_num)
 67 |     deep_scaler = (1 + y_peakR - y_valley) * (1 + auc_scaler)
 68 |     return(deep_scaler)
 69 | }
 70 | 
 71 | two_peak_stain_quality = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = 2){
 72 | 
 73 |     ind = which(cell_x_feature$sample == each_sample)
 74 |     mean_diff = (peak_info[each_sample, ncol(peak_info)] - peak_info[each_sample, 1])
 75 |     adt_tmp = cell_x_adt[ind, adt_marker_select]
 76 |     within_peak_sd = sqrt((sum((adt_tmp[which(adt_tmp < valley_info[each_sample, 1])] - peak_info[each_sample, 1])^2) + sum((adt_tmp[which(adt_tmp > valley_info[each_sample, 1])] - peak_info[each_sample, ncol(peak_info)])^2))/length(adt_tmp))
 77 |     
 78 |     deep_scaler = valley_deep_scaler(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num)
 79 |     # return(mean_diff/within_peak_sd)
 80 |     return((mean_diff*deep_scaler)/within_peak_sd)
 81 | }
 82 | 
 83 | multi_peak_stain_quality = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = 3){
 84 | 
 85 |     ind = which(cell_x_feature$sample == each_sample)
 86 |     mean_diff = (peak_info[each_sample, ncol(peak_info)] - peak_info[each_sample, 1])
 87 |     adt_tmp = cell_x_adt[ind, adt_marker_select]
 88 |     within_peak_var = 0
 89 |     for(col_index in 1:ncol(peak_info)){
 90 |         if(col_index == 1){
 91 |             ## first peak
 92 |             adt_tmp_select = which(adt_tmp < valley_info[each_sample, col_index])
 93 |             within_peak_var = within_peak_var + sum((adt_tmp[adt_tmp_select] - peak_info[each_sample, col_index])^2)
 94 |         }else if(col_index == ncol(peak_info)){
 95 |             ## last peak
 96 |             adt_tmp_select = which(adt_tmp > valley_info[each_sample, col_index-1])
 97 |             within_peak_var = within_peak_var + sum((adt_tmp[adt_tmp_select] - peak_info[each_sample, col_index])^2)
 98 |         }else{
 99 |             ## middle peak
100 |             adt_tmp_select = which(adt_tmp < valley_info[each_sample, col_index] & adt_tmp > valley_info[each_sample, col_index-1])
101 |             within_peak_var = within_peak_var + sum((adt_tmp[adt_tmp_select] - peak_info[each_sample, col_index])^2)
102 |         }        
103 |     }
104 |     within_peak_sd = sqrt(within_peak_var/length(adt_tmp))
105 |     
106 |     deep_scaler = valley_deep_scaler(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num)
107 |     # return(mean_diff/within_peak_sd)
108 |     return((mean_diff*deep_scaler)/within_peak_sd)
109 | }
110 | 
111 | one_peak_stain_quality = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = 1){
112 | 
113 |     ind = which(cell_x_feature$sample == each_sample)
114 |     adt_tmp = cell_x_adt[ind, adt_marker_select]
115 | 
116 |     # median_pos = median(adt_tmp[which(adt_tmp > valley_info[each_sample, 1])])
117 |     if(length(peak_info[each_sample,]) == 1){
118 |         mean_diff = abs(valley_info[each_sample, 1] - peak_info[each_sample, 1])
119 |     }else if(is.na(peak_info[each_sample, 1])){
120 |         mean_diff = abs(valley_info[each_sample, 1] - peak_info[each_sample, ncol(peak_info)])
121 |     }else{
122 |         mean_diff = abs(valley_info[each_sample, 1] - peak_info[each_sample, 1])
123 |     }
124 |     within_peak_sd = sqrt(sd(adt_tmp))
125 |     deep_scaler = valley_deep_scaler(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num)
126 |     # return(mean_diff/within_peak_sd)
127 |     return((mean_diff*deep_scaler)/within_peak_sd)
128 |  
129 | 
130 | }
131 | 
132 | ## use valley and peak
133 | peak_num_summary = c()
134 | peak_sep_summary = c()
135 | 
136 | for(adt_marker_select in colnames(cell_x_adt)){ ##
137 |     
138 |     peak_valley_density = readRDS(paste0(out_path, "/RDS/peak_valley_locations_", adt_marker_select, "_ADTnorm_sample_manual_keepZero.rds"))
139 |     
140 |     peak_info = peak_valley_density$peak_landmark_list
141 |     valley_info = peak_valley_density$valley_landmark_list
142 | 
143 |     for(each_sample in unique(adt_feature$sample)){
144 |         batch_info = unique(adt_feature$study_name[which(adt_feature$sample == each_sample)]) %>% as.vector
145 |         each_peak_info = peak_info[each_sample, ]
146 |         peak_num = sum(is.na(each_peak_info) == FALSE)
147 |         peak_num_summary = data.frame(
148 |             peak_num = peak_num, 
149 |             batch = batch_info, 
150 |             sample = each_sample, adt_marker = adt_marker_select) %>% 
151 |             rbind(peak_num_summary, .)
152 |         if(peak_num == 1){
153 |             peak_sep_summary = data.frame(
154 |                 sep_power = one_peak_stain_quality(adt_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num),
155 |                 adt_marker = adt_marker_select, sample = each_sample, batch = batch_info, peak_num = paste0("# of peak: ", peak_num)) %>% rbind(peak_sep_summary, .)
156 |         }
157 |         if(peak_num == 2){
158 |             peak_sep_summary = data.frame(
159 |                 sep_power = two_peak_stain_quality(adt_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num),
160 |                 adt_marker = adt_marker_select, sample = each_sample, batch = batch_info, peak_num = paste0("# of peak: ", peak_num)) %>% rbind(peak_sep_summary, .)
161 |         }
162 |         if(peak_num > 2){
163 |             peak_sep_summary = data.frame(
164 |                 sep_power = multi_peak_stain_quality(adt_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num),
165 |                 adt_marker = adt_marker_select, sample = each_sample, batch = batch_info, peak_num = paste0("# of peak: ", peak_num)) %>% rbind(peak_sep_summary, .)
166 |         }
167 |     }
168 | }
169 | 
170 | file_list <- c(
171 |     "10X_pbmc_10k", "10X_pbmc_1k", "10X_pbmc_5k_v3", "10X_pbmc_5k_nextgem", "10X_malt_10k", 
172 |     "stuart_2019", "granja_2019_pbmc", "granja_2019_bmmc", "hao_2020",
173 |     "kotliarov_2020", "witkowski_2020", "triana_2021", "buus_2021_T"   
174 | )
175 | 
176 | peak_sep_summary$batch = factor(peak_sep_summary$batch, levels = file_list)
177 | peak_sep_summary$sample = factor(peak_sep_summary$sample, levels = unique(cell_x_feature$sample))
178 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_centered_scaled_transform.R:
--------------------------------------------------------------------------------
 1 | ## Arcsinh centered scaled transformation
 2 | require(dplyr)
 3 | require(Seurat)
 4 | 
 5 | arcsinh_centered_scaled_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){
 6 |     ## parameters
 7 |     a = 1
 8 |     b = 1/5
 9 |     c = 0
10 |     if(!is.null(parameter_list)){
11 |         if("a" %in% names(parameter_list)){
12 |             a = parameter_list[["a"]]
13 |         }
14 |         if("b" %in% names(parameter_list)){
15 |             b = parameter_list[["b"]]
16 |         }
17 |         if("c" %in% names(parameter_list)){
18 |             c = parameter_list[["c"]]
19 |         }
20 |     }
21 |     ## transformation
22 |     asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c)
23 | 
24 |     ## output
25 |     out <- ((cell_x_adt %>%
26 |         asinhTrans() %>%
27 |         NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) %>%
28 |         exp()) - 1) * 3
29 |     return(out)
30 | }


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_centered_transform.R:
--------------------------------------------------------------------------------
 1 | ## Arcsinh centered transformation
 2 | require(dplyr)
 3 | require(Seurat)
 4 | 
 5 | arcsinh_centered_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){
 6 |     ## parameters
 7 |     a = 1
 8 |     b = 1/5
 9 |     c = 0
10 |     if(!is.null(parameter_list)){
11 |         if("a" %in% names(parameter_list)){
12 |             a = parameter_list[["a"]]
13 |         }
14 |         if("b" %in% names(parameter_list)){
15 |             b = parameter_list[["b"]]
16 |         }
17 |         if("c" %in% names(parameter_list)){
18 |             c = parameter_list[["c"]]
19 |         }
20 |     }
21 |     ## transformation
22 |     asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c)
23 | 
24 |     ## output
25 |     out <- (cell_x_adt %>%
26 |         asinhTrans() %>%
27 |         NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) %>%
28 |         exp()) - 1
29 |     return(out)
30 | }


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_clr_transform.R:
--------------------------------------------------------------------------------
 1 | ## Arcsinh CLR transformation
 2 | require(dplyr)
 3 | require(Seurat)
 4 | 
 5 | arcsinh_clr_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){
 6 |     ## parameters
 7 |     a = 1
 8 |     b = 1/5
 9 |     c = 0
10 |     if(!is.null(parameter_list)){
11 |         if("a" %in% names(parameter_list)){
12 |             a = parameter_list[["a"]]
13 |         }
14 |         if("b" %in% names(parameter_list)){
15 |             b = parameter_list[["b"]]
16 |         }
17 |         if("c" %in% names(parameter_list)){
18 |             c = parameter_list[["c"]]
19 |         }
20 |     }
21 |     ## transformation
22 |     asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c)
23 | 
24 |     ## output
25 |     out <- cell_x_adt %>%
26 |         asinhTrans %>%
27 |         NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1)
28 |     return(out)
29 | }


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_transform.R:
--------------------------------------------------------------------------------
 1 | ## Arcsinh transformation
 2 | arcsinh_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){
 3 |     ## parameters
 4 |     a = 1
 5 |     b = 1/5
 6 |     c = 0
 7 |     if(!is.null(parameter_list)){
 8 |         if("a" %in% names(parameter_list)){
 9 |             a = parameter_list[["a"]]
10 |         }
11 |         if("b" %in% names(parameter_list)){
12 |             b = parameter_list[["b"]]
13 |         }
14 |         if("c" %in% names(parameter_list)){
15 |             c = parameter_list[["c"]]
16 |         }
17 |     }
18 |     ## transformation
19 |     asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c)
20 | 
21 |     ## output
22 |     out <- asinhTrans(cell_x_adt)
23 |     return(out)
24 | }


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/centered_arcsinh_transform.R:
--------------------------------------------------------------------------------
 1 | ## centered arcsinh transformation
 2 | require(dplyr)
 3 | require(Seurat)
 4 | 
 5 | centered_arcsinh_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){
 6 | 
 7 |     ## parameters
 8 |     a = 1
 9 |     b = 1/5
10 |     c = 0
11 |     if(!is.null(parameter_list)){
12 |         if("a" %in% names(parameter_list)){
13 |             a = parameter_list[["a"]]
14 |         }
15 |         if("b" %in% names(parameter_list)){
16 |             b = parameter_list[["b"]]
17 |         }
18 |         if("c" %in% names(parameter_list)){
19 |             c = parameter_list[["c"]]
20 |         }
21 |     }
22 |     ## transformation
23 |     asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c)
24 | 
25 |     ## output
26 |     out <- ((cell_x_adt %>%
27 |         NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) %>%
28 |         exp()) - 1) %>%
29 |         asinhTrans()
30 |     return(out)
31 | }


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/clr_transformation.R:
--------------------------------------------------------------------------------
 1 | ## CLR transformation
 2 | require(dplyr)
 3 | require(Seurat)
 4 | 
 5 | clr_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){
 6 | 
 7 |     ## output
 8 |     out <- cell_x_adt %>%
 9 |         NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1)
10 |     return(out)
11 | }


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/cytofruv_transform.R:
--------------------------------------------------------------------------------
 1 | ## CytofRUV normalization
 2 | require(writexl)
 3 | require(CATALYST)
 4 | 
 5 | cytofruv_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) {
 6 | 
 7 |     ## read parameters
 8 |     tmp_path <- parameter_list$tmp_path
 9 |     fcs_path <- parameter_list$fcs_path
10 |     condition <- parameter_list$condition
11 |     patient_id <- parameter_list$patient_id
12 |     batch <- parameter_list$batch
13 |     clusters_nb <- parameter_list$clusters_nb
14 |     start_adt_method <- parameter_list$start_adt_method
15 | 
16 |     ## change column name to ensure all end with number
17 |     ori_col_name <- colnames(cell_x_adt)
18 |     colnames(cell_x_adt) <- paste0(colnames(cell_x_adt), "0")
19 | 
20 |     for (each_sample in cell_x_feature$sample %>% levels()) {
21 |         if (!dir.exists(paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV"))) {
22 |             dir.create(paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV"))
23 |         }
24 |         fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/", each_sample, ".fcs")
25 |         if (!file.exists(fcs_file_name)) {
26 |             sample_ind <- which(cell_x_feature$sample == each_sample)
27 |             fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix()
28 |             fcs <- flowFrame(fcs_count)
29 |             write.FCS(fcs, filename = fcs_file_name)
30 |         }
31 |     }
32 |     md <- data.frame(
33 |         file_name = paste0(levels(cell_x_feature$sample), ".fcs"),
34 |         sample_id = levels(cell_x_feature$sample),
35 |         condition = condition,
36 |         patient_id = patient_id,
37 |         batch = batch
38 |     )
39 | 
40 |     panel <- data.frame(
41 |         fcs_colname = colnames(cell_x_adt), 
42 |         antigen = colnames(cell_x_adt), 
43 |         marker_class = "type"
44 |     )
45 |     write_xlsx(x = md, path = paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/", "/Metadata.xlsx"))
46 |     write_xlsx(x = panel, path = paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/", "/Panel.xlsx"))
47 |     
48 |     cytof_data <- load_data(paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/"), metadata_filename = "Metadata.xlsx", panel_filename = "Panel.xlsx")
49 |     cytof_data$daf <- cluster_data(
50 |         cytof_data$daf, 
51 |         seed = 12345, 
52 |         markers_to_use = cytof_data$daf %>% rownames(), 
53 |         clusters_nb = clusters_nb)
54 |     cytof_data$lineage_markers <- cytof_data$daf %>% rownames()
55 | 
56 |     dir_name_norm_data <- "CytofRUV"
57 |     raw_data <- data.frame(
58 |         sample = cytof_data$daf$sample_id,
59 |         cluster = cluster_ids(cytof_data$daf), t(SummarizedExperiment::assay(cytof_data$daf, "exprs"))
60 |     )
61 |     rep_samples <- levels(cell_x_feature$sample) %>% list() # list(file_list)
62 |     cluster_list_rep_samples <- list(seq(1, clusters_nb))
63 |     k_value <- 5
64 |     seed <- 1234
65 | 
66 |     normalise_data(
67 |         data = cytof_data,
68 |         raw_data = raw_data,
69 |         rep_samples = rep_samples,
70 |         norm_clusters = cluster_list_rep_samples,
71 |         k = k_value,
72 |         num_clusters = clusters_nb,
73 |         wd_data = fcs_path,
74 |         dir_norm_data = dir_name_norm_data
75 |     )
76 |     out <- c()
77 |     for (each_sample in levels(cell_x_feature$sample)) {
78 |         tmpNorm <- read.FCS(filename = paste0(fcs_path, dir_name_norm_data, "/Norm_RUVIII_k", k_value, "_", each_sample, ".fcs"))
79 | 
80 |         out <- exprs(tmpNorm) %>%
81 |             round(3) %>%
82 |             data.frame() %>%
83 |             rbind(out, .)
84 |     }
85 |     colnames(out) <- ori_col_name
86 |     return(out)
87 | 
88 | 
89 | }
90 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/decontPro_transform.R:
--------------------------------------------------------------------------------
 1 | ## decontPro transformation
 2 | require(decontX)
 3 | require(Seurat)
 4 | require(dplyr)
 5 | 
 6 | decontPro_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){
 7 | 
 8 |   if(!is.null(parameter_list)){
 9 |     delta_sd = parameter_list$delta_sd,
10 |     background_sd = parameter_list$background_sd
11 |   }else{
12 |     ## give a default value
13 |     delta_sd = 2e-5
14 |     background_sd = 2e-6
15 |   }
16 |   adt_seurat <- CreateSeuratObject(t(cell_x_adt), assay = "ADT")
17 |   adt_seurat <- NormalizeData(adt_seurat, normalization.method = "CLR", margin = 2) %>%
18 |   ScaleData(assay = "ADT") %>%
19 |   RunPCA(assay = "ADT", features = rownames(adt_seurat), npcs = min(ncol(cell_x_adt), 10), reduction.name = "pca_adt") 
20 |   
21 |   adt_seurat = FindNeighbors(adt_seurat, dims = 1:min(ncol(cell_x_adt)-1, 10), assay = "ADT", reduction = "pca_adt")  %>%
22 |   FindClusters(resolution = 0.5)
23 | 
24 |   clusters <- as.integer(Idents(adt_seurat))
25 |   
26 |   rm_cell_index = which(rowSums(cell_x_adt) == 0)
27 |   out <- decontPro(t(cell_x_adt[-rm_cell_index, ]),
28 |                  clusters[-rm_cell_index],
29 |                  delta_sd = delta_sd,
30 |                  background_sd = background_sd)
31 | 
32 |   decontaminated_counts <- out$decontaminated_counts
33 |   norm_counts <- matrix(0, nrow(decontaminated_counts), nrow(cell_x_adt))
34 |   norm_counts[, -rm_cell_index] <- decontaminated_counts
35 | 
36 |   return(t(norm_counts))
37 | }
38 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/fdanorm_transform.R:
--------------------------------------------------------------------------------
 1 | ## fdaNorm transformation
 2 | require(flowStats)
 3 | 
 4 | fdanorm_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) {
 5 | 
 6 |     ## read in parameters
 7 |     parameter_list_name <- names(parameter_list)
 8 |     if ("fcs_path" %in% parameter_list_name) {
 9 |         fcs_path <- parameter_list$fcs_path
10 |     } else {
11 |         return("fcs_path is not provided in the parameter_list!")
12 |     }
13 | 
14 |     if ("start_adt_method" %in% parameter_list_name) {
15 |         start_adt_method <- parameter_list$start_adt_method
16 |     } else {
17 |         return("start_adt_method is not provided in the paramter_list!")
18 |     }
19 | 
20 |     ## write out fcs files
21 |     for (each_sample in cell_x_feature$sample %>% levels()) {
22 |         if (!dir.exists(paste0(fcs_path, "/", start_adt_method))) {
23 |             dir.create(paste0(fcs_path, "/", start_adt_method))
24 |         }
25 |         fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "/", each_sample, ".fcs")
26 |         if (!file.exists(fcs_file_name)) {
27 |             sample_ind <- which(cell_x_feature$sample == each_sample)
28 |             fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix()
29 |             fcs <- flowFrame(fcs_count)
30 |             write.FCS(fcs, filename = fcs_file_name)
31 |         }
32 |     }
33 | 
34 |     ## fdanorm
35 |     file_fcs <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs"))
36 |     adt_fdanorm <- warpSet(file_fcs, colnames(cell_x_adt), monwrd = TRUE) %>% as.flowSet() 
37 | 
38 |     out <- c()
39 |     for (i in 1:length(adt_fdanorm)) {
40 |         out <- adt_fdanorm[[i]] %>%
41 |             exprs() %>%
42 |             rbind(out, .)
43 |     }
44 | 
45 |     return(out)
46 | }
47 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/gaussnorm_transform.R:
--------------------------------------------------------------------------------
 1 | ## gaussNorm transformation
 2 | require(flowStats)
 3 | 
 4 | gaussnorm_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) {
 5 | 
 6 |     ## read in parameters
 7 |     parameter_list_name <- names(parameter_list)
 8 |     if ("fcs_path" %in% parameter_list_name) {
 9 |         fcs_path <- parameter_list$fcs_path
10 |     } else {
11 |         return("fcs_path is not provided in the parameter_list!")
12 |     }
13 | 
14 |     if ("start_adt_method" %in% parameter_list_name) {
15 |         start_adt_method <- parameter_list$start_adt_method
16 |     } else {
17 |         return("start_adt_method is not provided in the paramter_list!")
18 |     }
19 | 
20 |     ## write out fcs files
21 |     for (each_sample in cell_x_feature$sample %>% levels()) {
22 |         if (!dir.exists(paste0(fcs_path, "/", start_adt_method))) {
23 |             dir.create(paste0(fcs_path, "/", start_adt_method))
24 |         }
25 |         fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "/", each_sample, ".fcs")
26 |         if (!file.exists(fcs_file_name)) {
27 |             sample_ind <- which(cell_x_feature$sample == each_sample)
28 |             fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix()
29 |             fcs <- flowFrame(fcs_count)
30 |             write.FCS(fcs, filename = fcs_file_name)
31 |         }
32 |     }
33 | 
34 |     ## gaussnorm
35 |     file_fcs <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs"))
36 |     adt_gaussnorm <- gaussNorm(file_fcs, colnames(cell_x_adt))$flowset
37 | 
38 |     out <- c()
39 |     for (i in 1:length(adt_gaussnorm)) {
40 |         out <- adt_gaussnorm[[i]] %>%
41 |             exprs() %>%
42 |             rbind(out, .)
43 |     }
44 | 
45 |     return(out)
46 | }
47 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/get_peak_mode.R:
--------------------------------------------------------------------------------
  1 | ## get peak location
  2 | require(flowStats)
  3 | require(dplyr)
  4 | 
  5 | get_peak_mode <- function(cell_x_adt = NULL, cell_x_feature = NULL, adt_marker_select = NULL, parameter_list = NULL) {
  6 | 
  7 |     ## get parameters
  8 |     user_define_marker <- NULL ## known marker that tend to have bimodal
  9 |     user_define_peak <- NULL ## known peak that tend to be positive if only one peak detected
 10 |     bwFac <- 1.2
 11 |     border <- 0.01
 12 |     peakNr <- NULL
 13 |     densities <- NULL
 14 |     n <- 201
 15 |     indices <- FALSE
 16 |     lower_peak_thres <- 0.001
 17 | 
 18 |     start_adt_method <- parameter_list$start_adt_method
 19 |     fcs_path <- parameter_list$fcs_path
 20 |     parameter_list_name <- names(parameter_list)
 21 | 
 22 |     if ("user_define_marker" %in% parameter_list_name) {
 23 |         user_define_marker <- parameter_list$user_define_marker ## known marker that tend to have bimodal
 24 |     }
 25 |     if ("user_define_peak" %in% parameter_list_name) {
 26 |         user_define_peak <- parameter_list$user_define_peak ## known peak that tend to be positive if only one peak detected
 27 |     }
 28 |     if ("bwFac_smallest" %in% parameter_list_name) {
 29 |         bwFac_smallest <- parameter_list$bwFac_smallest
 30 |     } else {
 31 |         bwFac_smallest <- 1.5
 32 |     }
 33 | 
 34 |     ## write out fcs files
 35 |     for (each_sample in cell_x_feature$sample %>% levels()) {
 36 |         if (!dir.exists(paste0(fcs_path, "/", start_adt_method))) {
 37 |             dir.create(paste0(fcs_path, "/", start_adt_method))
 38 |         }
 39 |         fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "/", each_sample, ".fcs")
 40 |         if (!file.exists(fcs_file_name)) {
 41 |             sample_ind <- which(cell_x_feature$sample == each_sample)
 42 |             fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix()
 43 |             fcs <- flowFrame(fcs_count)
 44 |             write.FCS(fcs, filename = fcs_file_name)
 45 |         }
 46 |     }
 47 | 
 48 |     dat <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs"))
 49 | 
 50 |     ranges <- fsApply(dat, range)
 51 |     from <- min(sapply(ranges, function(z) z[1, adt_marker_select] - diff(z[, adt_marker_select]) * 0.15), na.rm = TRUE)
 52 |     to <- max(sapply(ranges, function(z) z[2, adt_marker_select] + diff(z[, adt_marker_select]) * 0.15), na.rm = TRUE)
 53 | 
 54 |     peak_num <- 0
 55 |     peak_mode <- list()
 56 |     peak_region <- list()
 57 |     zero_prop_list <- list()
 58 | 
 59 |     for (sample_name in sampleNames(dat)) {
 60 |         try_out <- tryCatch(filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 2)), error = function(e) {
 61 |             c(1, 2)
 62 |         })
 63 |         if (length(try_out) == 1) {
 64 |             fres1 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 2))
 65 |             fres2 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 3))
 66 |             fres3 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 3.1))
 67 | 
 68 |             cell_ind <- which(cell_x_feature$sample == gsub(".fcs", "", sample_name))
 69 |             zero_prop <- sum(cell_x_adt[cell_ind, adt_marker_select] < 2) / length(cell_x_adt[cell_ind, adt_marker_select])
 70 |             zero_prop_list[[gsub(".fcs", "", sample_name)]] <- zero_prop
 71 | 
 72 |             ## different bandwidth w.r.t the zero proportion.
 73 |             if (zero_prop > 0.5) {
 74 |                 fres <- fres3
 75 |             } else if (zero_prop > 0.3) {
 76 |                 fres <- fres2
 77 |             } else {
 78 |                 fres <- fres1
 79 |             }
 80 | 
 81 |             if (adt_marker_select == "CD4") {
 82 |                 fres <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = bwFac_smallest))
 83 |             }
 84 |             peak_info <- flowStats:::curvPeaks(
 85 |                 x = fres[[sample_name]],
 86 |                 dat = exprs(dat[[sample_name]])[, adt_marker_select],
 87 |                 borderQuant = border,
 88 |                 from = from,
 89 |                 to = to
 90 |             )
 91 |             # peak_info$midpoints = peak_info$peaks[, "x"]
 92 | 
 93 |             ## User defined the marker that is known to usually have multiple peaks (n = 2)
 94 |             if (adt_marker_select %in% user_define_marker) {
 95 |                 if (length(peak_info$midpoints) == 2) {
 96 |                     ## 2 peaks, perfect!
 97 |                     res <- peak_info$midpoints
 98 |                     res_region <- peak_info$regions
 99 |                 } else if (length(peak_info$midpoints) > 2) {
100 |                     ## more than 2 peaks, consider filtering out very low density peaks
101 |                     peak_ind <- peak_info$peaks[, "y"] > lower_peak_thres
102 |                     res <- peak_info$midpoints[peak_ind]
103 |                     res_region <- peak_info$regions[peak_ind, ]
104 |                 } else if (zero_prop > 0.3 && length(peak_info$midpoints) < 2) {
105 |                     ## less than 2 peaks and zero proportion is larger than 0.3, use finer bandwidth:fres1 instead of fres2
106 |                     peak_info <- flowStats:::curvPeaks(
107 |                         x = fres1[[sample_name]],
108 |                         dat = exprs(dat[[sample_name]])[, adt_marker_select],
109 |                         borderQuant = 0,
110 |                         from = from,
111 |                         to = to
112 |                     )
113 |                     # peak_info$midpoints = peak_info$peaks[, "x"]
114 |                     if (length(peak_info$midpoints) <= 2) {
115 |                         ## peak number <=2 output results.
116 |                         res <- peak_info$midpoints
117 |                         res_region <- peak_info$regions
118 |                     } else if (length(peak_info$midpoints > 2)) {
119 |                         ## using new bandwidth, too many peaks, consider filtering out very low density peaks
120 |                         res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres]
121 |                         res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ]
122 |                     }
123 |                 } else if (zero_prop <= 0.3 && length(peak_info$midpoints) < 2) {
124 |                     ## less than 2 peaks and small zero proportion, user finer bandwidth: fres0 instead of fres1
125 |                     fres0 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = bwFac_smallest)) ## 1.5
126 |                     peak_info <- flowStats:::curvPeaks(
127 |                         x = fres0[[sample_name]],
128 |                         dat = exprs(dat[[sample_name]])[, adt_marker_select],
129 |                         borderQuant = 0,
130 |                         from = from,
131 |                         to = to
132 |                     )
133 |                     # peak_info$midpoints = peak_info$peaks[, "x"]
134 |                     if (length(peak_info$midpoints) <= 2) {
135 |                         res <- peak_info$midpoints
136 |                         res_region <- peak_info$regions
137 |                     } else if (length(peak_info$midpoints > 2)) {
138 |                         res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres]
139 |                         res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ]
140 |                     }
141 |                 } else {
142 |                     ## no other cases?
143 |                     res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres]
144 |                     res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ]
145 |                 }
146 |             } else {
147 |                 ## not in user defined marker list, can have 1 peaks. Filter very low density peaks
148 |                 res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres]
149 |                 res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ]
150 |             }
151 | 
152 | 
153 |             ## all the multiple peaks are around 0
154 |             if (length(res) > 1 && zero_prop <= 0.3 && (sum(res < 2) == length(res))) {
155 |                 ## use broader bandwidth to merge multiple peaks around 0. Use fres2 instead fres1
156 |                 peak_infoTmp <- flowStats:::curvPeaks(
157 |                     x = fres2[[sample_name]],
158 |                     dat = exprs(dat[[sample_name]])[, adt_marker_select],
159 |                     borderQuant = border,
160 |                     from = from,
161 |                     to = to
162 |                 )
163 |                 # peak_infoTmp$midpoints = peak_infoTmp$peaks[, "x"]
164 | 
165 |                 if (adt_marker_select %in% user_define_marker) {
166 |                     ## if user define this marker to have 2 peaks.
167 |                     if (length(peak_infoTmp$midpoints) == 2) {
168 |                         resTmp <- peak_infoTmp$midpoints
169 |                         res_regionTmp <- peak_infoTmp$regions
170 |                     } else {
171 |                         resTmp <- peak_infoTmp$midpoints[peak_infoTmp$peaks[, "y"] > lower_peak_thres]
172 |                         res_regionTmp <- peak_infoTmp$regions[peak_infoTmp$peaks[, "y"] > lower_peak_thres, ]
173 |                     }
174 |                 } else {
175 |                     resTmp <- peak_infoTmp$midpoints[peak_infoTmp$peaks[, "y"] > lower_peak_thres]
176 |                     res_regionTmp <- peak_infoTmp$regions[peak_infoTmp$peaks[, "y"] > lower_peak_thres, ]
177 |                 }
178 | 
179 |                 indTmp <- which(!is.na(resTmp))
180 |                 resTmp <- resTmp[indTmp]
181 |                 if (is.null(nrow(res_regionTmp))) {
182 |                     res_regionTmp <- res_regionTmp %>%
183 |                         as.matrix() %>%
184 |                         t()
185 |                 }
186 |                 res_regionTmp <- res_regionTmp[indTmp, ]
187 |                 if (length(resTmp) > 1 && (sum(resTmp < 2) < length(resTmp))) {
188 |                     res <- resTmp
189 |                     res_region <- res_regionTmp
190 |                 }
191 |             }
192 | 
193 |             ## remove small negative peak around 0
194 |             if (length(res) > 1 && zero_prop < 0.3 && (sum(res < 2) < length(res))) {
195 |                 if (peak_info$peaks[1, "x"] < 0.9 && peak_info$peaks[1, "y"] < 1 && peak_info$peaks[2, "x"] > 2 && peak_info$peaks[2, "y"] / peak_info$peaks[1, "y"] > 5) {
196 |                     res <- res[-1]
197 |                     res_region <- res_region[-1, ]
198 |                 }
199 |             }
200 | 
201 |             ## all the peaks around 2 and zero proportion very large. Highly likely to have only one peak.
202 |             if (length(res) > 1 && zero_prop > 0.5 && (sum(res < 2) == length(res))) {
203 |                 res <- peak_info$midpoints[which(peak_info$peaks[, "y"] == max(peak_info$peak[, "y"]))]
204 |                 res_region <- peak_info$regions[which(peak_info$peaks[, "y"] == max(peak_info$peak[, "y"])), ]
205 |             }
206 | 
207 | 
208 | 
209 |             peak_num <- max(peak_num, length(res))
210 |             peak_mode[[sample_name]] <- res
211 |             peak_region[[sample_name]] <- matrix(NA, ncol = 2, nrow = length(res))
212 |             peak_region[[sample_name]][1:length(res), ] <- res_region
213 |         } else {
214 |             ## only one value for this marker
215 |             peak_mode[[sample_name]] <- NA
216 |             peak_region[[sample_name]] <- matrix(NA, ncol = 2, nrow = 1)
217 |             print(paste0(i, "-Single Value!"))
218 |         }
219 |     }
220 | 
221 |     landmark <- matrix(NA, ncol = peak_num, nrow = length(peak_mode))
222 |     landmarkRegion <- list()
223 |     for (i in 1:peak_num) {
224 |         landmarkRegion[[i]] <- matrix(NA, ncol = 2, nrow = length(peak_mode))
225 |     }
226 |     for (i in 1:length(peak_mode)) {
227 |         if (!is.na(peak_mode[[i]][1])) {
228 |             peak_modeNum <- length(peak_mode[[i]])
229 |             if (peak_modeNum == 1) {
230 |                 if (paste0(names(peak_mode)[i], ">", adt_marker_select) %in% user_define_peak) {
231 |                     landmark[i, min(2, peak_num)] <- peak_mode[[i]]
232 |                     landmarkRegion[[min(2, peak_num)]][i, ] <- peak_region[[i]]
233 |                 } else {
234 |                     landmark[i, 1] <- peak_mode[[i]]
235 |                     landmarkRegion[[1]][i, ] <- peak_region[[i]]
236 |                 }
237 |             } else if (peak_modeNum == 2) {
238 |                 landmark[i, c(1, max(2, peak_num))] <- peak_mode[[i]]
239 | 
240 |                 landmarkRegion[[1]][i, ] <- peak_region[[i]][1, ]
241 |                 landmarkRegion[[max(2, peak_num)]][i, ] <- peak_region[[i]][2, ]
242 |             } else if (peak_modeNum == 3) {
243 |                 landmark[i, c(1, 2, max(3, peak_num))] <- peak_mode[[i]]
244 |                 landmarkRegion[[1]][i, ] <- peak_region[[i]][1, ]
245 |                 landmarkRegion[[2]][i, ] <- peak_region[[i]][2, ]
246 |                 landmarkRegion[[max(3, peak_num)]][i, ] <- peak_region[[i]][3, ]
247 |             } else {
248 |                 landmark[i, 1:peak_modeNum] <- peak_mode[[i]]
249 |                 for (k in 1:peak_modeNum) {
250 |                     landmarkRegion[[k]][i, ] <- peak_region[[i]][k, ]
251 |                 }
252 |             }
253 |         }
254 |     }
255 | 
256 |     ## if all the peaks are within 1 - highly likely that there is only one negative peak
257 |     if (max(landmark[!is.na(landmark)]) < 2) {
258 |         landmark_new <- matrix(NA, ncol = 1, nrow = nrow(landmark))
259 |         landmarkAllMedian <- median(landmark[!is.na(landmark)])
260 |         for (i in 1:nrow(landmark)) {
261 |             landmark_nonNA <- landmark[i, !is.na(landmark[i, ])]
262 |             if (length(landmark_nonNA) > 0) {
263 |                 landmark_new[i, ] <- landmark[i, which.min(abs(landmark_nonNA - landmarkAllMedian))]
264 |             } else {
265 |                 landmark_new[i, ] <- NA
266 |             }
267 |         }
268 |         landmark <- landmark_new
269 |     }
270 | 
271 |     rownames(landmark) <- levels(cell_x_feature$sample)
272 |     return(list(peak_landmark_list = landmark, zero_prop_list = zero_prop_list))
273 | }
274 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/get_valley.R:
--------------------------------------------------------------------------------
 1 | ## get valley location
 2 | require(dplyr)
 3 | 
 4 | get_valley_location <- function(cell_x_adt = NULL, cell_x_feature = NULL, adt_marker_select = NULL, parameter_list = NULL, ....) {
 5 |     adjust <- parameter_list$adjust
 6 |     peak_landmark_list <- parameter_list$peak_mode_res$peak_landmark_list
 7 |     zero_prop_list <- parameter_list$peak_mode_res$zero_prop_list
 8 |     user_define_peak <- parameter_list$user_define_peak
 9 | 
10 |     parameter_list_name <- names(parameter_list)
11 | 
12 |     if ("min_fc" %in% parameter_list_name) {
13 |         min_fc <- parameter_list$min_fc
14 |     }
15 |     valley_location_list <- matrix(NA, nrow = nrow(peak_landmark_list), ncol = max(1, ncol(peak_landmark_list) - 1))
16 |     rownames(valley_location_list) <- cell_x_feature$sample %>% levels()
17 |     for (sample_name in cell_x_feature$sample %>% levels()) {
18 |         peak_landmark <- peak_landmark_list[sample_name, ]
19 |         zero_prop <- zero_prop_list[[sample_name]]
20 | 
21 |         ## check if user define single peak to be positive peak
22 |         if (paste0(sample_name, ">", adt_marker_select) %in% user_define_peak) {
23 |             lower_valley <- TRUE
24 |         } else {
25 |             lower_valley <- FALSE
26 |         }
27 | 
28 |         density_res <- density(
29 |             cell_x_adt[which(cell_x_feature$sample == sample_name), adt_marker_select],
30 |             adjust = adjust
31 |         )
32 |         x <- density_res$x
33 |         y <- density_res$y
34 | 
35 |         sign_diff <- sign(diff(y))
36 |         diff_sign_diff <- diff(sign_diff)
37 |         # peak <- which(diff_sign_diff == -2) + 1
38 |         valley <- which(diff_sign_diff == 2) + 1
39 | 
40 |         # yp <- y[peak]
41 |         x_valley <- x[valley]
42 |         y_valley <- y[valley]
43 |         #   stdy <- std(yp)
44 |         real_peak <- peak_landmark[!is.na(peak_landmark)] # peak
45 |         np <- length(real_peak)
46 | 
47 | 
48 |         if (np > 1) {
49 |             real_valley <- c()
50 |             for (i in 1:(np - 1)) {
51 |                 tmp_valley <- x_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])]
52 |                 real_valley <- c(real_valley, tmp_valley[which.min(y_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])])])
53 |             }
54 |         } else {
55 |             if (lower_valley == FALSE) {
56 |                 real_valley <- x[which(y < max(y) / min_fc)[which(y < max(y) / min_fc) > max(which(y == max(y)), which(x > real_peak[1]) %>% min())] %>% min()] # x_valley[x_valley > real_peak[1]][1]
57 |                 if (zero_prop > 0.8) {
58 |                     real_valley <- max(2, real_valley)
59 |                 }
60 |             } else {
61 |                 real_valley <- x[which(y < max(y) / min_fc)[which(y < max(y) / min_fc) < min(which(y == max(y)), which(x < real_peak[1]) %>% max())] %>% max()]
62 |             }
63 |         }
64 |         ## check if no valley is detected due to shoulder peak
65 |         if (length(real_valley) == 0) {
66 |             real_valley <- (real_peak[1] + real_peak[2]) / 2
67 |         }
68 | 
69 |         valley_location_list[sample_name, 1:length(real_valley)] <- real_valley
70 |     }
71 | 
72 | 
73 |     ## rownames(valley_location_list) <- cell_x_feature$sample %>% levels()
74 |     return(valley_location_list)
75 | }
76 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/harmony_transform.R:
--------------------------------------------------------------------------------
 1 | ## Harmony remove batch effect
 2 | require(harmony)
 3 | require(dplyr)
 4 | 
 5 | harmony_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) {
 6 |     out <- HarmonyMatrix(
 7 |         data_mat = cell_x_adt,
 8 |         meta_data = cell_x_feature$batch,
 9 |         do_pca = FALSE
10 |     )
11 |     return(out)
12 | }
13 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/landmark_fill_na.R:
--------------------------------------------------------------------------------
 1 | ## merge peak and valley and fill in NA
 2 | 
 3 | landmark_fill_na <-  function(peak_landmark_list = NULL, valley_landmark_list = NULL, landmark_align_type = NULL){
 4 |     if(!landmark_align_type %in% c("negPeak", "negPeak_valley", "negPeak_valley_posPeak", "valley")){
 5 |       return("Please provide one of the landmark_align_type from: negPeak, negPeak_valley, negPeak_valley_posPeak, valley")
 6 |     }
 7 |     if(landmark_align_type == "valley"){
 8 |       ## only use the first valley to align
 9 |       landmark_matrix <- valley_landmark_list[, 1] %>% t %>% t
10 |       landmark_matrix[is.na(landmark_matrix), 1] <- 2 ## fill in na by background level 2 after arcinsh_b5_a1 transformation
11 |     }else{
12 |       ## involve negative peaks in landmark alignment
13 |       if(ncol(peak_landmark_list) == 1){
14 |         ## only have negative peaks
15 |         landmark_matrix <- cbind(
16 |           peak_landmark_list[, 1],
17 |           valley_landmark_list[, 1]
18 |         )
19 |         ## fill in na
20 |         landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- median(landmark_matrix[!is.na(landmark_matrix[, 1]), 1])
21 |         landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- 2
22 |       }else{
23 |         ## have positive peaks
24 |         landmark_matrix <- cbind(
25 |           peak_landmark_list[, 1],
26 |           valley_landmark_list[, 1],
27 |           peak_landmark_list[, ncol(peak_landmark_list)]
28 |         )
29 | 
30 |         ## fill in na
31 |         ## fill in valley first
32 |         landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- 2
33 |         ## due to user_define_peak where unique peak is deemed positive.
34 |         ## fill in either 0.5 or half of the first valley
35 |         landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- landmark_matrix[is.na(landmark_matrix[, 1]), 2]/2 #0.5 
36 |         ## fill in the last positive peak: add on the valley using the median distance from the last positive peak to the first valley
37 |         landmark_matrix[is.na(landmark_matrix[, 3]), 3] <- landmark_matrix[is.na(landmark_matrix[, 3]), 2] + median(landmark_matrix[!is.na(landmark_matrix[, 3]), 3] - landmark_matrix[!is.na(landmark_matrix[, 3]), 2])
38 | 
39 |       }
40 |     }
41 | 
42 |     ## only provide negative peak location
43 |     if (landmark_align_type == "negPeak") {
44 |       return(landmark_matrix[, 1] %>% t %>% t)
45 |     }
46 | 
47 |     ## only provide negative peak and valley location
48 |     if (landmark_align_type == "negPeak_valley") {
49 |       return(landmark_matrix[, 1:2])
50 |     }
51 | 
52 |     ## provide negative peak, first valley and last postiive peak location
53 |     return(landmark_matrix)
54 | }
55 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/sciPENN_PublicDataset.py:
--------------------------------------------------------------------------------
  1 | ## This the following python
  2 | ## /app/software/Python/3.8.2-GCCcore-9.3.0/bin/python
  3 | ##
  4 | import numpy as np
  5 | from matplotlib import pyplot
  6 | import os
  7 | from copy import deepcopy
  8 | 
  9 | from time import time
 10 | 
 11 | from math import ceil
 12 | from scipy.stats import spearmanr, gamma, poisson
 13 | 
 14 | from anndata import AnnData, read_h5ad
 15 | import scanpy as sc
 16 | from scanpy import read
 17 | import pandas as pd
 18 | 
 19 | from torch.utils.data import DataLoader, TensorDataset
 20 | from torch import tensor
 21 | from torch.cuda import is_available
 22 | 
 23 | from sciPENN.sciPENN_API import sciPENN_API
 24 | import anndata
 25 | import json
 26 | import psutil
 27 | 
 28 | ## original tutorial
 29 | # """Read in the data"""
 30 | # data_path = './manuscript/scripts/sciPENN_tutorial/data/'
 31 | 
 32 | # adata_gene = sc.read(data_path + "pbmc_gene.h5ad")
 33 | # adata_protein = sc.read(data_path + "pbmc_protein.h5ad")
 34 | 
 35 | # doublet_bool = (adata_gene.obs['celltype.l2'] != 'Doublet')
 36 | 
 37 | # adata_gene = adata_gene[doublet_bool].copy()
 38 | # adata_protein = adata_protein[doublet_bool].copy()
 39 | 
 40 | 
 41 | # """Create training and test"""
 42 | 
 43 | # train_bool = [x in ['P1', 'P3', 'P4', 'P7'] for x in adata_gene.obs['donor']]
 44 | 
 45 | # adata_gene_set1 = adata_gene[train_bool].copy()
 46 | # adata_protein_set1 = adata_protein[train_bool].copy()
 47 | # adata_gene_set2 = adata_gene[np.invert(train_bool)].copy()
 48 | # adata_protein_set2 = adata_protein[np.invert(train_bool)].copy()
 49 | 
 50 | # common_proteins = adata_protein_set1.var.index
 51 | # set1only_proteins = np.random.choice(common_proteins, len(common_proteins)//3, False)
 52 | # common_proteins = np.setdiff1d(common_proteins, set1only_proteins)
 53 | # set2only_proteins = np.random.choice(common_proteins, len(common_proteins)//2, False)
 54 | 
 55 | # set1only_proteins = set(set1only_proteins)
 56 | # set2only_proteins = set(set2only_proteins)
 57 | 
 58 | # keep_set1 = [x not in set2only_proteins for x in adata_protein_set1.var.index]
 59 | # keep_set2 = [x not in set1only_proteins for x in adata_protein_set1.var.index]
 60 | 
 61 | # adata_protein_set1 = adata_protein_set1[:, keep_set1].copy()
 62 | # adata_protein_set2 = adata_protein_set2[:, keep_set2].copy()
 63 | 
 64 | 
 65 | # sciPENN = sciPENN_API(gene_trainsets = [adata_gene_set1, adata_gene_set2], 
 66 | #                       protein_trainsets = [adata_protein_set1, adata_protein_set2], 
 67 | #                       train_batchkeys = ['donor', 'donor'])
 68 | 
 69 | 
 70 | # sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, 
 71 | #              decay_step = 0.1, lr = 10**(-3), weights_dir = "pbmc_to_pbmcINTEGRATE", load = True)
 72 | 
 73 | 
 74 | # imputed_test = sciPENN.impute()
 75 | 
 76 | 
 77 | # imputed_test.X
 78 | 
 79 | # imputed_test.obs['Dataset']
 80 | # imputed_test.obs['batch']
 81 | 
 82 | 
 83 | 
 84 | # proteins = imputed_test.var.index
 85 | 
 86 | # proteins1 = proteins[imputed_test.var['Dataset 1']] #get proteins sequenced in Dataset 1
 87 | # proteins2 = proteins[imputed_test.var['Dataset 2']] #get proteins sequenced in Dataset 1
 88 | 
 89 | 
 90 | # ds1_cells = imputed_test.obs['Dataset'] == 'Dataset 1'
 91 | # ds2_cells = imputed_test.obs['Dataset'] == 'Dataset 2'
 92 | 
 93 | # ds1_pred, ds1_seq = np.invert(imputed_test.var['Dataset 1']), imputed_test.var['Dataset 1']
 94 | # ds2_pred, ds2_seq = np.invert(imputed_test.var['Dataset 2']), imputed_test.var['Dataset 2']
 95 | 
 96 | # pred1 = imputed_test[ds1_cells, ds1_pred] #imputed protein array in dataset 1
 97 | # sequenced1 = imputed_test[ds1_cells, ds1_seq] #sequenced protein array in dataset 1
 98 | # pred2 = imputed_test[ds2_cells, ds2_pred] #imputed protein array in dataset 2
 99 | # sequenced2 = imputed_test[ds2_cells, ds2_seq] #sequenced protein array in dataset 2
100 | 
101 | # embedding = sciPENN.embed()
102 | # embedding.obs['batch']
103 | 
104 | # q10_pred = imputed_test[ds1_cells, ds1_pred].layers['q10'] #get q10 for imputed proteins from reference 1
105 | # q10_truth = imputed_test[ds1_cells, ds1_seq].layers['q10'] #get q10 for sequenced proteins from reference 1, not useful
106 | 
107 | 
108 | 
109 | # ## rewrite according to my scenario
110 | # """Read in the data"""
111 | # data_path = './manuscript/scripts/sciPENN_tutorial/data/'
112 | # adata_gene = sc.read(data_path + "pbmc_gene.h5ad")
113 | # adata_protein = sc.read(data_path + "pbmc_protein.h5ad")
114 | # doublet_bool = (adata_gene.obs['celltype.l2'] != 'Doublet')
115 | # adata_gene = adata_gene[doublet_bool].copy()
116 | # adata_protein = adata_protein[doublet_bool].copy()
117 | 
118 | # sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 
119 | #                       protein_trainsets = [adata_protein], 
120 | #                       train_batchkeys = ['donor'])
121 | 
122 | 
123 | # sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, 
124 | #              decay_step = 0.1, lr = 10**(-3), weights_dir = "pbmc_to_pbmcINTEGRATE", load = True)
125 | 
126 | # sciPENN.train(n_epochs = 10000, ES_max = 12, decay_max = 6, 
127 | #              decay_step = 0.1, lr = 10**(-3), weights_dir = "pbmc_to_pbmcINTEGRATE")
128 | 
129 | # embedding = sciPENN.embed()
130 | 
131 | # embedding.obs['batch']
132 | 
133 | # imputed_test = sciPENN.impute()
134 | 
135 | ## define memory usage
136 | def log_memory_usage(file):
137 |     memory = psutil.virtual_memory()
138 |     file.write(f'Total memory: {memory.total / (1024.0 ** 3)} GB\n')
139 |     file.write(f'Used memory: {memory.used / (1024.0 ** 3)} GB\n')
140 |     file.write(f'Available memory: {memory.available / (1024.0 ** 3)} GB\n')
141 | 
142 | 
143 | ## Public Data 13datasets
144 | out_path = './manuscript/results/public13Dataset_CITEseq/CSV/'
145 | 
146 | 
147 | run_name = 'public13Dataset_CITEseq'
148 | adts = pd.read_csv('./results/publicData_CITEseq/CSV/adt_data_common_RawCount_'+ run_name + '.csv', index_col=0)
149 | gex = pd.read_csv('./results/publicData_CITEseq/CSV/rna_data_common_RawCount_'+ run_name + '.csv', index_col=0)
150 | obs = pd.read_csv('./results/publicData_CITEseq/CSV/adt_feature_common_RawCount_'+ run_name + '.csv', index_col=0)
151 | # obs.reset_index(drop=True, inplace=True)
152 | # gex.reset_index(drop=True, inplace=True)
153 | # adts.reset_index(drop=True, inplace=True)
154 | 
155 | adts.index = obs.index
156 | gex.index = obs.index
157 | 
158 | adata_gene = anndata.AnnData(gex, obs)
159 | adata_gene.index = adata_gene.obs_names
160 | adata_protein = anndata.AnnData(adts, obs)
161 | adata_protein.index = adata_protein.obs_names
162 | 
163 | batch_key = 'study_name'
164 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f:
165 |     log_memory_usage(f)
166 |     start = time()
167 |     
168 |     sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 
169 |                         protein_trainsets = [adata_protein], 
170 |                         train_batchkeys = [batch_key], use_gpu = True, min_cells = False, min_genes = False)
171 |     log_memory_usage(f)
172 |     sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_study_name_GPU", load = True)
173 |     embedding = sciPENN.embed()
174 |     imputed_test = sciPENN.impute()
175 |     log_memory_usage(f)
176 |     end = time()
177 |     print(f'Seconds to complete: {round(end - start,0)}')
178 |     train_time = round(end - start, 0)
179 | 
180 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f:
181 |     json.dump(train_time, f)    
182 | 
183 | imputed_res = pd.DataFrame(imputed_test.X, index=imputed_test.obs_names, columns=imputed_test.var_names)
184 | imputed_res.to_csv(out_path + 'sciPENN_Imputed_Protein_' + run_name + '_' + batch_key + '.csv')
185 | embedding_res = pd.DataFrame(embedding.X, index=embedding.obs_names, columns=embedding.var_names)
186 | embedding_res.to_csv(out_path + 'sciPENN_Embedding_' + run_name + '_' + batch_key + '.csv')
187 | embedding_obs_res = pd.DataFrame(embedding.obs)
188 | embedding_obs_res.to_csv(out_path + 'sciPENN_Embedding_DataFeature_' + run_name + '_' + batch_key + '.csv')
189 | 
190 | batch_key = 'sample'
191 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f:
192 |     log_memory_usage(f)
193 |     start = time()
194 |     sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 
195 |                         protein_trainsets = [adata_protein], 
196 |                         train_batchkeys = [batch_key], use_gpu = False, min_cells = False, min_genes = False)
197 |     log_memory_usage(f)
198 |     sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_sample_CPU", load = True)
199 |     embedding = sciPENN.embed()
200 |     imputed_test = sciPENN.impute()
201 |     log_memory_usage(f)
202 |     end = time()
203 |     print(f'Seconds to complete: {round(end - start,0)}')
204 |     train_time = round(end - start, 0)
205 | 
206 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f:
207 |     json.dump(train_time, f)  
208 | 
209 | 
210 | batch_key = 'sample'
211 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f:
212 |     log_memory_usage(f)
213 |     start = time()
214 |     sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 
215 |                         protein_trainsets = [adata_protein], 
216 |                         train_batchkeys = [batch_key], use_gpu = True, min_cells = False, min_genes = False)
217 |     log_memory_usage(f)
218 |     sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_sample_GPU", load = True)
219 |     embedding = sciPENN.embed()
220 |     imputed_test = sciPENN.impute()
221 |     log_memory_usage(f)
222 |     end = time()
223 |     print(f'Seconds to complete: {round(end - start,0)}')
224 |     train_time = round(end - start, 0)
225 | 
226 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f:
227 |     json.dump(train_time, f)    
228 | 
229 | imputed_res = pd.DataFrame(imputed_test.X, index=imputed_test.obs_names, columns=imputed_test.var_names)
230 | imputed_res.to_csv(out_path + 'sciPENN_Imputed_Protein_' + run_name + '_' + batch_key + '.csv')
231 | embedding_res = pd.DataFrame(embedding.X, index=embedding.obs_names, columns=embedding.var_names)
232 | embedding_res.to_csv(out_path + 'sciPENN_Embedding_' + run_name + '_' + batch_key + '.csv')
233 | embedding_obs_res = pd.DataFrame(embedding.obs)
234 | embedding_obs_res.to_csv(out_path + 'sciPENN_Embedding_DataFeature_' + run_name + '_' + batch_key + '.csv')
235 | 
236 | 
237 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f:
238 |     log_memory_usage(f)
239 |     start = time.time()
240 |     
241 |     sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 
242 |                         protein_trainsets = [adata_protein], 
243 |                         train_batchkeys = [batch_key], use_gpu = False, min_cells = False, min_genes = False)
244 |     sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_sample_CPU", load = True)
245 |     embedding = sciPENN.embed()
246 |     imputed_test = sciPENN.impute()
247 |     end = time.time()
248 |     print(f'Seconds to complete: {round(end - start,0)}')
249 |     train_time = round(end - start, 0)
250 | 
251 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f:
252 |     json.dump(train_time, f)  
253 | 
254 | 
255 | 
256 | ## T cell dataset
257 | run_name = 'public13Dataset_CITEseq_Tcelllargeprop' #'public13Dataset_CITEseq_Tcelllargeprop_cd8T'
258 | out_path = './manuscript/results/' + run_name + '/CSV/'
259 | 
260 | adts = pd.read_parquet('./manuscript/results/' + run_name + '/CSV/adt_data_common_RawCount_'+ run_name + '.parquet')
261 | gex = pd.read_parquet('./manuscript/results/' + run_name + '/CSV/rna_data_common_RawCount_'+ run_name + '.parquet')
262 | obs = pd.read_parquet('./manuscript/results/' + run_name + '/CSV/adt_feature_common_RawCount_'+ run_name + '.parquet')
263 | 
264 | adts.index = obs.index
265 | gex.index = obs.index
266 | 
267 | adata_gene = anndata.AnnData(gex, obs)
268 | adata_gene.index = adata_gene.obs_names
269 | adata_protein = anndata.AnnData(adts, obs)
270 | adata_protein.index = adata_protein.obs_names
271 | 
272 | batch_key = 'sample' # 'sample' #'study_name' #
273 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f:
274 |     log_memory_usage(f)
275 |     start = time()
276 |     sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 
277 |                         protein_trainsets = [adata_protein], 
278 |                         train_batchkeys = [batch_key], use_gpu = True, min_cells = False, min_genes = False)
279 |     log_memory_usage(f)
280 |     sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public12Dataset_CITEseq_notriana_study_name", load = True)
281 |     log_memory_usage(f)
282 |     embedding = sciPENN.embed()
283 |     imputed_test = sciPENN.impute()
284 |     log_memory_usage(f)
285 |     end = time()
286 |     print(f'Seconds to complete: {round(end - start,0)}')
287 |     train_time = round(end - start, 0)
288 | 
289 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f:
290 |     json.dump(train_time, f)   
291 | 
292 | 
293 | imputed_res = pd.DataFrame(imputed_test.X, index=imputed_test.obs_names, columns=imputed_test.var_names)
294 | imputed_res.to_csv(out_path + 'sciPENN_Imputed_Protein_' + run_name + '_' + batch_key + '.csv')
295 | 
296 | embedding_res = pd.DataFrame(embedding.X, index=embedding.obs_names, columns=embedding.var_names)
297 | embedding_res.to_csv(out_path + 'sciPENN_Embedding_' + run_name + '_' + batch_key + '.csv')
298 | 
299 | embedding_obs_res = pd.DataFrame(embedding.obs)
300 | embedding_obs_res.to_csv(out_path + 'sciPENN_Embedding_DataFeature_' + run_name + '_' + batch_key + '.csv')
301 | 


--------------------------------------------------------------------------------
/manuscript/BenchmarkingAnalysis/ImplementMethods/warpset_transform.R:
--------------------------------------------------------------------------------
  1 | ## warpset peak valley alignment
  2 | require(dplyr)
  3 | require(flowStats)
  4 | warpset_transform <- function(landmark_matrix = NULL, cell_x_feature = NULL, adt_marker_select = NULL, parameter_list = NULL) {
  5 |   ## get parameters
  6 |   grouping <- NULL
  7 |   monwrd <- TRUE
  8 |   subsample <- NULL
  9 |   peakNr <- NULL
 10 |   clipRange <- 0.01
 11 |   nbreaks <- 11
 12 |   bwFac <- 2
 13 |   warpFuns <- FALSE
 14 |   chunksinze <- 10
 15 |   newNcFile <- NULL
 16 |   z <- NULL
 17 |   nb <- 1001
 18 |   eps <- .Machine$double.eps
 19 |   target <- NULL
 20 | 
 21 | 
 22 |   start_adt_method <- parameter_list$start_adt_method
 23 |   fcs_path <- parameter_list$fcs_path
 24 |   
 25 |   ## read in original adt expression matrix
 26 |   exp_data <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs")) %>% as.flowSet()
 27 | 
 28 |   ## check if target marker is available
 29 |   adt_marker_flag <- adt_marker_select %in% colnames(exp_data)
 30 |   if (!all(adt_marker_flag)) {
 31 |     stop(
 32 |       "Invalid stain(s) not matching the flowSet:\n    ",
 33 |       paste(adt_marker_select[!adt_marker_flag], collapse = ", ")
 34 |     )
 35 |   }
 36 | 
 37 |   ## expression range and sample name list
 38 |   ranges <- fsApply(exp_data, range)
 39 |   samples <- sampleNames(exp_data)
 40 | 
 41 |   ## set up fda parameters
 42 |   extend <- 0.15
 43 |   from <- min(sapply(ranges, function(z) z[1, adt_marker_select] - diff(z[, adt_marker_select]) * extend), na.rm = TRUE)
 44 |   to <- max(sapply(ranges, function(z) z[2, adt_marker_select] + diff(z[, adt_marker_select]) * extend), na.rm = TRUE)
 45 |   wbasis <- create.bspline.basis(
 46 |     rangeval = c(from, to),
 47 |     norder = 4, breaks = seq(from, to, len = nbreaks)
 48 |   )
 49 |   Wfd0 <- fd(matrix(0, wbasis$nbasis, 1), wbasis)
 50 |   WfdPar <- fdPar(Wfd0, 1, 1e-4)
 51 | 
 52 |   density_y <- t(fsApply(exp_data, function(exp_data_each) { # t(fsApply(thisX, function(z){
 53 |     exp_data_each_range <- range(exp_data_each)[, adt_marker_select]
 54 |     exp_data_each <- exprs(exp_data_each)
 55 |     exp_data_each <- exp_data_each[exp_data_each[, adt_marker_select] > exp_data_each_range[1] + eps & exp_data_each[, adt_marker_select] < exp_data_each_range[2] - eps, adt_marker_select]
 56 |     density(exp_data_each, from = from, to = to, n = nb, na.rm = TRUE)$y
 57 |   }))
 58 |   arg_vals <- seq(from, to, len = nb)
 59 |   fdobj <- Data2fd(arg_vals, density_y, wbasis)
 60 | 
 61 |   if (ncol(landmark_matrix) == 1) { ## only one peak: offset
 62 |     if (is.null(target)) {
 63 |       offsets <- landmark_matrix - median(landmark_matrix)
 64 |       names(offsets) <- sampleNames(exp_data)
 65 |     } else {
 66 |       offsets <- landmark_matrix - landmark_matrix[sampleNames(exp_data) %in% target]
 67 |       names(offsets) <- sampleNames(exp_data)
 68 |     }
 69 |     funs <- funsBack <- vector("list", length(landmark_matrix))
 70 |     names(funs) <- samples
 71 |     names(funsBack) <- samples
 72 |     for (j in seq_along(funs)) {
 73 |       funs[[samples[[j]]]] <- function(x) x - z
 74 |       e1 <- new.env(hash = TRUE)
 75 |       e1$z <- offsets[samples[[j]]]
 76 |       environment(funs[[samples[[j]]]]) <- e1
 77 |       funsBack[[samples[[j]]]] <- function(x) x + z
 78 |       e2 <- new.env(hash = TRUE)
 79 |       e2$z <- offsets[samples[[j]]]
 80 |       environment(funsBack[[samples[[j]]]]) <- e2
 81 |     }
 82 |   } else { ## multiple peaks: warping
 83 |     if (is.null(target)) {
 84 |       capture.output(regDens <- landmarkreg(fdobj, landmark_matrix, WfdPar = WfdPar, monwrd = monwrd))
 85 |     } else {
 86 |       capture.output(regDens <- landmarkreg(fdobj, landmark_matrix, x0marks = apply(landmark_matrix, 2, jitter)[rownames(landmark_matrix) %in% target, ], WfdPar = WfdPar, monwrd = monwrd))
 87 |     }
 88 |     warpfdobj <- regDens$warpfd
 89 |     warpedX <- eval.fd(warpfdobj, arg_vals)
 90 |     warpedX[1, ] <- head(arg_vals, 1)
 91 |     warpedX[nrow(warpedX), ] <- tail(arg_vals, 1)
 92 |     ## compute warping functions
 93 |     ## funs <-  apply(warpedX, 2, function(y) approxfun(arg_vals, y))
 94 |     funs <- apply(warpedX, 2, approxfun, arg_vals)
 95 |     funsBack <- apply(warpedX, 2, function(a, b) approxfun(b, a), arg_vals)
 96 |   }
 97 | 
 98 | 
 99 |   names(funs) <- names(funsBack) <- samples # sampleNames(thisX)
100 | 
101 |   warped_landmark_matrix <- landmark_matrix
102 |   leftBoard <- rightBoard <- vector("list", length(funs))
103 |   # chunkleftBoard<-chunkrightBoard<-rep(list(length(funs)),max(1:length(funs)%/%chunksize)+1)
104 |   newRange <- c(Inf, -Inf)
105 | 
106 |   ## transform the raw data using the warping functions
107 |   for (i in seq_along(funs)) {
108 |     thisDat <- exprs(exp_data[[i]][, adt_marker_select])
109 |     lb <- thisDat < ranges[[i]][1, adt_marker_select] + eps
110 |     lb[is.na(lb)] <- TRUE
111 |     leftBoard[[i]] <- lb
112 |     rb <- thisDat > ranges[[i]][2, adt_marker_select] - eps
113 |     rb[is.na(rb)] <- TRUE
114 |     rightBoard[[i]] <- rb
115 |     # Include ALL data, none of this thresholding crap at borders.
116 |     sel <- leftBoard[[i]] | rightBoard[[i]]
117 |     # sel<-rep(FALSE,length(thisDat))
118 |     # browser();
119 |     newDat <- as.matrix(funs[[i]](thisDat[!sel, ]))
120 |     newDat[is.na(newDat)] <- thisDat[!sel, ][is.na(newDat)]
121 |     exprs(exp_data[[i]])[!sel, adt_marker_select] <- newDat
122 |     warped_landmark_matrix[i, ] <- funs[[i]](landmark_matrix[i, ])
123 |     newRange[1] <- min(newRange[1], min(exprs(exp_data[[i]])[, adt_marker_select], na.rm = TRUE))
124 |     newRange[2] <- max(newRange[2], max(exprs(exp_data[[i]])[, adt_marker_select], na.rm = TRUE))
125 |   }
126 |   ## make sure that edge envents are set to the extreme values
127 |   ## of the warped data range and update the parameters slot
128 |   ## accordingly
129 |   for (i in seq_along(funs)) {
130 |     minSel <- leftBoard[[i]]
131 |     maxSel <- rightBoard[[i]]
132 |     exprs(exp_data[[i]])[minSel, adt_marker_select] <- as.matrix(rep(
133 |       newRange[1],
134 |       sum(minSel, na.rm = TRUE)
135 |     ),
136 |     ncol = 1
137 |     )
138 |     exprs(exp_data[[i]])[maxSel, adt_marker_select] <- as.matrix(rep(
139 |       newRange[2],
140 |       sum(maxSel, na.rm = TRUE)
141 |     ),
142 |     ncol = 1
143 |     )
144 |     ip <- match(adt_marker_select, pData(parameters(exp_data[[i]]))$name)
145 |     tmp <- parameters(exp_data[[i]])
146 |     oldRanges <- unlist(range(exp_data[[i]])[, adt_marker_select])
147 |     pData(tmp)[ip, c("minRange", "maxRange")] <- c(
148 |       min(oldRanges[1], newRange[1]),
149 |       max(oldRanges[2], newRange[2])
150 |     )
151 |     exp_data[[i]]@parameters <- tmp
152 |   }
153 | 
154 | 
155 |   # exp_data
156 |   exp_data_aligned <- as(exp_data, "flowSet")
157 |   phenoData(exp_data_aligned) <- phenoData(exp_data)
158 |   exp_data_aligned <- exp_data_aligned[sampleNames(exp_data)]
159 | 
160 |   ## convert into matrix
161 |   cell_x_adt_aligned <- c()
162 |   for(sample_name in paste0(levels(cell_x_feature$sample), ".fcs")){
163 |     cell_x_adt_aligned <- rbind(cell_x_adt_aligned, exp_data_aligned[[sample_name]] %>% exprs)
164 |   }
165 |   return(cell_x_adt_aligned %>% data.frame)
166 | }
167 | 


--------------------------------------------------------------------------------
/manuscript/COVID19/ADTnorm_parallel.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | args = commandArgs(trailingOnly=TRUE)
 3 | 
 4 | library(pacman)
 5 | library(ggplot2)
 6 | library(viridis)
 7 | library(data.table)
 8 | library(tidyr)
 9 | library(gridExtra)
10 | library(magrittr)
11 | library(umap)
12 | library(ggpubr)
13 | library(SingleCellExperiment)
14 | library(RColorBrewer)
15 | library(ggridges)
16 | library(dplyr)
17 | library(CytofRUV)
18 | library(shapr)
19 | library(fda)
20 | library(pryr)
21 | library(Seurat)
22 | library(systemfonts)
23 | library(shiny)
24 | library(httpgd)
25 | library(ggrepel)
26 | require(magrittr)
27 | require(ggplot2)
28 | require(RColorBrewer)
29 | require(tidyr)
30 | require(ggridges)
31 | require(cytotidyr)
32 | library(harmony)
33 | library(zellkonverter)
34 | library(ADTnorm)
35 | # p_load(flowStats, flowCore, FlowSOM, ncdfFlow, flowViz, pdfCluster, cluster)
36 | 
37 | ## =====================
38 | ## study name and paths
39 | ## =====================
40 | run_name = "COVID19"
41 | master_path = "./"
42 | out_path = paste0(master_path, "manuscript/results/", run_name)
43 | # fcs_path = paste0(out_path, "/FCS/")
44 | fig_path = paste0(out_path, "/Figures/", run_name)
45 | 
46 | adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_", run_name, ".rds"))
47 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds")) %>% data.frame
48 | 
49 | arcsinh_param <- list(a = 1, b = 1 / 5, c = 0)
50 | 
51 | method = "ADTnorm_sample_manual"
52 | adt_feature$sample = factor(adt_feature$site_sample)
53 | res_norm = ADTnorm(
54 |     cell_x_adt = adt_data, 
55 |     cell_x_feature = adt_feature, 
56 |     marker_to_process = args[1], 
57 |     customize_landmark = FALSE,
58 |     save_fig = TRUE,
59 |     save_landmark = TRUE,
60 |     save_outpath = out_path,
61 |     study_name = run_name,
62 |     trimodal_marker = c("CD4", "CD45RA", "CD49f", "CD62L"),
63 |     bw_smallest_bi = 1.1,
64 |     bw_smallest_tri = 1.1,
65 |     bw_smallest_adjustments = list(CD3 = 1.1, CD4 = 1.1, CD8 = 1.1),
66 |     shoulder_valley_slope = -1,
67 |     exclude_zeroes = FALSE,
68 |     bimodal_marker = NULL,
69 |     positive_peak = NULL,
70 |     quantile_clip = 1,
71 |     peak_type = "mode",
72 |     multi_sample_per_batch = TRUE,
73 |     shoulder_valley = TRUE,
74 |     valley_density_adjust = 3,
75 |     landmark_align_type = "negPeak_valley_posPeak",
76 |     midpoint_type = "valley",
77 |     neg_candidate_thres = asinh(3/5 + 1),
78 |     lower_peak_thres = 0.005,
79 |     brewer_palettes = "Set1",
80 |     detect_outlier_valley = FALSE,
81 |     target_landmark_location = NULL,
82 |     clean_adt_name = FALSE,
83 |     override_landmark = NULL,
84 |     verbose = TRUE)
85 | 
86 | 
87 | saveRDS(res_norm, file = paste0(out_path, "/RDS/adt_data_norm_", method, "_", args[1], "_", run_name, ".rds"))
88 | 
89 | 


--------------------------------------------------------------------------------
/manuscript/COVID19/RNA_Mono_DE.R:
--------------------------------------------------------------------------------
  1 | rna_data = readRDS(file = paste0(out_path, "/RDS/rna_data_", run_name, ".rds")) 
  2 | rm_ind = readRDS(file = paste0(out_path, "/RDS/rm_ind1_ind2_", run_name, ".rds"))
  3 | 
  4 | sce = readH5AD("./scripts/COVID19/data/covid_portal_210320_with_raw.h5ad")
  5 | 
  6 | rownames(rna_data) = colnames(sce@assays@data@listData$X[, -rm_ind])
  7 | colnames(rna_data) = rownames(sce@assays@data@listData$X)[1:24737]
  8 | 
  9 | > which(colnames(rna_data) == "CD38")
 10 | [1] 5476
 11 | > which(colnames(rna_data) == "FCGR1A")
 12 | [1] 1327
 13 | > which(colnames(rna_data) == "SIGLEC1")
 14 | [1] 21457
 15 | 
 16 | 
 17 | rna_data[cell_hd, which(colnames(rna_data) == "CD38")] %>% summary
 18 | rna_data[cell_pt, which(colnames(rna_data) == "CD38")] %>% summary
 19 | 
 20 | rna_data[cell_hd, which(colnames(rna_data) == "FCGR1A")] %>% summary
 21 | rna_data[cell_pt, which(colnames(rna_data) == "FCGR1A")] %>% summary
 22 | 
 23 | rna_data[cell_hd, which(colnames(rna_data) == "SIGLEC1")] %>% summary
 24 | rna_data[cell_pt, which(colnames(rna_data) == "SIGLEC1")] %>% summary
 25 | 
 26 | 
 27 | rna_norm = sce@assays@data@listData$X[1:24737, -rm_ind] %>% t()
 28 | 
 29 | ## for each mono cell type
 30 | rna_mono_summary = c()
 31 | for(cell_type_each in c("CD16_mono", "CD14_mono", "CD83_CD14_mono")){
 32 |     cell_hd = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary == "Healthy")
 33 |     cell_pt = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary %in% c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
 34 | 
 35 |     rna_mono_summary = data.frame(
 36 |         gene_exp = rna_norm[cell_hd, which(colnames(rna_norm) == "CD38")],
 37 |         cell_type = cell_type_each,
 38 |         status = "Healthy",
 39 |         gene = "CD38") %>% rbind(rna_mono_summary)
 40 | 
 41 |     rna_mono_summary = data.frame(
 42 |         gene_exp = rna_norm[cell_pt, which(colnames(rna_norm) == "CD38")],
 43 |         cell_type = cell_type_each,
 44 |         status = "Patient",
 45 |         gene = "CD38") %>% rbind(rna_mono_summary)
 46 |     
 47 |     rna_mono_summary = data.frame(
 48 |         gene_exp = rna_norm[cell_hd, which(colnames(rna_norm) == "FCGR1A")],
 49 |         cell_type = cell_type_each,
 50 |         status = "Healthy",
 51 |         gene = "FCGR1A") %>% rbind(rna_mono_summary)
 52 |     
 53 |     rna_mono_summary = data.frame(
 54 |         gene_exp = rna_norm[cell_pt, which(colnames(rna_norm) == "FCGR1A")],
 55 |         cell_type = cell_type_each,
 56 |         status = "Patient",
 57 |         gene = "FCGR1A") %>% rbind(rna_mono_summary)
 58 |     
 59 |     rna_mono_summary = data.frame(
 60 |         gene_exp = rna_norm[cell_hd, which(colnames(rna_norm) == "SIGLEC1")],
 61 |         cell_type = cell_type_each,
 62 |         status = "Healthy",
 63 |         gene = "SIGLEC1") %>% rbind(rna_mono_summary)
 64 | 
 65 |     rna_mono_summary = data.frame(
 66 |         gene_exp = rna_norm[cell_pt, which(colnames(rna_norm) == "SIGLEC1")],
 67 |         cell_type = cell_type_each,
 68 |         status = "Patient",
 69 |         gene = "SIGLEC1") %>% rbind(rna_mono_summary)
 70 | 
 71 | }
 72 | rna_mono_summary$status = factor(rna_mono_summary$status, levels = c("Patient", "Healthy"))
 73 | 
 74 | rna_mono_summary %>% ggplot(aes(x = gene, y = gene_exp, fill = status)) + 
 75 | geom_boxplot() + 
 76 | facet_wrap(~cell_type, scales = "free_y") + 
 77 | theme_bw(base_size = 25) + 
 78 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
 79 | scale_fill_brewer(palette = "Set1") +
 80 | xlab("") +
 81 | ylab("Normalized Gene Expression")
 82 | 
 83 | pdf("./results/COVID19/Figures/RNA_Mono_DE.pdf", width = 17, height = 8)
 84 | rna_mono_summary %>% dplyr::group_by(cell_type, status, gene) %>% dplyr::summarize(mean_gene_exp = mean(gene_exp), prop_pos_cell = sum(gene_exp > 0)/length(gene_exp) * 100) %>% ggplot(aes(x = gene, y = status, fill = mean_gene_exp, size = prop_pos_cell)) +
 85 | geom_point(shape = 21) +
 86 | facet_wrap(~cell_type) +
 87 | theme_bw(base_size = 25) +
 88 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
 89 | scale_fill_gradient(low = "white", high = "red") +
 90 | xlab("") +
 91 | ylab("") +
 92 | scale_size_continuous(range = c(1, 40)) +
 93 | theme(legend.position = "top")
 94 | dev.off()
 95 | 
 96 | 
 97 | 
 98 | ## for each mono cell type
 99 | rna_mono_summary = c()
100 | for(cell_type_each in c("CD16_mono", "CD14_mono", "CD83_CD14_mono")){
101 |     cell_hd = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary == "Healthy")
102 |     cell_pt = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary %in% c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical"))
103 | 
104 |     rna_mono_summary = data.frame(
105 |         gene_exp = rna_data[cell_hd, which(colnames(rna_data) == "CD38")],
106 |         cell_type = cell_type_each,
107 |         status = "Healthy",
108 |         gene = "CD38") %>% rbind(rna_mono_summary)
109 | 
110 |     rna_mono_summary = data.frame(
111 |         gene_exp = rna_data[cell_pt, which(colnames(rna_data) == "CD38")],
112 |         cell_type = cell_type_each,
113 |         status = "Patient",
114 |         gene = "CD38") %>% rbind(rna_mono_summary)
115 |     
116 |     rna_mono_summary = data.frame(
117 |         gene_exp = rna_data[cell_hd, which(colnames(rna_data) == "FCGR1A")],
118 |         cell_type = cell_type_each,
119 |         status = "Healthy",
120 |         gene = "FCGR1A") %>% rbind(rna_mono_summary)
121 |     
122 |     rna_mono_summary = data.frame(
123 |         gene_exp = rna_data[cell_pt, which(colnames(rna_data) == "FCGR1A")],
124 |         cell_type = cell_type_each,
125 |         status = "Patient",
126 |         gene = "FCGR1A") %>% rbind(rna_mono_summary)
127 |     
128 |     rna_mono_summary = data.frame(
129 |         gene_exp = rna_data[cell_hd, which(colnames(rna_data) == "SIGLEC1")],
130 |         cell_type = cell_type_each,
131 |         status = "Healthy",
132 |         gene = "SIGLEC1") %>% rbind(rna_mono_summary)
133 | 
134 |     rna_mono_summary = data.frame(
135 |         gene_exp = rna_data[cell_pt, which(colnames(rna_data) == "SIGLEC1")],
136 |         cell_type = cell_type_each,
137 |         status = "Patient",
138 |         gene = "SIGLEC1") %>% rbind(rna_mono_summary)
139 | 
140 | }
141 | rna_mono_summary$status = factor(rna_mono_summary$status, levels = c("Patient", "Healthy"))
142 | rna_mono_summary %>% ggplot(aes(x = gene, y = gene_exp, fill = status)) + 
143 | geom_boxplot() + 
144 | facet_wrap(~cell_type, scales = "free_y") + 
145 | theme_bw(base_size = 25) + 
146 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
147 | scale_fill_brewer(palette = "Set1") +
148 | xlab("") +
149 | ylab("Raw Gene Expression")
150 | 
151 | wilcox.test(rna_data[cell_hd, which(colnames(rna_data) == "CD38")],
152 | rna_data[cell_pt, which(colnames(rna_data) == "CD38")])
153 | 
154 | rna_data[cell_hd, which(colnames(rna_data) == "FCGR1A")] %>% summary
155 | rna_data[cell_pt, which(colnames(rna_data) == "FCGR1A")] %>% summary
156 | 
157 | rna_data[cell_hd, which(colnames(rna_data) == "SIGLEC1")] %>% summary
158 | rna_data[cell_pt, which(colnames(rna_data) == "SIGLEC1")] %>% summary
159 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/tests.html
 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
 8 | 
 9 | library(testthat)
10 | library(ADTnorm)
11 | 
12 | test_check("ADTnorm")
13 | 


--------------------------------------------------------------------------------
/tests/testthat/test-ADTnorm.R:
--------------------------------------------------------------------------------
 1 | test_that("ADTnorm works", {
 2 |   data(cell_x_adt)
 3 |   data(cell_x_feature)
 4 |   save_outpath <- tempdir()
 5 |   study_name <- "ADTnorm_demoRun"
 6 | 
 7 |   suppressWarnings({
 8 |     res <- ADTnorm(
 9 |       cell_x_adt = cell_x_adt,
10 |       cell_x_feature = cell_x_feature,
11 |       save_outpath = save_outpath,
12 |       study_name = study_name,
13 |       marker_to_process = c("CD3", "CD4", "CD8")
14 |     )
15 |   })
16 | 
17 |   expect_type(res, "list")
18 |   expect_equal(nrow(res), 422682)
19 |   expect_equal(ncol(res), 3)
20 | })
21 | 


--------------------------------------------------------------------------------
/vignettes/.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/vignettes/.RData


--------------------------------------------------------------------------------