├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── docker.yaml │ └── pkgdown.yaml ├── .gitignore ├── ADTnorm.Rproj ├── DESCRIPTION ├── LICENSE.md ├── NAMESPACE ├── R ├── ADTnorm.R ├── arcsinh_transform.R ├── clean_adt_name.R ├── data.R ├── detect_impute_outlier_valley.R ├── get_customize_landmark.R ├── get_neighbors.R ├── get_peak_midpoint.R ├── get_peak_mode.R ├── get_valley_location.R ├── globals.R ├── landmark_fill_na.R ├── load_landmarks.R ├── peak_alignment.R ├── plot_adt_density_each.R ├── plot_adt_density_with_peak_valley.R └── plot_adt_density_with_peak_valley_each.R ├── README.md ├── data ├── cell_x_adt.rda └── cell_x_feature.rda ├── inst └── Dockerfile ├── man ├── .DS_Store ├── ADTnorm.Rd ├── arcsinh_transform.Rd ├── cell_x_adt.Rd ├── cell_x_feature.Rd ├── clean_adt_name.Rd ├── detect_impute_outlier_valley.Rd ├── figures │ ├── ADTnorm.png │ ├── ArcsinhTransform_log10_CD4.png │ ├── PublicData_samplelevel_adtnorm.png │ ├── PublicData_samplelevel_raw.png │ ├── RawCount.png │ ├── ShinyR.png │ ├── lower_peak_thres.png │ ├── multi_sample_per_batch.png │ ├── peak_type.png │ ├── pipeline_202208.png │ ├── shoulder_valley.png │ ├── shoulder_valley_CD3.png │ ├── target_landmark_location.png │ └── valley_density_adjust.png ├── get_customize_landmark.Rd ├── get_neighbors.Rd ├── get_peak_midpoint.Rd ├── get_peak_mode.Rd ├── get_valley_location.Rd ├── landmark_fill_na.Rd ├── load_landmarks.Rd ├── peak_alignment.Rd ├── plot_adt_density_each.Rd ├── plot_adt_density_with_peak_valley.Rd └── plot_adt_density_with_peak_valley_each.Rd ├── manuscript ├── BenchmarkingAnalysis │ ├── ComparativeAnalysis │ │ ├── WNN_integrateRNA_ADT.R │ │ ├── auto_gating.R │ │ ├── create_TcellOnly_publicData.R │ │ ├── evaluate_norm.R │ │ ├── publicData_CITEseq_umap.R │ │ └── stain_quality.R │ └── ImplementMethods │ │ ├── analysis_compareMethods.R │ │ ├── arcsinh_centered_scaled_transform.R │ │ ├── arcsinh_centered_transform.R │ │ ├── arcsinh_clr_transform.R │ │ ├── arcsinh_transform.R │ │ ├── centered_arcsinh_transform.R │ │ ├── clr_transformation.R │ │ ├── cytofruv_transform.R │ │ ├── decontPro_transform.R │ │ ├── fdanorm_transform.R │ │ ├── gaussnorm_transform.R │ │ ├── get_peak_mode.R │ │ ├── get_valley.R │ │ ├── harmony_transform.R │ │ ├── landmark_fill_na.R │ │ ├── sciPENN_PublicDataset.py │ │ ├── totalVI_publicDataSet.ipynb │ │ └── warpset_transform.R ├── COVID19 │ ├── ADTnorm_parallel.R │ ├── RNA_Mono_DE.R │ ├── analysis.R │ └── auto_gating.R ├── HematopoieticStudy │ ├── hema_progenitor_titration.R │ └── hema_titrated_final.R └── TitrationStudy │ └── analysis.R ├── tests ├── testthat.R └── testthat │ └── test-ADTnorm.R └── vignettes ├── .RData └── ADTnorm-tutorial.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^ADTnorm\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.github$ 4 | ^LICENSE\.md$ 5 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | R_KEEP_PKG_SOURCE: yes 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::rcmdcheck, any::XML 27 | needs: check 28 | 29 | - uses: r-lib/actions/check-r-package@v2 30 | -------------------------------------------------------------------------------- /.github/workflows/docker.yaml: -------------------------------------------------------------------------------- 1 | # https://github.com/docker/build-push-action 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | 9 | name: docker 10 | 11 | jobs: 12 | docker: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Set up QEMU 16 | uses: docker/setup-qemu-action@v1 17 | - name: Set up Docker Buildx 18 | uses: docker/setup-buildx-action@v1 19 | - name: Login to GitHub Container Registry 20 | uses: docker/login-action@v1 21 | with: 22 | registry: ghcr.io 23 | username: ${{ github.repository_owner }} 24 | password: ${{ secrets.GITHUB_TOKEN }} 25 | - name: Build and push 26 | id: docker_build 27 | uses: docker/build-push-action@v2 28 | with: 29 | file: ./inst/Dockerfile 30 | push: true 31 | tags: | 32 | ghcr.io/yezhengstat/adtnorm:latest 33 | - name: Image digest 34 | run: echo ${{ steps.docker_build.outputs.digest }} 35 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | branches: 5 | - main 6 | tags: 7 | -'*' 8 | 9 | name: pkgdown 10 | 11 | jobs: 12 | pkgdown: 13 | runs-on: macOS-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - uses: r-lib/actions/setup-r@v2 20 | 21 | - uses: r-lib/actions/setup-pandoc@v2 22 | 23 | - name: Query dependencies 24 | run: | 25 | install.packages('remotes') 26 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 27 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 28 | shell: Rscript {0} 29 | 30 | - name: Restore R package cache 31 | uses: actions/cache@v2 32 | with: 33 | path: ${{ env.R_LIBS_USER }} 34 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 35 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 36 | 37 | - name: Install dependencies 38 | run: | 39 | remotes::install_deps(dependencies = TRUE) 40 | install.packages("pkgdown", type = "binary") 41 | shell: Rscript {0} 42 | 43 | - name: Install package 44 | run: R CMD INSTALL . 45 | 46 | - name: Deploy package 47 | run: | 48 | git config --local user.email "actions@github.com" 49 | git config --local user.name "GitHub Actions" 50 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | test.R 3 | .Rhistory 4 | -------------------------------------------------------------------------------- /ADTnorm.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | LineEndingConversion: Posix 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: ADTnorm 2 | Title: Robust Normalization and Integration of Single-cell Protein Expression across CITE-seq Datasets 3 | Version: 1.0 4 | Author: Ye Zheng 5 | Maintainer: Ye Zheng 6 | Authors@R: 7 | c(person(given = "Ye", 8 | family = "Zheng", 9 | role = c("aut", "cre"), 10 | email = "yzheng23@fredhutch.org", 11 | comment = c(ORCID = "0000-0002-8806-2761")), 12 | person(given = "Ju Yeong", 13 | family = "Kim", 14 | role = c("ctb"), 15 | email = "jkim2345@fredhutch.org"), 16 | person(given = "Daniel", 17 | family = "Caron", 18 | role = c("ctb"), 19 | email = "dpc2136@cumc.columbia.edu"), 20 | person(given = "Helen", 21 | family = "Lindsay", 22 | role = c("ctb"), 23 | email = "Helen.Lindsay@chuv.ch")) 24 | Description: CITE-seq technology enables the direct measurement of protein expression, known as antibody-derived tags (ADT), in addition to RNA expression. The increase in the copy number of protein molecules leads to a more robust detection of protein features compared to RNA, providing a deep definition of cell types. However, due to added discrepancies of antibodies, such as the different types or concentrations of IgG antibodies, the batch effects of the ADT component of CITE-seq can dominate over biological variations, especially for the across-study integration. We present ADTnorm as a normalization and integration method designed explicitly for the ADT counts of CITE-seq data. Benchmarking with existing scaling and normalization methods, ADTnorm achieves a fast and accurate matching of the negative and positive peaks of the ADT counts across samples, efficiently removing technical variations across batches. Further quantitative evaluations confirm that ADTnorm achieves the best cell-type separation while maintaining the minimal batch effect. Therefore, ADTnorm facilitates the scalable ADT count integration of massive public CITE-seq datasets with distinguished experimental designs, which are essential for creating a corpus of well-annotated single-cell data with deep and standardized annotations. 25 | License: GPL (>= 3) 26 | Encoding: UTF-8 27 | LazyData: true 28 | LazyDataCompression: xz 29 | Depends: 30 | R (>= 4.0.0) 31 | Imports: 32 | grDevices, 33 | stats, 34 | utils, 35 | magrittr, 36 | flowCore, 37 | flowStats, 38 | EMDomics, 39 | fda, 40 | RColorBrewer, 41 | ggpubr, 42 | ggridges, 43 | shiny, 44 | DT, 45 | dplyr, 46 | tidyr, 47 | ggplot2 48 | Suggests: 49 | knitr, 50 | rmarkdown, 51 | markdown, 52 | testthat (>= 3.0.0) 53 | Roxygen: list(markdown = TRUE) 54 | RoxygenNote: 7.2.3 55 | VignetteBuilder: knitr 56 | biocViews: SingleCell 57 | Config/testthat/edition: 3 58 | URL: https://yezhengSTAT.github.io/ADTnorm 59 | BugReports: https://github.com/yezhengSTAT/ADTnorm/issues 60 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(ADTnorm) 4 | export(arcsinh_transform) 5 | export(clean_adt_name) 6 | export(detect_impute_outlier_valley) 7 | export(get_customize_landmark) 8 | export(get_neighbors) 9 | export(get_peak_midpoint) 10 | export(get_peak_mode) 11 | export(get_valley_location) 12 | export(landmark_fill_na) 13 | export(load_landmarks) 14 | export(peak_alignment) 15 | export(plot_adt_density_each) 16 | export(plot_adt_density_with_peak_valley) 17 | export(plot_adt_density_with_peak_valley_each) 18 | import(dplyr) 19 | import(ggplot2) 20 | import(ggridges) 21 | import(shiny) 22 | import(tidyr) 23 | importFrom(magrittr,"%$%") 24 | importFrom(stats,quantile) 25 | -------------------------------------------------------------------------------- /R/arcsinh_transform.R: -------------------------------------------------------------------------------- 1 | #' arcsinh transformation. 2 | #' 3 | #' This function transforms the input cell_x_adt matrix by arcsinh with co-factor 5. The definition of this function is x_new <- asinh(a + b * x) + c) 4 | #' @param cell_x_adt Matrix where rows are cells and columns are ADT markers. 5 | #' @param parameter_list Parameter list for a: positive double that corresponds to a shift about 0; b: positive double that corresponds to a scale factor; c: positive double. By default a = 1, b = 1/5 and c = 0. 6 | #' @export 7 | #' @examples 8 | #' \dontrun{ 9 | #' arcsinh_transform(cell_x_adt) 10 | #' } 11 | 12 | arcsinh_transform = function(cell_x_adt = NULL, parameter_list = NULL){ 13 | ## parameters 14 | a = 1 15 | b = 1/5 16 | c = 0 17 | if(!is.null(parameter_list)){ 18 | if("a" %in% names(parameter_list)){ 19 | a = parameter_list[["a"]] 20 | } 21 | if("b" %in% names(parameter_list)){ 22 | b = parameter_list[["b"]] 23 | } 24 | if("c" %in% names(parameter_list)){ 25 | c = parameter_list[["c"]] 26 | } 27 | } 28 | ## transformation 29 | asinhTrans = flowCore::arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c) 30 | 31 | ## output 32 | out = asinhTrans(cell_x_adt) 33 | return(out) 34 | } 35 | -------------------------------------------------------------------------------- /R/clean_adt_name.R: -------------------------------------------------------------------------------- 1 | #' Clean ADT marker name 2 | #' 3 | #' This function enables the general cleaning of ADT marker names. Regardless, users should try cleaning and unifying their ADT marker name first. Please also ensure there is no "/" in the ADT name, such as "TCRγ/δ". 4 | #' @param adt_name A vector of ADT marker name. 5 | #' @export 6 | #' @examples 7 | #' \dontrun{ 8 | #' clean_adt_name(colnames(cell_x_adt)) 9 | #' } 10 | 11 | 12 | clean_adt_name <- function(adt_name) { 13 | adt_rename <- adt_name %>% 14 | gsub("CD19-CAR", "CARCD19", .) %>% 15 | gsub("_PROT", "", .) %>% 16 | gsub("_TotalSeqB", "", .) %>% 17 | gsub("_control", "", .) %>% 18 | gsub("-GA.*", "", .) %>% 19 | gsub("-GC.*", "", .) %>% 20 | gsub("-GT.*", "", .) %>% 21 | gsub("-GG.*", "", .) %>% 22 | gsub("-CA.*", "", .) %>% 23 | gsub("-CC.*", "", .) %>% 24 | gsub("-CT.*", "", .) %>% 25 | gsub("-CG.*", "", .) %>% 26 | gsub("-AA.*", "", .) %>% 27 | gsub("-AC.*", "", .) %>% 28 | gsub("-AT.*", "", .) %>% 29 | gsub("-AG.*", "", .) %>% 30 | gsub("-TA.*", "", .) %>% 31 | gsub("-TC.*", "", .) %>% 32 | gsub("-TG.*", "", .) %>% 33 | gsub("-TT.*", "", .) %>% 34 | gsub("_.*", "", .) %>% 35 | gsub("ADT-", "", .) 36 | 37 | if (!("CD8" %in% adt_rename)) { 38 | if ("CD8a" %in% adt_rename | "CD8A" %in% adt_rename) { 39 | ind <- which(adt_rename == "CD8a" | adt_rename == "CD8A") 40 | adt_rename[ind] <- "CD8" 41 | } else if ("CD8b" %in% adt_rename | "CD8B" %in% adt_rename) { 42 | ind <- which(adt_rename == "CD8B" | adt_rename == "CD8b") 43 | adt_rename[ind] <- "CD8" 44 | } 45 | } 46 | adt_rename <- replace(adt_rename, adt_rename == "MouseIgG1kappaisotype", "IgG1") 47 | adt_rename <- replace(adt_rename, adt_rename == "MouseIgG2akappaisotype", "IgG2a") 48 | adt_rename <- replace(adt_rename, adt_rename == "Mouse IgG2bkIsotype", "IgG2b") 49 | adt_rename <- replace(adt_rename, adt_rename == "RatIgG2bkIsotype", "IgG2b-Rat") 50 | 51 | adt_rename <- replace(adt_rename, adt_rename == "Mouse-IgG1", "IgG1") 52 | adt_rename <- replace(adt_rename, adt_rename == "Mouse-IgG2a", "IgG2a") 53 | adt_rename <- replace(adt_rename, adt_rename == "Mouse-IgG2b", "IgG2b") 54 | adt_rename <- replace(adt_rename, adt_rename == "Rat-IgG2b", "IgG2b-Rat") 55 | 56 | adt_rename <- adt_rename %>% gsub("HLA-DR", "HLADR", .) %>% gsub("HLA\\.DR", "HLADR", .) %>% gsub("PD-1", "PD1", .) %>% gsub(" ", "", .) %>% gsub("\\s*\\([^\\)]+\\)", "", .) %>% gsub("CD279", "PD1", .) %>% gsub("CD274", "PDL1", .) %>% gsub("\\.", "", .) %>% gsub("-", "", .) %>% gsub("/", "",.) 57 | 58 | return(adt_rename) 59 | } 60 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' A matrix of raw count for the cell by ADT markers 2 | #' 3 | #' A dataset containing 422682 cells and 9 ADT markers for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets. 4 | #' 5 | #' @format A data frame with 422682 rows and 9 variables: 6 | #' \describe{ 7 | #' \item{CD3}{CD3 ADT marker raw count across each cell} 8 | #' \item{CD4}{CD4 ADT marker raw count across each cell} 9 | #' \item{CD8}{CD8 ADT marker raw count across each cell} 10 | #' \item{CD14}{CD14 ADT marker raw count across each cell} 11 | #' \item{CD19}{CD19 ADT marker raw count across each cell} 12 | #' \item{CD25}{CD25 ADT marker raw count across each cell} 13 | #' \item{CD45RA}{CD45RA ADT marker raw count across each cell} 14 | #' \item{CD56}{CD56 ADT marker raw count across each cell} 15 | #' \item{CD127}{CD127 ADT marker raw count across each cell} 16 | #' } 17 | #' @source See detailed description in the manuscript 18 | "cell_x_adt" 19 | 20 | #' A matrix of raw count for the cell by features 21 | #' 22 | #' A dataset containing 422682 cells and 7 feature categories for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets. 23 | #' 24 | #' @format A data frame with 422682 rows and 7 variables: 25 | #' \describe{ 26 | #' \item{sample}{Sample name. In this demo data, the sample name is the same as the study_name, assuming that one study is one sample.} 27 | #' \item{batch}{Batch ID. In this demo data, the batch ID is the same as the study_name.} 28 | #' \item{sample_status}{Sample status, i.e., Healthy, MALTtumor, HIV Vaccine, Lupus, B-ALL, AML.} 29 | #' \item{study_name}{Name of the data set/study.} 30 | #' \item{ADTseqDepth}{Total UMI per cell.} 31 | #' \item{cell_type_l1}{Broad level of cell type annotation using manual gating.} 32 | #' \item{cell_type_l2}{Fine level of cell type annotation using manual gating.} 33 | #' } 34 | #' @source See detailed description in the manuscript 35 | "cell_x_feature" -------------------------------------------------------------------------------- /R/detect_impute_outlier_valley.R: -------------------------------------------------------------------------------- 1 | #' Identify the valley outliers and impute by valley by closet neighbors. 2 | #' 3 | #' This function identifies the valley(s) that tend to be outliers compared to other valley locations and tries to find the closest samples with similar density distribution to impute the valley. If no neighbor sample is detected, the valley will remain as original. 4 | #' @param valley_location_res Matrix of valley landmark locations with rows being samples and columns being the valleys. 5 | #' @param adt_marker_select The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker. 6 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. 7 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information. 8 | #' @param scale Scale level to defining outlier. A larger scale value corresponds to more severe outliers. 9 | #' @param method Outlier detection methods, choose from "MAD" (Median Absolute Deviation) or "IQR" (InterQuartile Range). The default is MAD. 10 | #' @param nearest_neighbor_n Number of top nearest neighbor samples to detect. 11 | #' @param nearest_neighbor_threshold Threshold to call neighbor samples. 12 | #' @examples 13 | #' \dontrun{ 14 | #' detect_impute_outlier_valley(valley_location_res, cell_x_feature) 15 | #' } 16 | # require(EMDomics) 17 | # require(dplyr) 18 | #' @export 19 | detect_impute_outlier_valley <- function(valley_location_res, adt_marker_select, cell_x_adt, cell_x_feature, scale = 3, method = "MAD", nearest_neighbor_n = 3, nearest_neighbor_threshold = 0.75){ 20 | 21 | if(!(method %in% c("MAD", "IQR"))){ 22 | stop("Please choose the outlier valley detection method to be 'MAD' or 'IQR'.") 23 | } 24 | ## get batch information 25 | valley_df <- valley_location_res %>% data.frame %>% mutate(sample = rownames(valley_location_res)) 26 | valley_df <- left_join(valley_df, cell_x_feature %>% select(sample, batch) %>% unique, by = "sample") 27 | 28 | ## within each batch find the valley outlier and impute by the nearest neighbor samples' valley 29 | for(batch_each in cell_x_feature$batch %>% unique){ 30 | 31 | ## get the sample id within each batch 32 | sample_select <- which(valley_df$batch == batch_each) 33 | 34 | if(length(sample_select) > 2){ ## more than two sample per batch 35 | ## for each valley 36 | for(c in 1:ncol(valley_location_res)){ 37 | 38 | ## choose outlier detection method 39 | if(method == "MAD"){ 40 | row_index <- sample_select[which(abs(valley_location_res[sample_select, c] - stats::median(valley_location_res[sample_select, c], na.rm = TRUE)) > stats::mad(valley_location_res[sample_select, c], na.rm = TRUE) * scale)] 41 | 42 | }else if(method == "IQR"){ 43 | row_index <- sample_select[which(stats::quantile(valley_location_res[sample_select, c], 0.75, na.rm = TRUE) + scale * stats::IQR(valley_location_res[sample_select, c], na.rm = TRUE) < valley_location_res[sample_select, c])] 44 | }else{ 45 | return("Please select method from MAD or IQR") 46 | } 47 | 48 | ## for each detected outlier sample, find the nearest neighbors 49 | 50 | if(length(row_index) > 0){ 51 | print(paste0("Outlier valley for sample: ", valley_df$sample[row_index], " Valley: ", c)) 52 | for(target_sample in valley_df$sample[row_index]){ 53 | ## neighbors at most 3 and earth mover distance <= 0.75 by default 54 | target_neighbors <- get_neighbors(target_sample, adt_marker_select, cell_x_adt, cell_x_feature, nearest_neighbor_n = nearest_neighbor_n, nearest_neighbor_threshold = nearest_neighbor_threshold) #valley_df$sample[row_index], 55 | print(paste0("Outlier sample ", target_sample, " nearest neighbors: ", target_neighbors)) 56 | 57 | ## if there is qualified neighbors to impute 58 | ## otherwise, this is a unique sample marker distribution. Leave original valley value. 59 | if(length(target_neighbors) > 0){ 60 | valley_location_res[target_sample, c] <- valley_location_res[target_neighbors, c] %>% stats::median 61 | } 62 | } 63 | } 64 | } 65 | } 66 | 67 | } 68 | return(valley_location_res) 69 | } 70 | -------------------------------------------------------------------------------- /R/get_customize_landmark.R: -------------------------------------------------------------------------------- 1 | #' Prompt Shiny browser to manually customize peaks and valleys locations. 2 | #' 3 | #' This function will launch a shiny app allowing the user to set the location of peaks and valleys manually. The function will output the landmark positions that the user has set. 4 | #' @param cell_x_adt_sample Matrix of ADT counts of the selected marker, with columns of sample and batch information for each row of cells. 5 | #' @param landmark_pos Matrix of landmark location including peaks and valleys. 6 | #' @param bw Bandwidth for the density plot. 7 | #' @param adt_marker_select_name The ADT marker needed to be manually processed to set the landmarks. 8 | #' @param brewer_palettes Set the color scheme of the color brewer. The default is "Set1". 9 | #' @examples 10 | #' \dontrun{ 11 | #' get_customize_landmark( 12 | #' cell_x_adt_sample, 13 | #' landmark_pos, 14 | #' bw, 15 | #' adt_marker_select_name, 16 | #' brewer_palettes = "Set1" 17 | #' )} 18 | #' @export 19 | #' @import ggplot2 ggridges shiny 20 | get_customize_landmark = function(cell_x_adt_sample, landmark_pos, bw, adt_marker_select_name, brewer_palettes = "Set1"){ 21 | 22 | max_value = ceiling(max(cell_x_adt_sample[, 1], na.rm = TRUE)) + 2 23 | # cell_x_adt_sample_filter = cell_x_adt_sample %>% dplyr::filter(!is.na(adt)) 24 | sample_num = levels(cell_x_adt_sample$sample) %>% length 25 | plot_height = paste0(sample_num * 50, "px") 26 | cell_x_adt_sample$sample_ordered = factor(cell_x_adt_sample$sample, levels = levels(cell_x_adt_sample$sample)) 27 | ## Create an example UI 28 | ui <- create_ui(landmark_pos, max_value, bw, plot_height) 29 | server <- create_server(landmark_pos, cell_x_adt_sample, bw, adt_marker_select_name, brewer_palettes, max_value) 30 | 31 | ## Return user input, vals can change 32 | res <- shiny::runApp(shinyApp(ui, server)) 33 | return(res) 34 | 35 | } 36 | 37 | flat_table <- function(tab) { 38 | tmp <- lapply(1:nrow(tab), function(i) { 39 | data.frame( 40 | peaky = i, 41 | y = rownames(tab)[i], 42 | x = sapply(1:ncol(tab), function(j) { 43 | tab[i,j] 44 | }), 45 | name = colnames(tab), 46 | type = sub("\\d+$", "", colnames(tab)) 47 | ) 48 | }) 49 | do.call(rbind, tmp) 50 | } 51 | 52 | create_ui <- function(landmark_pos, max_value, bw, plot_height) { 53 | ui <- fluidPage( 54 | sidebarLayout( 55 | sidebarPanel( 56 | selectInput(inputId = "landmark_select", 57 | label = "Select the sample to tune:", 58 | choices = rev(rownames(landmark_pos))), 59 | uiOutput(outputId = "landmark_slider_ui") 60 | ), 61 | mainPanel( 62 | sliderInput(inputId = "bandwidth_slider", 63 | label = "Bandwidth for Density Visualization Below:", 64 | min = 0, 65 | max = 3, 66 | value = bw, 67 | step = 0.05, width = '1000px'), 68 | plotOutput("plot", height = plot_height, width = '1000px'), 69 | DT::dataTableOutput("table"), 70 | shiny::actionButton("done", "Step 2: Record User Input and Exit") 71 | ) 72 | )) 73 | return(ui) 74 | } 75 | 76 | 77 | create_server <- function(landmark_pos, cell_x_adt_sample, bw = 0.1, adt_marker_select_name, brewer_palettes = "Set1", max_value) { 78 | 79 | fillColor <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(cell_x_adt_sample$batch))) 80 | 81 | 82 | server <- function(input, output, session) { 83 | vals <- shiny::reactiveValues( 84 | landmark_pos = landmark_pos, 85 | slider_values = flat_table(landmark_pos), 86 | fig = NULL 87 | ) 88 | 89 | output$table <- DT::renderDataTable(vals$landmark_pos %>% DT::datatable() %>% DT::formatRound(columns = colnames(landmark_pos), digits = 3)) 90 | 91 | output$plot <- renderPlot({ 92 | bw_value <- input$bandwidth_slider 93 | # no_mis_idx = !is.na(cell_x_adt_sample$adt) 94 | 95 | vals$fig <- ggplot(cell_x_adt_sample, aes_string(x = "adt", y = "sample_ordered")) + 96 | ggridges::geom_density_ridges2(aes(fill = factor(batch)), bandwidth = bw_value) + 97 | theme_bw(base_size = 20) + 98 | ggpubr::rotate_x_text(angle = 90) + 99 | ggpubr::rremove("legend") + 100 | xlab(paste0(adt_marker_select_name, " ADT Counts")) + 101 | ylab("Sample") + 102 | scale_fill_manual(values = fillColor) + 103 | scale_x_continuous(breaks = seq(0, max_value, 0.5)) + 104 | # geom_point(data = vals$slider_values, aes_string(x = "x", y = "y", shape = "type"), size = 5) 105 | geom_segment( 106 | data = vals$slider_values, 107 | aes(x = x, xend = x, y = peaky, yend = peaky+0.5, color = type), 108 | size = 1 109 | ) + 110 | scale_color_manual(values = c("peak" = "black", "valley" = "grey")) 111 | 112 | vals$fig 113 | }) 114 | 115 | 116 | output$landmark_slider_ui <- renderUI({ 117 | if (!is.null(input$landmark_select)) { 118 | fluidRow( 119 | column(12, 120 | h2(input$landmark_select), 121 | lapply(1:ncol(landmark_pos), function(j) { 122 | i <- which(rownames(landmark_pos) == input$landmark_select) 123 | val <- vals$landmark_pos[i, j] 124 | na_val <- is.na(val) 125 | val <- ifelse(na_val, 0, val) 126 | fluidRow( 127 | checkboxInput( 128 | inputId = paste0("na_checkbox", j), 129 | label = "Set as NA", 130 | value = na_val 131 | ), 132 | sliderInput( 133 | inputId = paste0(colnames(landmark_pos)[j]), 134 | label = colnames(landmark_pos)[j], 135 | min = 0, 136 | max = max_value, 137 | value = val, 138 | step = 0.0001 139 | ) 140 | ) 141 | })), 142 | shiny::actionButton("applied", "Step 1: Set Customized Location") 143 | ) 144 | } 145 | }) 146 | 147 | observeEvent(input$applied, { 148 | i <- which(rownames(landmark_pos) == input$landmark_select) 149 | for (j in 1:ncol(landmark_pos)) { 150 | if (input[[paste0("na_checkbox", j)]]) { 151 | vals$landmark_pos[i, j] <- NA 152 | } else { 153 | vals$landmark_pos[i, j] <- input[[paste0(colnames(landmark_pos)[j])]] 154 | } 155 | } 156 | output$table <- DT::renderDataTable(vals$landmark_pos %>% DT::datatable() %>% DT::formatRound(columns = colnames(landmark_pos), digits = 3)) 157 | 158 | slider_values <- flat_table(vals$landmark_pos) 159 | 160 | output$plot <- renderPlot({ 161 | bw_value <- input$bandwidth_slider 162 | 163 | vals$fig <- ggplot(cell_x_adt_sample, aes_string(x = "adt", y = "sample_ordered")) + 164 | ggridges::geom_density_ridges2(aes(fill = factor(batch)), bandwidth = bw_value) + 165 | theme_bw(base_size = 20) + 166 | ggpubr::rotate_x_text(angle = 90) + 167 | ggpubr::rremove("legend") + 168 | xlab(paste0(adt_marker_select_name, " ADT Counts")) + 169 | ylab("Sample") + 170 | scale_fill_manual(values = fillColor) + 171 | scale_x_continuous(breaks = seq(0, max_value, 0.5)) + 172 | # geom_point(data = slider_values, aes_string(x = "x", y = "y", shape = "type"), size = 5) 173 | geom_segment( 174 | data = slider_values, 175 | aes(x = x, xend = x, y = peaky, yend = peaky+0.5, color = type), 176 | size = 1 177 | ) + 178 | scale_color_manual(values = c("peak" = "black", "valley" = "grey")) 179 | vals$fig 180 | }) 181 | }) 182 | 183 | shiny::observeEvent(input$done, { 184 | shiny::stopApp(vals$landmark_pos) 185 | }) 186 | } 187 | 188 | return(server) 189 | 190 | } 191 | 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /R/get_neighbors.R: -------------------------------------------------------------------------------- 1 | #' Find the closest neighbors to impute outlier valleys. 2 | #' 3 | #' This function identifies the valley that tends to be outliers compared to other valley locations and tries to find the closest samples that have similar density distribution to input the valley. If no neighbor sample is detected, the valley will remain as original. 4 | #' @param target_sample The target sample whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample. 5 | #' @param adt_marker_select The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker. 6 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. 7 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information. 8 | #' @param nearest_neighbor_n Number of top nearest neighbor samples to detect. 9 | #' @param nearest_neighbor_threshold Threshold to call neighbor samples. 10 | #' @export 11 | #' @examples 12 | #' \dontrun{ 13 | #' get_neighbors(target_sample, adt_marker_select, cell_x_adt, cell_x_feature) 14 | #' } 15 | # require(EMDomics) 16 | # require(dplyr) 17 | get_neighbors <- function(target_sample, adt_marker_select, cell_x_adt, cell_x_feature, nearest_neighbor_n = 3, nearest_neighbor_threshold = NULL){ 18 | 19 | knn_res <- c() 20 | target_cell_ind <- which(cell_x_feature$sample == target_sample) 21 | sample_list <- setdiff(cell_x_feature$sample %>% unique, target_sample) 22 | for(sample in sample_list){ 23 | cell_ind <- which(cell_x_feature$sample == sample) 24 | exp_data <- c(cell_x_adt[target_cell_ind, adt_marker_select], cell_x_adt[cell_ind, adt_marker_select]) 25 | names(exp_data) <- rownames(cell_x_adt)[c(target_cell_ind, cell_ind)] 26 | labels <- c(rep("target", length(target_cell_ind)), rep("sample", length(cell_ind))) 27 | names(labels) <- rownames(cell_x_adt)[c(target_cell_ind, cell_ind)] 28 | 29 | knn_res <- c(knn_res, EMDomics::calculate_emd_gene(exp_data, labels, names(exp_data))) 30 | } 31 | names(knn_res) <- sample_list 32 | if(is.null(nearest_neighbor_threshold)){ 33 | return(knn_res %>% sort %>% utils::head(nearest_neighbor_n) %>% names) 34 | }else{ 35 | return(knn_res[knn_res <= nearest_neighbor_threshold] %>% sort %>% utils::head(nearest_neighbor_n) %>% names) 36 | } 37 | 38 | 39 | } 40 | -------------------------------------------------------------------------------- /R/get_valley_location.R: -------------------------------------------------------------------------------- 1 | #' Get the valley landmark locations. 2 | #' 3 | #' This function detects the valley locations either between every two peak landmarks or cut at the right heavy tails. If the specified positive uni-peak, the valley location will be set at the left side of the uni-peak. 4 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. 5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information. 6 | #' @param adt_marker_select Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix. 7 | #' @param peak_mode_res The peak landmark results coming out of `get_peak_mode` or `get_peak_midpoint` function. 8 | #' @param shoulder_valley Indictor to specify whether a shoulder valley is expected in case of the heavy right tail where the population of cells should be considered as positive population. 9 | #' @param positive_peak A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples. 10 | #' @param multi_sample_per_batch Set it to TRUE to discard the positive peak that only appears in one sample per batch (sample number is >=3 per batch). 11 | #' @param adjust Parameter for `density` function: bandwidth used is actually adjust*bw. This makes it easy to specify values like 'half the default' bandwidth. 12 | #' @param min_fc Mimimal fold change between the highest peak density height and candidate valley density height. The default is 20. 13 | #' @param shoulder_valley_slope The slope on the ADT marker density distribution to call shoulder valley. 14 | #' @param neg_candidate_thres The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95% quantile of IgG samples is large. 15 | #' @param lower_peak_thres The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid suspicious positive peaks. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19. 16 | #' @param arcsinh_transform_flag The flag indicates if the input is raw count and arcsinh transformation is implemented. 17 | #' @examples 18 | #' \dontrun{ 19 | #' get_valley_location(cell_x_adt, cell_x_feature, peak_mode_res) 20 | #' } 21 | #' @export 22 | #' @importFrom magrittr %$% 23 | get_valley_location = function(cell_x_adt = NULL, cell_x_feature = NULL, adt_marker_select = NULL, peak_mode_res = NULL, shoulder_valley = TRUE, positive_peak = NULL, multi_sample_per_batch = FALSE, adjust = 1.5, min_fc = 20, shoulder_valley_slope = -1, lower_peak_thres = 0.01, neg_candidate_thres = asinh(10/5 + 1), arcsinh_transform_flag = TRUE) { 24 | 25 | peak_landmark_list = peak_mode_res 26 | 27 | ## tag sample if it is the only sample within each batch that have different number of peaks. 28 | # batch_num = cell_x_feature$batch %>% unique %>% length 29 | tag_row = vector("list", length = nrow(peak_landmark_list)) 30 | names(tag_row) = rownames(peak_landmark_list) 31 | 32 | if(multi_sample_per_batch && ncol(peak_landmark_list) > 1){ ## multiple sample per batch remove the positive peak that only appear in one sample 33 | for(batch_each in unique(cell_x_feature$batch)){ 34 | sample_index = cell_x_feature %>% dplyr::filter(batch == batch_each) %$% sample %>% unique 35 | ## if there is only one second peak, remove it 36 | if(length(sample_index) > 2){ 37 | col_index = which(colSums(!is.na(peak_landmark_list[sample_index, ])) == 1) ## which col has only 1 peak per batch 38 | # common_peak_col = which(colSums(!is.na(peak_landmark_list[sample_index, ])) != 1) #ncol(peak_landmark_list[sample_index, ]) - length(col_index) 39 | for(c in col_index){ ## across those columns that only have 1 peak per batch 40 | row_index = sample_index[which(!is.na(peak_landmark_list[sample_index, c]))] ## get the sample name 41 | tag_row[[row_index]] = c(tag_row[[row_index]], c) #common_peak_n #non-zero value, record the peak number that is common across samples in this batch 42 | } 43 | } 44 | } 45 | } 46 | 47 | valley_location_list = matrix(NA, nrow = nrow(peak_landmark_list), ncol = max(1, ncol(peak_landmark_list) - 1)) 48 | rownames(valley_location_list) = cell_x_feature$sample %>% levels() 49 | for (sample_name in cell_x_feature$sample %>% levels()) { 50 | # cell_ind = which(cell_x_feature$sample == sample_name) 51 | cell_ind_tmp = which(cell_x_feature$sample == sample_name) 52 | cell_notNA = which(!is.na(cell_x_adt[cell_ind_tmp, adt_marker_select])) 53 | cell_ind = cell_ind_tmp[cell_notNA] 54 | if(length(cell_ind) > 0){ 55 | peak_landmark = peak_landmark_list[sample_name, ] 56 | if(arcsinh_transform_flag){ 57 | zero_prop = sum(cell_x_adt[cell_ind, adt_marker_select] < 2) / length(cell_x_adt[cell_ind, adt_marker_select]) 58 | }else{ 59 | zero_prop = sum(cell_x_adt[cell_ind, adt_marker_select] < neg_candidate_thres) / length(cell_x_adt[cell_ind, adt_marker_select]) 60 | } 61 | 62 | ## check if user define single peak to be positive peak 63 | pos_marker_index = which(paste0("tmpName", positive_peak$ADT_index) == adt_marker_select) 64 | pos_sample_index = which(positive_peak$sample == sample_name) 65 | if (length(intersect(pos_marker_index, pos_sample_index)) > 0) { 66 | lower_valley = TRUE 67 | } else { 68 | lower_valley = FALSE 69 | } 70 | 71 | 72 | density_res = stats::density( 73 | cell_x_adt[which(cell_x_feature$sample == sample_name), adt_marker_select], 74 | adjust = adjust, na.rm = TRUE 75 | ) 76 | x = density_res$x 77 | y = density_res$y 78 | 79 | sign_diff = sign(diff(y)) 80 | diff_sign_diff = diff(sign_diff) 81 | peak = which(diff_sign_diff == -2) + 1 82 | valley = which(diff_sign_diff == 2) + 1 83 | 84 | x_valley = x[valley] 85 | y_valley = y[valley] 86 | x_peak = x[peak] 87 | real_peak = peak_landmark[!is.na(peak_landmark)] # peak 88 | np = length(real_peak) 89 | 90 | ## if this sample is tagged, choose the highest peak 91 | if (!is.null(tag_row[[sample_name]])) {## there are peaks need to be removed 92 | peak_landmark_y = c() 93 | for (peak_landmark_each in peak_landmark) { 94 | peak_landmark_y = c(peak_landmark_y, y[which.min(abs(x - peak_landmark_each))]) ## get the density value for the peak location 95 | } 96 | common_peak_n = length(peak_landmark_y) - length(tag_row[[sample_name]]) 97 | real_peak = real_peak[sort(order(peak_landmark_y, decreasing = T)[1:common_peak_n])] ## get the highest peak 98 | np = length(real_peak) ## update number of peak i.e, np 99 | peak_landmark_list[sample_name, ] = NA 100 | peak_landmark_list[sample_name, (1:ncol(peak_landmark_list))[-tag_row[[sample_name]]]] = real_peak ## remove the outlier peak 101 | } 102 | 103 | 104 | if (np > 1) { ## two peaks or more 105 | real_valley = c() 106 | for (i in 1:(np - 1)) { 107 | tmp_valley = x_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])] 108 | tmp_real_valley = tmp_valley[which.min(y_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])])] 109 | if(length(tmp_real_valley) == 0){ ## no minimal point between two peak, ensure tmp_real_valley is not empty 110 | tmp_real_valley = (real_peak[i] + real_peak[i+1]) / 2 111 | } 112 | if(i == 1 && shoulder_valley){ 113 | shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope) 114 | first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50 115 | x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]] 116 | real_valley = c(real_valley, min(x_shoulder, tmp_real_valley, na.rm = T)) 117 | }else{ 118 | if(length(tmp_valley) == 0){ ## there is no local minimum that fall within two peaks 119 | real_valley = c(real_valley, (real_peak[i] + real_peak[i+1]) / 2) ## use the midpoint of two peak location --- maybe shoulder peak instead? 120 | }else{ 121 | real_valley = c(real_valley, tmp_real_valley) #c(real_valley, tmp_valley[which.min(y_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])])]) ## local minimum based on the density 122 | } 123 | 124 | } 125 | } 126 | } else if(np == 1) { ## one peak 127 | if (lower_valley == FALSE) { ## one peak is negative peak 128 | real_valley = x_valley[x_valley > real_peak[1] + 0.1][1] #x[which(y < max(y) / min_fc)[which(y < max(y) / min_fc) > max(which(y == max(y)), which(x > real_peak[1]) %>% min())] %>% min()] 129 | if(shoulder_valley){ 130 | ## if one peak & go for shoulder threshold 131 | shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope) 132 | first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50 133 | x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]] 134 | if(is.na(x_shoulder) & is.na(real_valley)){ 135 | stop("No valley is detected. Please consider increasing 'valley_density_adjust'.") 136 | }else{ 137 | real_valley = min(x_shoulder, real_valley, na.rm = T) 138 | } 139 | 140 | 141 | }else{ 142 | ## check if no valley is detected due to shoulder peak 143 | # if(length(y_valley[x_valley > real_peak[1]]) == 0 || (y_valley[x_valley > real_peak[1] + 0.1][1] < 0.05)){ 144 | # ## if one peak consider the shoulder point 145 | # shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope) 146 | # first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50 147 | # x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]] 148 | # real_valley = min(x_shoulder, real_valley, na.rm = T) 149 | # } 150 | if (length(y_valley[!is.na(x_valley) & x_valley > real_peak[1]]) == 0 || (y_valley[!is.na(x_valley) & x_valley > real_peak[1] + 0.1][1] < 0.05)) { 151 | shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope) 152 | first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50 153 | x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]] 154 | real_valley = min(x_shoulder, real_valley, na.rm = TRUE) 155 | } 156 | } 157 | if (zero_prop > 0.8) { 158 | real_valley = max(neg_candidate_thres, real_valley) 159 | } 160 | } else { ## one peak is positive peak 161 | real_valley = x[which((y < max(y) / min_fc) | (y < lower_peak_thres))[which((y < max(y) / min_fc) | (y < lower_peak_thres))< min(which(y == max(y)), which(x < real_peak[1]) %>% max())] %>% max()] 162 | # peak_landmark_list[sample_name, ] = asinh(0/5 + 1) 163 | # peak_landmark_list[sample_name, ncol(peak_landmark_list)] = real_peak[1] 164 | # real_valley = min(real_valley, min(cell_x_adt[cell_ind, adt_marker_select])) #min(real_valley, asinh(0/5 + 1)) 165 | } 166 | } else { ## no peak 167 | real_valley = NA 168 | } 169 | ## check if no valley is detected due to shoulder peak 170 | if (length(real_valley) == 0 || all(is.na(real_valley))) { 171 | if(length(real_peak) >= 2){ ## midpoint of two peak 172 | real_valley = (real_peak[1] + real_peak[2]) / 2 173 | }else{## shoulder valley 174 | shoulder_cand_index = which(diff(y)/diff(x) > shoulder_valley_slope) 175 | first_peak_index = (which(x > max(x_peak[1], real_peak[1])) %>% min) + 50 176 | x_shoulder = x[shoulder_cand_index[shoulder_cand_index > first_peak_index][1]] 177 | if(!is.na(x_shoulder)){ 178 | real_valley = min(x_shoulder, real_valley, na.rm = T) 179 | } 180 | } 181 | 182 | } 183 | valley_location_list[sample_name, 1:length(real_valley)] = real_valley 184 | } 185 | 186 | 187 | } 188 | 189 | ## update peak_landmark_list if needed 190 | if(any(colSums(!is.na(peak_landmark_list)) == 0)){ ## remove column that only has NA values 191 | col_index = which(colSums(!is.na(peak_landmark_list)) == 0) 192 | peak_landmark_update = matrix(NA, nrow(peak_landmark_list), ncol(peak_landmark_list) - length(col_index)) 193 | i = 1 194 | for (c in 1:ncol(peak_landmark_list)) { 195 | if (!(c %in% col_index)) { 196 | peak_landmark_update[, i] = peak_landmark_list[, c] 197 | i = i + 1 198 | } 199 | } 200 | rownames(peak_landmark_update) = rownames(peak_landmark_list) 201 | peak_landmark_list = peak_landmark_update 202 | } 203 | return(list(valley_location_list = valley_location_list, peak_landmark_list = peak_landmark_list)) 204 | } 205 | -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | utils::globalVariables(c("peakx", "peaky", "peaks", "batch", "counts", ".")) 2 | -------------------------------------------------------------------------------- /R/landmark_fill_na.R: -------------------------------------------------------------------------------- 1 | #' Merge locations of peak and valley landmarks 2 | #' 3 | #' This function merges the peak and valley landmarks locations and fills in NA if the landmark is not detected. 4 | #' @param peak_landmark_list Matrix of peak landmark detection results. Rows are samples, and column(s) are the peak locations. 5 | #' @param valley_landmark_list Matrix of valley landmark detection results. Rows are samples, and column(s) are the valley locations. 6 | #' @param landmark_align_type Algin the peak and valleys using one of the "negPeak", "negPeak_valley", "negPeak_valley_posPeak", and "valley" alignment modes. 7 | #' @param midpoint_type Fill in the missing first valley by the midpoint of two positive peaks ("midpoint") or impute by other valleys ("valley"). 8 | #' @param neg_candidate_thres The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95% quantile of IgG samples is large. 9 | #' @export 10 | #' @examples 11 | #' \dontrun{ 12 | #' landmark_fill_na( 13 | #' peak_landmark_list = peak_landmark_list, 14 | #' valley_landmark_list = valley_landmark_list, 15 | #' landmark_align_type = "negPeak_valley_posPeak" 16 | #' ) 17 | #' } 18 | landmark_fill_na <- function(peak_landmark_list = NULL, valley_landmark_list = NULL, landmark_align_type = NULL, midpoint_type = "valley", neg_candidate_thres = asinh(10/5 + 1)){ 19 | if(!landmark_align_type %in% c("negPeak", "negPeak_valley", "negPeak_valley_posPeak", "valley")){ 20 | return("Please provide one of the landmark_align_type from: negPeak, negPeak_valley, negPeak_valley_posPeak, valley") 21 | } 22 | if(landmark_align_type == "valley"){ 23 | ## only use the first valley to align 24 | landmark_matrix <- valley_landmark_list[, 1] %>% t %>% t 25 | landmark_matrix[is.na(landmark_matrix), 1] <- neg_candidate_thres #2 ## fill in na by background level 2 after arcinsh_b5_a1 transformation 26 | }else{ 27 | ## involve negative peaks in landmark alignment 28 | if(ncol(peak_landmark_list) == 1){ 29 | ## only have negative peaks 30 | landmark_matrix <- cbind( 31 | peak_landmark_list[, 1], 32 | valley_landmark_list[, 1] 33 | ) 34 | ## fill in na 35 | landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- stats::median(landmark_matrix[!is.na(landmark_matrix[, 1]), 1]) 36 | alter_pos = stats::median(landmark_matrix[!is.na(landmark_matrix[, 2]), 2]) 37 | for(tmpIndex in which(is.na(landmark_matrix[, 2]))){ 38 | if(neg_candidate_thres > landmark_matrix[tmpIndex, 1]){ 39 | landmark_matrix[tmpIndex, 2] <- neg_candidate_thres 40 | }else{ 41 | landmark_matrix[tmpIndex, 2] <- alter_pos 42 | } 43 | } 44 | # landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- neg_candidate_thres 45 | }else{ 46 | ## have positive peaks 47 | if(ncol(peak_landmark_list) > 2 && midpoint_type == "midpoint"){ 48 | landmark_matrix <- cbind( 49 | peak_landmark_list[, 1], 50 | rowMeans(valley_landmark_list, na.rm = TRUE), 51 | peak_landmark_list[, ncol(peak_landmark_list)] 52 | ) 53 | }else{ 54 | landmark_matrix <- cbind( 55 | peak_landmark_list[, 1], 56 | valley_landmark_list[, 1], 57 | peak_landmark_list[, ncol(peak_landmark_list)] 58 | ) 59 | } 60 | 61 | ## fill in na 62 | ## fill in valley first 63 | landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- neg_candidate_thres 64 | ## due to user_define_peak where unique peak is deemed positive. 65 | ## fill in either 0.5 or half of the first valley 66 | landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- landmark_matrix[is.na(landmark_matrix[, 1]), 2]/2 #0.5 67 | ## fill in the last positive peak: add on the valley using the median distance from the last positive peak to the first valley 68 | landmark_matrix[is.na(landmark_matrix[, 3]), 3] <- landmark_matrix[is.na(landmark_matrix[, 3]), 2] + stats::median(landmark_matrix[!is.na(landmark_matrix[, 3]), 3] - landmark_matrix[!is.na(landmark_matrix[, 3]), 2]) 69 | 70 | } 71 | } 72 | 73 | ## only provide negative peak location 74 | if (landmark_align_type == "negPeak") { 75 | return(landmark_matrix[, 1] %>% t %>% t) 76 | } 77 | 78 | ## only provide negative peak and valley location 79 | if (landmark_align_type == "negPeak_valley") { 80 | return(landmark_matrix[, 1:2]) 81 | } 82 | 83 | ## provide negative peak, first valley and last postiive peak location 84 | return(landmark_matrix) 85 | } 86 | -------------------------------------------------------------------------------- /R/load_landmarks.R: -------------------------------------------------------------------------------- 1 | #' Load saved landmarks into a list of matrices, as required for input into "override_landmark". 2 | #' 3 | #' @param dir Directory where landmark location .rds files are stored 4 | #' @param append_rds Whether to append "/RDS" to the provided directory path. The default is TRUE, which aligns with ADTnorm's default save location. 5 | #' @export 6 | #' @examples 7 | #' \dontrun{ 8 | #' override_landmark = load_landmarks(dir = save_outpath) 9 | #' } 10 | 11 | load_landmarks = function(dir, append_rds = TRUE){ 12 | if(append_rds) 13 | dir <- paste(dir,'RDS',sep='/') 14 | 15 | override_landmark <- list() 16 | for(file in list.files(dir,pattern='^peak_valley_locations_.*.rds',full.names=TRUE)){ 17 | file_items <- utils::tail(unlist(strsplit(file,'/')),1) 18 | file_items <- unlist(strsplit(file_items,'_')) 19 | marker <- file_items[4] 20 | override_landmark[[marker]] <- readRDS(file) 21 | } 22 | return(override_landmark) 23 | } -------------------------------------------------------------------------------- /R/peak_alignment.R: -------------------------------------------------------------------------------- 1 | #' Align the peak and valley landmarks by the warpset function 2 | #' 3 | #' This function monotonously transforms the ADT marker counts to align the landmarks detected in previous steps. By aligning the landmarks, ADTnorm removes the batch effect and allows integration across batches/studies. 4 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. 5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information. 6 | #' @param landmark_matrix Matrix of peak and valley landmarks after filling in NA using the `landmark_fill_na` function. 7 | #' @param target_landmark Leave it as NULL to align the landmark to the mean location across samples. Denote it by a vector of the same length as the column number of the landmark to align the negative peak, valley, and positive peak(s) to the specified fixed location. 8 | #' @export 9 | #' @examples 10 | #' \dontrun{ 11 | #' peak_alignment(cell_x_adt, cell_x_feature, landmark_matrix) 12 | #' } 13 | # require(dplyr) 14 | # require(flowStats) 15 | # require(fda) 16 | peak_alignment = function(cell_x_adt, cell_x_feature = NULL, landmark_matrix = NULL, target_landmark = NULL) { 17 | ## get parameters 18 | grouping = NULL 19 | monwrd = TRUE 20 | subsample = NULL 21 | peakNr = NULL 22 | clipRange = 0.01 23 | nbreaks = 11 #11 24 | bwFac = 2 25 | warpFuns = FALSE 26 | chunksinze = 10 27 | newNcFile = NULL 28 | z = NULL 29 | nb = 1001 30 | 31 | exp_data = cell_x_adt 32 | cell_x_adt_norm = cell_x_adt 33 | samples = levels(cell_x_feature$sample) ## sampleNames(exp_data) 34 | 35 | ## set up fda parameters 36 | extend = 0.15 37 | from = min(c(min(cell_x_adt, na.rm = TRUE), target_landmark[1], min(landmark_matrix))) - diff(range(cell_x_adt, na.rm = TRUE)) * extend 38 | to = max(c(max(cell_x_adt, na.rm = TRUE), target_landmark[length(target_landmark)], max(landmark_matrix))) + diff(range(cell_x_adt, na.rm = TRUE)) * extend 39 | 40 | lower_bound = min(cell_x_adt, na.rm = TRUE) #- diff(range(cell_x_adt, na.rm = TRUE)) * extend 41 | upper_bound = max(cell_x_adt, na.rm = TRUE) #+ diff(range(cell_x_adt, na.rm = TRUE)) * extend 42 | 43 | wbasis = fda::create.bspline.basis( 44 | rangeval = c(from, to), 45 | norder = 4, breaks = seq(from, to, len = nbreaks) 46 | ) 47 | Wfd0 = fda::fd(matrix(0, wbasis$nbasis, 1), wbasis) 48 | WfdPar = fda::fdPar(Wfd0, 1, 1e-5) 49 | 50 | 51 | density_y = c() 52 | for(sample in samples){ 53 | cell_ind_tmp = which(cell_x_feature$sample == sample) 54 | cell_ind = cell_ind_tmp[which(!is.na(cell_x_adt[cell_ind_tmp]))] 55 | if(length(cell_ind) > 0){ 56 | density_y = cbind(density_y, stats::density(cell_x_adt[cell_ind], from = from, to = to, n = nb, na.rm = TRUE)$y) 57 | }else{ 58 | density_y = cbind(density_y, rep(NA, nb)) 59 | } 60 | 61 | } 62 | colnames(density_y) = samples 63 | 64 | arg_vals = seq(from, to, len = nb) 65 | fdobj = fda::Data2fd(arg_vals, density_y, wbasis) 66 | 67 | if (ncol(landmark_matrix) == 1) { ## only one peak no valley: offset 68 | offsets = landmark_matrix - stats::median(landmark_matrix, na.rm = TRUE) 69 | names(offsets) = samples 70 | funs = funsBack = vector("list", nrow(landmark_matrix)) 71 | names(funs) = samples 72 | names(funsBack) = samples 73 | for (j in seq_along(funs)) { 74 | funs[[samples[j]]] = function(x) x - z 75 | e1 = new.env(hash = TRUE) 76 | e1$z = offsets[samples[j]] 77 | environment(funs[[samples[j]]]) = e1 78 | funsBack[[samples[j]]] = function(x) x + z 79 | e2 = new.env(hash = TRUE) 80 | e2$z = offsets[samples[j]] 81 | environment(funsBack[[samples[j]]]) = e2 82 | } 83 | } else { ## more than one landmark: warping 84 | ## if any valley is beyond the upper bound of range, replace by the upper bound 85 | if(any(landmark_matrix[, 2] > upper_bound)){ 86 | landmark_matrix[which(landmark_matrix[, 2] > upper_bound), 2] = max(cell_x_adt, na.rm = TRUE) 87 | print(paste0("Warning: some valley landmarks are larger the upper bound of the range. They are replaced by the maximum value of cell_x_adt. Please consider reduce 'neg_candidate_thres' value.")) 88 | } 89 | if(any(landmark_matrix[, 1] < lower_bound)){ 90 | landmark_matrix[which(landmark_matrix[, 1] < lower_bound), 1] = min(cell_x_adt, na.rm = TRUE) 91 | print("Warning: some valley landmarks are smaller than the lower bound of the range. They are replaced by the minimum value of cell_x_adt.") 92 | } 93 | args = list("unregfd" = fdobj, "fdobj"=fdobj, "ximarks"=landmark_matrix, "WfdPar"=WfdPar, "monwrd"=monwrd) 94 | if(!is.null(target_landmark)){ 95 | args[['x0marks']] = target_landmark 96 | }else{ 97 | args[['x0marks']] = colMeans(landmark_matrix, na.rm = TRUE) 98 | } 99 | args_run = args[intersect(names(formals(fda::landmarkreg)), names(args))] 100 | regDens = do.call(fda::landmarkreg, args_run, quote = TRUE) 101 | 102 | # if(is.null(target_landmark)){ 103 | # regDens = fda::landmarkreg(fdobj, landmark_matrix, WfdPar = WfdPar, monwrd = monwrd) 104 | # }else{ 105 | # regDens = fda::landmarkreg(fdobj, landmark_matrix, x0marks = target_landmark, WfdPar = WfdPar, monwrd = monwrd) 106 | # } 107 | # if(is.null(target_landmark)){ 108 | # regDens = fda::landmarkreg(fdobj, landmark_matrix, WfdPar = WfdPar) 109 | # }else{ 110 | # regDens = fda::landmarkreg(fdobj, landmark_matrix, x0marks = target_landmark, WfdPar = WfdPar) 111 | # } 112 | warpfdobj = regDens$warpfd 113 | warpedX = fda::eval.fd(warpfdobj, arg_vals) 114 | warpedX[1, ] = utils::head(arg_vals, 1) 115 | warpedX[nrow(warpedX), ] = utils::tail(arg_vals, 1) 116 | funs = apply(warpedX, 2, stats::approxfun, arg_vals) 117 | funsBack = apply(warpedX, 2, function(a, b) stats::approxfun(b, a), arg_vals) 118 | } 119 | 120 | 121 | names(funs) = names(funsBack) = samples 122 | 123 | warped_landmark_matrix = landmark_matrix 124 | leftBoard = rightBoard = vector("list", length(funs)) 125 | newRange = c(Inf, -Inf) 126 | 127 | ## transform the raw data using the warping functions 128 | for (i in seq_along(funs)) { 129 | # cell_index = which(cell_x_feature$sample == samples[i]) 130 | cell_ind_tmp = which(cell_x_feature$sample == samples[i]) 131 | cell_index = cell_ind_tmp[which(!is.na(cell_x_adt[cell_ind_tmp]))] 132 | if(length(cell_index) > 0){ 133 | thisDat = t(t(cell_x_adt[cell_index])) 134 | newDat = as.matrix(funs[[i]](thisDat)) 135 | newDat[is.na(newDat)] = thisDat[is.na(newDat)] 136 | cell_x_adt_norm[cell_index] = newDat 137 | warped_landmark_matrix[i, ] = funs[[i]](landmark_matrix[i, ]) 138 | }else{ 139 | warped_landmark_matrix[i, ] = NA 140 | } 141 | 142 | } 143 | return(list(cell_x_adt_norm = cell_x_adt_norm, landmark_matrix_norm = warped_landmark_matrix)) 144 | } 145 | -------------------------------------------------------------------------------- /R/plot_adt_density_each.R: -------------------------------------------------------------------------------- 1 | #' Plot the expression density profile for ONE ADT marker 2 | #' 3 | #' This function plots the ADT expression density profile for only one ADT marker. Each track is a sample. Color by batch 4 | #' @param adt_count Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format. 5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information. 6 | #' @param brewer_palettes Set the color scheme of the color brewer. 7 | #' @param parameter_list Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot. 8 | #' @export 9 | #' @examples 10 | #' \dontrun{ 11 | #' plot_adt_density_each( 12 | #' cell_x_adt, 13 | #' cell_x_feature, 14 | #' brewer_palettes = "Set1", 15 | #' parameter_list = list(bw = 0.1, run_label = "ADTnorm") 16 | #' ) 17 | #' } 18 | # require(ggplot2) 19 | # require(RColorBrewer) 20 | # require(tidyr) 21 | # require(ggridges) 22 | # require(ggpubr) 23 | plot_adt_density_each = function(adt_count, cell_x_feature, brewer_palettes, parameter_list = NULL) { 24 | if (is.null(parameter_list)) { 25 | return("parameter_list is NULL!") 26 | } 27 | parameter_list_name = names(parameter_list) 28 | 29 | run_label = "" 30 | bw = 1 31 | if (!is.null(parameter_list)) { 32 | if ("run_label" %in% parameter_list_name) { 33 | run_label = parameter_list[["run_label"]] 34 | } 35 | if("bw" %in% parameter_list_name){ 36 | bw = parameter_list$bw 37 | } 38 | } 39 | 40 | # peak_landmark_list = parameter_list$peak_landmark_list 41 | # valley_landmark_list = parameter_list$valley_landmark_list 42 | # brewer_palettes = parameter_list$brewer_palettes 43 | 44 | # If there is no batch, add a dummy variable 45 | if (! "batch" %in% colnames(cell_x_feature)){ cell_x_feature$batch <- 1 } 46 | 47 | tmpProfile = data.frame(counts = adt_count) %>% 48 | mutate( 49 | sample = rep(cell_x_feature$sample, 1), 50 | batch = rep(cell_x_feature$batch, 1) 51 | ) 52 | 53 | # peak_location = list() 54 | # valley_location = list() 55 | # for (i in 1:ncol(peak_landmark_list)) { 56 | # peak_location[[i]] = data.frame( 57 | # # ADT = adt_marker_select, 58 | # sample = cell_x_feature$sample %>% levels(), 59 | # peakx = peak_landmark_list[, i], 60 | # peaky = 0.5, 61 | # peaks = 1:length(levels(cell_x_feature$sample)) 62 | # ) 63 | # if (i <= ncol(valley_landmark_list)) { 64 | # valley_location[[i]] = data.frame( 65 | # # ADT = adt_marker_select, 66 | # sample = cell_x_feature$sample %>% levels(), 67 | # peakx = valley_landmark_list[, i], 68 | # peaky = 0.5, 69 | # peaks = 1:length(levels(cell_x_feature$sample)) 70 | # ) 71 | # } 72 | # } 73 | fillColor = grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(tmpProfile$batch))) 74 | resPlot = ggplot(tmpProfile, aes(x = counts, y = sample)) + 75 | ggridges::geom_density_ridges(aes(fill = factor(batch)), bandwidth = bw) + 76 | # geom_segment(data = peak_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) + 77 | # geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") + 78 | # facet_wrap(~ factor(ADT), scales = "free_x") + 79 | theme_bw(base_size = 20) + 80 | xlab(run_label) + 81 | ylab("") + 82 | ggpubr::rotate_x_text(angle = 90) + 83 | ggpubr::rremove("legend") + 84 | scale_fill_manual(values = fillColor) + 85 | ggpubr::rremove("legend.title") 86 | 87 | 88 | return(resPlot) 89 | } 90 | -------------------------------------------------------------------------------- /R/plot_adt_density_with_peak_valley.R: -------------------------------------------------------------------------------- 1 | #' Plot ADT marker expression density profile with identified peak and valley locations 2 | #' 3 | #' This function plots the ADT expression density profile with identified peak and valley locations. Each panel is an ADT marker, and each track is a sample. Color by batch 4 | #' @param cell_x_adt Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. 5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information. 6 | #' @param adt_marker_select The target ADT marker(s) that the denstiy plot is about. Leave it NULL will generate figures for all the ADT markers available in cell_x_adt. 7 | #' @param peak_landmark_list Matrix of peak landmark locations with rows being samples and columns being the peaks. 8 | #' @param valley_landmark_list Matrix of valley landmark locations with rows being samples and columns being the valleys. 9 | #' @param brewer_palettes Set the color scheme of the color brewer. 10 | #' @param parameter_list Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot. 11 | #' @export 12 | #' @examples 13 | #' \dontrun{ 14 | #' plot_adt_density_with_peak_valley( 15 | #' cell_x_adt, 16 | #' cell_x_feature, 17 | #' adt_marker_select = c("CD3", "CD4", "CD8", "CD19"), 18 | #' peak_landmark_list = peak_mode_norm_res, 19 | #' valley_landmark_list = valley_location_norm_res, 20 | #' brewer_palettes = "Set1", 21 | #' parameter_list = list(bw = 0.1, run_label = "ADTnorm") 22 | #' ) 23 | #' } 24 | # require(ggplot2) 25 | # require(RColorBrewer) 26 | # require(tidyr) 27 | # require(ggridges) 28 | # require(ggpubr) 29 | plot_adt_density_with_peak_valley = function(cell_x_adt, cell_x_feature, adt_marker_select = NULL, peak_landmark_list, valley_landmark_list = NULL, brewer_palettes = "Set1", parameter_list = NULL) { 30 | if (is.null(parameter_list)) { 31 | return("parameter_list is NULL!") 32 | } 33 | parameter_list_name = names(parameter_list) 34 | 35 | run_label = "" 36 | bw = 1 37 | if (!is.null(parameter_list)) { 38 | if ("run_label" %in% parameter_list_name) { 39 | run_label = parameter_list[["run_label"]] 40 | } 41 | if("bw" %in% parameter_list_name){ 42 | bw = parameter_list$bw 43 | } 44 | } 45 | 46 | # peak_landmark_list = parameter_list$peak_landmark_list 47 | # valley_landmark_list = parameter_list$valley_landmark_list 48 | # brewer_palettes = parameter_list$brewer_palettes 49 | if(is.null(adt_marker_select)){ 50 | adt_marker_select = colnames(cell_x_adt) 51 | } 52 | 53 | # If there is no batch, add a dummy variable 54 | if (! "batch" %in% colnames(cell_x_feature)){ cell_x_feature$batch <- 1 } 55 | 56 | tmpProfile = cell_x_adt %>% data.frame %>% 57 | dplyr::select(all_of(adt_marker_select)) %>% 58 | data.frame() %>% 59 | tidyr::gather(key = "ADT", value = "counts") %>% 60 | mutate( 61 | sample = rep(cell_x_feature$sample, length(adt_marker_select)), 62 | batch = rep(cell_x_feature$batch, length(adt_marker_select)) 63 | ) 64 | 65 | peak_location = list() 66 | valley_location = list() 67 | for (i in 1:ncol(peak_landmark_list)) { 68 | peak_location[[i]] = data.frame( 69 | ADT = adt_marker_select, 70 | sample = cell_x_feature$sample %>% levels(), 71 | peakx = peak_landmark_list[, i], 72 | peaky = 0.5, 73 | peaks = 1:length(levels(cell_x_feature$sample)) 74 | ) 75 | if ((!is.null(valley_landmark_list)) && i <= ncol(valley_landmark_list)) { 76 | valley_location[[i]] = data.frame( 77 | ADT = adt_marker_select, 78 | sample = cell_x_feature$sample %>% levels(), 79 | peakx = valley_landmark_list[, i], 80 | peaky = 0.5, 81 | peaks = 1:length(levels(cell_x_feature$sample)) 82 | ) 83 | } 84 | } 85 | fillColor = grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(tmpProfile$batch))) 86 | 87 | resPlot = ggplot(tmpProfile, aes(x = counts, y = sample)) + 88 | ggridges::geom_density_ridges(aes(fill = factor(batch)), bandwidth = bw) + 89 | geom_segment(data = peak_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) + 90 | # geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") + 91 | facet_wrap(~ factor(ADT), scales = "free_x") + 92 | theme_bw(base_size = 20) + 93 | xlab(run_label) + 94 | ylab("") + 95 | ggpubr::rotate_x_text(angle = 90) + 96 | ggpubr::rremove("legend") + 97 | scale_fill_manual(values = fillColor) + 98 | ggpubr::rremove("legend.title") 99 | 100 | if(is.null(valley_landmark_list)){ 101 | resPlot = resPlot + geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") 102 | } 103 | 104 | if (ncol(peak_landmark_list) >= 2) { 105 | resPlot = resPlot + 106 | geom_segment(data = peak_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) 107 | } 108 | 109 | if (ncol(peak_landmark_list) >= 3) { 110 | resPlot = resPlot + 111 | geom_segment(data = peak_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) + 112 | geom_segment(data = valley_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") 113 | } 114 | if (ncol(peak_landmark_list) >= 4) { 115 | resPlot = resPlot + 116 | geom_segment(data = peak_location[[4]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) + 117 | geom_segment(data = valley_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") 118 | } 119 | 120 | return(resPlot) 121 | } 122 | -------------------------------------------------------------------------------- /R/plot_adt_density_with_peak_valley_each.R: -------------------------------------------------------------------------------- 1 | #' Plot the expression density profile for ONE ADT marker with identified peak and valley locations 2 | #' 3 | #' This function plots the ADT expression density profile with identified peak and valley locations for only one ADT marker. Each track is a sample. Color by batch 4 | #' @param adt_count Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format. 5 | #' @param cell_x_feature Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information. 6 | #' @param peak_landmark_list Matrix of peak landmark locations with rows being samples and columns being the peaks. 7 | #' @param valley_landmark_list Matrix of valley landmark locations with rows being samples and columns being the valleys. 8 | #' @param brewer_palettes Set the color scheme of the color brewer. 9 | #' @param parameter_list Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot. 10 | #' @export 11 | #' @examples 12 | #' \dontrun{ 13 | #' plot_adt_density_with_peak_valley_each( 14 | #' cell_x_adt, 15 | #' cell_x_feature, 16 | #' peak_landmark_list = peak_mode_norm_res, 17 | #' valley_landmark_list = valley_location_norm_res, 18 | #' brewer_palettes = "Set1", 19 | #' parameter_list = list(bw = 0.1, run_label = "ADTnorm") 20 | #' ) 21 | #' } 22 | # require(ggplot2) 23 | # require(RColorBrewer) 24 | # require(tidyr) 25 | # require(ggridges) 26 | # require(ggpubr) 27 | plot_adt_density_with_peak_valley_each = function(adt_count, cell_x_feature, peak_landmark_list, valley_landmark_list = NULL, brewer_palettes = "Set1", parameter_list = NULL) { 28 | if (is.null(parameter_list)) { 29 | return("parameter_list is NULL!") 30 | } 31 | parameter_list_name = names(parameter_list) 32 | 33 | run_label = "" 34 | bw = 1 35 | if (!is.null(parameter_list)) { 36 | if ("run_label" %in% parameter_list_name) { 37 | run_label = parameter_list[["run_label"]] 38 | } 39 | if("bw" %in% parameter_list_name){ 40 | bw = parameter_list$bw 41 | } 42 | } 43 | 44 | # peak_landmark_list = parameter_list$peak_landmark_list 45 | # valley_landmark_list = parameter_list$valley_landmark_list 46 | # brewer_palettes = parameter_list$brewer_palettes 47 | 48 | # If there is no batch, add a dummy variable 49 | if (! "batch" %in% colnames(cell_x_feature)){ cell_x_feature$batch <- 1 } 50 | 51 | tmpProfile = data.frame(counts = adt_count) %>% 52 | mutate( 53 | sample = rep(cell_x_feature$sample, 1), 54 | batch = rep(cell_x_feature$batch, 1) 55 | ) #%>% dplyr::filter(!is.na(counts)) 56 | 57 | peak_location = list() 58 | valley_location = list() 59 | for (i in 1:ncol(peak_landmark_list)) { 60 | peak_location[[i]] = data.frame( 61 | # ADT = adt_marker_select, 62 | sample = cell_x_feature$sample %>% levels(), 63 | peakx = peak_landmark_list[, i], 64 | peaky = 0.5, 65 | peaks = 1:length(levels(cell_x_feature$sample)) 66 | ) 67 | if ((!is.null(ncol(valley_landmark_list))) && i <= ncol(valley_landmark_list)) { 68 | valley_location[[i]] = data.frame( 69 | # ADT = adt_marker_select, 70 | sample = cell_x_feature$sample %>% levels(), 71 | peakx = valley_landmark_list[, i], 72 | peaky = 0.5, 73 | peaks = 1:length(levels(cell_x_feature$sample)) 74 | ) 75 | } 76 | } 77 | fillColor = grDevices::colorRampPalette(RColorBrewer::brewer.pal(8, brewer_palettes))(length(unique(tmpProfile$batch))) 78 | resPlot = ggplot(tmpProfile, aes(x = counts, y = sample)) + 79 | ggridges::geom_density_ridges(aes(fill = factor(batch)), bandwidth = bw) + 80 | geom_segment(data = peak_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) + 81 | # geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") + 82 | # facet_wrap(~ factor(ADT), scales = "free_x") + 83 | theme_bw(base_size = 20) + 84 | xlab(run_label) + 85 | ylab("") + 86 | ggpubr::rotate_x_text(angle = 90) + 87 | ggpubr::rremove("legend") + 88 | scale_fill_manual(values = fillColor) + 89 | ggpubr::rremove("legend.title") 90 | 91 | if(!is.null(valley_landmark_list)){ 92 | resPlot = resPlot + 93 | geom_segment(data = valley_location[[1]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") 94 | } 95 | 96 | if (ncol(peak_landmark_list) >= 2) { 97 | resPlot = resPlot + 98 | geom_segment(data = peak_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) 99 | } 100 | 101 | if (ncol(peak_landmark_list) >= 3) { 102 | resPlot = resPlot + 103 | geom_segment(data = peak_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) + 104 | geom_segment(data = valley_location[[2]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") 105 | } 106 | if (ncol(peak_landmark_list) >= 4) { 107 | resPlot = resPlot + 108 | geom_segment(data = peak_location[[4]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1) + 109 | geom_segment(data = valley_location[[3]], aes(x = peakx, xend = peakx, y = peaks, yend = peaky + peaks), linewidth = 1, color = "grey") 110 | } 111 | 112 | return(resPlot) 113 | } 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ADTnorm 2 | 3 | 4 | [![R-CMD-check](https://github.com/yezhengSTAT/ADTnorm/workflows/R-CMD-check/badge.svg)](https://github.com/yezhengSTAT/ADTnorm/actions) 5 | [![docker](https://github.com/yezhengSTAT/ADTnorm/workflows/docker/badge.svg)](https://github.com/yezhengSTAT/ADTnorm/pkgs/container/adtnorm) 6 | 7 | 8 | ## What is `ADTnorm`? 9 | 10 | CITE-seq enables paired measurement of surface protein and mRNA expression in single cells using antibodies conjugated to oligonucleotide tags. Due to the high copy number of surface protein molecules, sequencing antibody-derived tags (ADTs) allows for robust protein detection, improving cell-type identification. However, variability in antibody staining leads to batch effects in the ADT expression, obscuring biological variation, reducing interpretability, and obstructing cross-study analyses. Here, we present ADTnorm, a normalization and integration method designed explicitly for ADT abundance. Benchmarking against 14 existing scaling and normalization methods, we show that ADTnorm accurately aligns populations with negative- and positive-expression of surface protein markers across 13 public datasets, effectively removing technical variation across batches and improving cell-type separation. ADTnorm enables efficient integration of public CITE-seq datasets, each with unique experimental designs, paving the way for atlas-level analyses. Beyond normalization, ADTnorm aids in automated threshold-gating as well as assessment of antibody staining quality for titration optimization and antibody panel selection. 11 | 12 | This repository is the ADTnorm R package. We also provide a [Python wrapper](https://github.com/donnafarberlab/ADTnormPy) by [Daniel P. Caron](https://github.com/Daniel-Caron). 13 | 14 | Manuscript: [Zheng et al. ADTnorm: Robust Integration of Single-cell Protein Measurement across CITE-seq Datasets. BioRxiv. 2024](https://www.biorxiv.org/content/10.1101/2022.04.29.489989v2) 15 | 16 | 17 | ## ADT Normalization Pipeline 18 | 19 | ADTnorm 20 | 21 | 22 | ## Installation 23 | 24 | ``` R 25 | # install.packages("remotes") 26 | remotes::install_github("yezhengSTAT/ADTnorm", build_vignettes = FALSE) 27 | ``` 28 | 29 | ### Using Docker 30 | 31 | There are many dependencies in `ADTnorm`, so it takes a long time to install them all. Instead, you can use the Docker image of `ADTnorm`. 32 | 33 | ``` sh 34 | docker pull ghcr.io/yezhengstat/adtnorm:latest 35 | docker run \ 36 | -it \ 37 | --user rstudio \ 38 | --volume :/home/rstudio/data \ 39 | yezhengstat/adtnorm:latest \ 40 | R 41 | ``` 42 | 43 | Replace `` with the local directory path (absolute path) where you have the input data and would like to store the output files. For more information on using docker containers, please read [this documentation](https://github.com/Bioconductor/bioconductor_docker/blob/master/README.md#using-the-containers) by Bioconductor. 44 | 45 | 46 | ## Input Data 47 | 48 | The 13 public datasets used in the [manuscript](https://www.biorxiv.org/content/10.1101/2022.04.29.489989v2) are also included in the R package as a demo data set. They can be loaded by 49 | 50 | ```{r loaddata, eval = FALSE} 51 | data(cell_x_adt) 52 | data(cell_x_feature) 53 | ``` 54 | 55 | - ```cell_x_adt``` contains raw counts for ADT markers in each cell. It is a data frame with 422682 cells (rows) and 9 ADT markers (columns): CD3, CD4, CD8, CD14, CD19, CD25, CD45RA, CD56, CD127. 56 | 57 | ``` 58 | CD3 CD4 CD8 CD14 CD19 CD25 CD45RA CD56 CD127 59 | 1 18 138 13 491 3 9 110 17 7 60 | 2 30 119 19 472 3 5 125 248 8 61 | 3 18 207 10 1289 8 15 5268 26 12 62 | 4 18 11 17 20 5 15 4743 491 16 63 | 5 5 14 14 19 4 16 4108 458 17 64 | 6 21 1014 29 2428 7 52 227 29 15 65 | ``` 66 | 67 | - ```cell_x_feature``` is a data frame with 422682 cells (rows) and 7 feature variables (columns): 68 | 69 | - sample: Sample name used in original data of each study. 70 | 71 | - batch: Batch information provided from each study. 72 | 73 | - sample_status: Sample status, i.e., Healthy, MALTtumor, HIV Vaccine, Lupus, B-ALL, AML. 74 | 75 | - study_name: Name of the data set/study. 76 | 77 | - ADTseqDepth: Total UMI per cell. 78 | 79 | - cell_type_l1: Broad level of cell type annotation using manual gating. 80 | 81 | - cell_type_l2: Fine level of cell type annotation using manual gating. 82 | 83 | 84 | ``` 85 | sample batch sample_status study_name 86 | 1 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1 healthy 10X_pbmc_10k 87 | 2 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1 healthy 10X_pbmc_10k 88 | 3 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1 healthy 10X_pbmc_10k 89 | 4 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1 healthy 10X_pbmc_10k 90 | 5 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1 healthy 10X_pbmc_10k 91 | 6 10X_pbmc_10k_sample1 10X_pbmc_10k_batch1 healthy 10X_pbmc_10k 92 | ADTseqDepth cell_type_l1 cell_type_l2 93 | 1 981 monocytes classical monocyte 94 | 2 1475 monocytes classical monocyte 95 | 3 7149 monocytes classical monocyte 96 | 4 6831 NK CD16+ NK 97 | 5 6839 NK CD16+ NK 98 | 6 4720 monocytes classical monocyte 99 | ``` 100 | 101 | ## Usage 102 | 103 | 104 | **For more detailed and typical parameter tuning examples, please visit [tutorial website](https://yezhengstat.github.io/ADTnorm/articles/ADTnorm-tutorial.html). We will illustrate using the demo data.** 105 | 106 | 107 | ### Case 1. Consider one study as a sample and normalize across studies. 108 | ```R 109 | library(ADTnorm) 110 | save_outpath <- "/path/to/output/location" 111 | run_name <- "ADTnorm_demoRun" 112 | data(cell_x_adt) 113 | data(cell_x_feature) 114 | 115 | cell_x_feature$sample = factor(cell_x_feature$study_name) ## consider each study as one sample 116 | cell_x_feature$batch = factor(cell_x_feature$study_name) ## consider each study as a batch 117 | 118 | cell_x_adt_norm <- ADTnorm( 119 | cell_x_adt = cell_x_adt, 120 | cell_x_feature = cell_x_feature, 121 | save_outpath = save_outpath, 122 | study_name = run_name, 123 | marker_to_process = c("CD3", "CD4", "CD8", "CD45RA"), 124 | trimodal_marker = c("CD4", "CD45RA"), 125 | positive_peak = list(ADT = "CD3", sample = "buus_2021_T"), 126 | save_fig = TRUE 127 | ) 128 | ``` 129 | 130 | ### Case 2. Consider each healthy donor/patient per time point/condition/response/etc as one sample and normalize across the individual sample. 131 | ``` R 132 | library(ADTnorm) 133 | save_outpath <- "/path/to/output/location" 134 | run_name <- "ADTnorm_demoRun" 135 | data(cell_x_adt) 136 | data(cell_x_feature) 137 | 138 | cell_x_feature$batch = factor(cell_x_feature$study_name) ## consider each study as a batch 139 | 140 | cell_x_adt_norm <- ADTnorm( 141 | cell_x_adt = cell_x_adt, 142 | cell_x_feature = cell_x_feature, 143 | save_outpath = save_outpath, 144 | study_name = run_name, 145 | marker_to_process = c("CD3", "CD4", "CD8", "CD45RA"), 146 | trimodal_marker = c("CD4", "CD45RA"), 147 | positive_peak = list(ADT = "CD3", sample = "buus_2021_T"), 148 | save_fig = TRUE 149 | ) 150 | ``` 151 | 152 | 153 | Basic parameters introduction. The full parameter explanation for the ```ADTnorm``` function can be found at [Reference - ADTnorm](https://yezhengstat.github.io/ADTnorm/reference/ADTnorm.html). 154 | 155 | ``` 156 | cell_x_adt: Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. 157 | 158 | cell_x_feature: Matrix of cells (rows) by cell features (columns) such as sample, batch, and cell type-related information. Please note "sample" column is mandatory and should be the smallest unit to group the cells. At this resolution, ADTnorm will identify peaks and valleys to implement normalization. Please ensure the samples have different names across batches/conditions/studies. "batch" column is optional. It can be batches/conditions/studies/etc, that group the samples based on whether the samples are collected from the same batch run or experiment. This column is needed if the ```multi_sample_per_batch``` parameter is turned on to remove outlier positive peaks per batch or ```detect_outlier_valley``` for detecting and imputing outlier valleys per batch. If the "batch" column is not provided, it will be set as the same as the "sample" column. In the intermediate density plots that ADTnorm provides, density plots will be colored by the "batch" column. 159 | 160 | save_outpath: The path to save the results. 161 | 162 | study_name: Name of this run. 163 | 164 | marker_to_process: Markers to normalize. Leave empty to process all the ADT markers in the cell_x_adt matrix. 165 | 166 | bimodal_marker: Specify ADT markers that are likely to have two peaks based on researchers' prior knowledge or preliminary observation of the particular data to be processed. Leaving it as default, ADTnorm will try to find the bimodal peak in all markers that are not listed in `trimodal_marker.` 167 | 168 | trimodal_marker: Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation of the particular data to be processed. 169 | 170 | positive_peak: A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells, the only CD3 peak should be aligned to the positive peaks of other samples. 171 | 172 | save_fig: Save the density plot figure for checking the peak and valley location detection. 173 | ``` 174 | 175 | **For more detailed and typical parameter tuning examples, please visit [tutorial website](https://yezhengstat.github.io/ADTnorm/articles/ADTnorm-tutorial.html). We will illustrate using the demo data.** 176 | 177 | 178 | ## Results 179 | 180 | ```ADTnorm``` function will generate a matrix of rows of the same number as input ```cell_x_adt``` row number and columns are ADT markers specified in ```marker_to_process```. The value in the matrix is normalized value by ADTnorm. In the `save_outpath` specified by the users, there will be two subfolders, `figures` and `RDS`, containing the intermediate object and density plot of detected peak and valley landmarks before and after ADTnorm. Those figures can be used to check whether certain ADT markers need further parameter tuning. 181 | 182 | ### Case 1. Consider one study as a sample and normalize across studies. 183 | 184 | #### Arcsinh Transformation on Raw Counts 185 | 186 | Arcsinh Transformation 187 | 188 | #### ADTnorm Counts 189 | 190 | ADTnorm Normalization 191 | 192 | ### Case 2. Consider each healthy donor/patient per time point/condition/response/etc as one sample and normalize across the individual sample. 193 | 194 | 195 | #### Arcsinh Transformation on Raw Counts 196 | 197 | Color-coded by studies as batches. 198 | 199 | Arcsinh Transformation 200 | 201 | #### ADTnorm Counts 202 | 203 | ADTnorm Normalization 204 | 205 | ## Manual Adjustment of Landmark Locations by R Shiny 206 | 207 | ```customize_landmark```: By setting it to TRUE, ADTnorm will trigger the interactive landmark tuning function and pop out a shiny application for the user's manual setting of peak and valley locations. The procedure for adjusting the landmarks (peaks and valleys) is below. 208 | 209 | ShinyR 210 | 211 | Please note: 212 | 213 | - We recommend using this function after initial rounds of ADTnorm normalization with a few parameter tuning attempts. It is better to narrow down a few ADT markers that need manual tuning and provide the list to ```marker_to_process``` as the interactive function will pop out for every marker being processed. 214 | 215 | - If zigzag discrete negative peaks are observed, users can first increase the "Bandwidth for Density Visualization" at the top of the right panel to smooth out the discrete negative peaks before setting the landmarks. 216 | 217 | - Currently, the shiny browser support setting any landmark (peaks or valleys) to NA as missing. However, it does not support inserting new landmark(s). For example, if the marker density distribution shows a triple peak pattern but ADTnorm only detects two peaks across all the samples. Shiny browser does not allow manual insertion of a new peak and valley, but the user can tune the other parameters to push ADTnorm to detect three peaks: specify the target marker as ```trimodal_marker```, reducing the ```bw_smallest_tri``` or setting smaller bandwidth value and specify for the target ADT marker through ```bw_smallest_adjustments```. 218 | 219 | **For more detailed and typical parameter tuning examples, please visit [tutorial website](https://yezhengstat.github.io/ADTnorm/articles/ADTnorm-tutorial.html). We will illustrate using the demo data.** 220 | 221 | ## Contact for questions, discussions, or potential collaborations 222 | 223 | [Ye Zheng](https://yezhengstat.github.io/) 224 | 225 | Email: yzheng23@fredhutch.org 226 | 227 | Twitter: @yezhengSTAT 228 | -------------------------------------------------------------------------------- /data/cell_x_adt.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/data/cell_x_adt.rda -------------------------------------------------------------------------------- /data/cell_x_feature.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/data/cell_x_feature.rda -------------------------------------------------------------------------------- /inst/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM bioconductor/bioconductor_docker:latest 2 | ADD . ~/ADTnorm/ 3 | WORKDIR ~/ 4 | RUN R -q -e 'install.packages("devtools")' 5 | RUN R -q -e 'devtools::install("./ADTnorm", dependencies = TRUE)' 6 | WORKDIR /home/rstudio 7 | -------------------------------------------------------------------------------- /man/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/.DS_Store -------------------------------------------------------------------------------- /man/ADTnorm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ADTnorm.R 3 | \name{ADTnorm} 4 | \alias{ADTnorm} 5 | \title{ADTnorm normalization to remove the technical variations across samples for each ADT marker.} 6 | \usage{ 7 | ADTnorm( 8 | cell_x_adt = NULL, 9 | cell_x_feature = NULL, 10 | save_outpath = NULL, 11 | study_name = "ADTnorm", 12 | marker_to_process = NULL, 13 | exclude_zeroes = FALSE, 14 | bimodal_marker = NULL, 15 | trimodal_marker = NULL, 16 | positive_peak = NULL, 17 | bw_smallest_bi = 1.1, 18 | bw_smallest_tri = 0.8, 19 | bw_smallest_adjustments = list(CD3 = 0.8, CD4 = 0.8, CD8 = 0.8), 20 | quantile_clip = 1, 21 | peak_type = "mode", 22 | multi_sample_per_batch = FALSE, 23 | shoulder_valley = TRUE, 24 | shoulder_valley_slope = -0.5, 25 | valley_density_adjust = 3, 26 | landmark_align_type = "negPeak_valley_posPeak", 27 | midpoint_type = "valley", 28 | neg_candidate_thres = NULL, 29 | lower_peak_thres = 0.001, 30 | brewer_palettes = "Set1", 31 | save_landmark = FALSE, 32 | save_fig = TRUE, 33 | detect_outlier_valley = FALSE, 34 | target_landmark_location = NULL, 35 | clean_adt_name = FALSE, 36 | customize_landmark = FALSE, 37 | override_landmark = NULL, 38 | verbose = FALSE 39 | ) 40 | } 41 | \arguments{ 42 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format. By default, ADTnorm expects raw counts as input data and arcsinh transformation to be performed by ADTnorm internally. If ADTnorm detects that the input count matrix is a non-integer matrix, it will skip the arcsinh transformation. Therefore, users also need to tune the parameters to fit their input transformation.} 43 | 44 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as sample, batch, or other cell-type related information. Please ensure that the cell_x_feature matrix at least contains a sample column with the exact "sample" column name. Please note that "sample" should be the smallest unit to group the cells. At this resolution, ADTnorm will identify peaks and valleys to implement normalization. Please ensure the samples have different names across batches/conditions/studies. "batch" column is optional. It can be batches/conditions/studies etc, that group the samples based on whether the samples are collected from the same batch run or experiment. This column is needed if \code{multi_sample_per_batch} parameter is turned on to remove outlier positive peaks per batch or \code{detect_outlier_valley} for detecting and imputing outlier valleys per batch. If "batch" column is not provided, it will be set as the same as "sample" column. In the intermediate density plots that ADTnorm provides, density plots will be colored by the "batch" column.} 45 | 46 | \item{save_outpath}{The path to save the results.} 47 | 48 | \item{study_name}{Name of this run.} 49 | 50 | \item{marker_to_process}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.} 51 | 52 | \item{exclude_zeroes}{Indicator to consider zeros as NA, i.e., missing values. Recommend TRUE if zeroes in the data represent dropout, likely for large ADT panels, big datasets, or under-sequenced data. The default is FALSE.} 53 | 54 | \item{bimodal_marker}{Specify ADT markers that are likely to have two peaks based on researchers' prior knowledge or preliminary observation of particular data to be processed. Leaving it as default, ADTnorm will try to find the bimodal peak in all markers that are not listed in \code{trimodal_marker.}} 55 | 56 | \item{trimodal_marker}{Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation on particular data to be processed.} 57 | 58 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.} 59 | 60 | \item{bw_smallest_bi}{The smallest bandwidth parameter value for bi-modal peaks. Recommend 1.1.} 61 | 62 | \item{bw_smallest_tri}{The smallest bandwidth parameter value for tri-modal peaks. Recommend the same value for CD4, such as 0.5.} 63 | 64 | \item{bw_smallest_adjustments}{A named list of floats, with names matching marker names, specifying the smallest bandwidth parameter value. The default value is bw_smallest_adjustments = list(CD3 = 0.8, CD4 = 0.8, CD8 = 0.8). Recommend 0.5 or 0.8 for the common multi-modal marker.} 65 | 66 | \item{quantile_clip}{Implement an upper quantile clipping to avoid warping function errors caused by outlier measurements of extremely high expression. Provide the quantile threshold to remove outlier points above such a quantile. The default is 1, meaning no filtering. 0.99 means 99th quantile and points above 99th quantile will be discard.} 67 | 68 | \item{peak_type}{The type of peak to be detected. Select from "midpoint" for setting the peak landmark to the midpoint of the peak region being detected or "mode" for setting the peak landmark to the mode location of the peak. "midpoint" can generally be more robust across samples and less impacted by the bandwidth. "mode" can be more accurate in determining the peak location if the bandwidth is generally ideal for the target marker. The default is "mode".} 69 | 70 | \item{multi_sample_per_batch}{Set it to TRUE to discard the positive peak that only appears in one sample per batch (sample number is >=3 per batch).} 71 | 72 | \item{shoulder_valley}{Indicator to specify whether a shoulder valley is expected in case of the heavy right tail where the population of cells should be considered as a positive population. The default is TRUE.} 73 | 74 | \item{shoulder_valley_slope}{The slope on the ADT marker density distribution to call shoulder valley. Default is -0.5} 75 | 76 | \item{valley_density_adjust}{Parameter for \code{density} function: bandwidth used is adjust*bw. This makes it easy to specify values like 'half the default' bandwidth. The default is 3.} 77 | 78 | \item{landmark_align_type}{Algin the peak and valleys using one of the "negPeak", "negPeak_valley", "negPeak_valley_posPeak", and "valley" alignment modes. The default is "negPeak_valley_posPeak".} 79 | 80 | \item{midpoint_type}{Fill in the missing first valley by the midpoint of two positive peaks ("midpoint") or impute by other valleys ("valley"). The default is "valley".} 81 | 82 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large. The default is asinh(8/5+1) for raw count input. This filtering will be disabled if the input is not raw count data.} 83 | 84 | \item{lower_peak_thres}{The minimal ADT marker density height of calling it a real peak. Set it to 0.01 to avoid a suspicious positive peak. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19. The default is 0.001.} 85 | 86 | \item{brewer_palettes}{Set the color scheme of the color brewer. The default is "Set1".} 87 | 88 | \item{save_landmark}{Save the peak and valley locations in rds format. The default is FALSE.} 89 | 90 | \item{save_fig}{Save the density plot figure for checking the peak and valley location detection. We highly recommend checking the intermediate peak and valley locations identification on those density plots to visually check if the detection is accurate and if manual tuning is needed. The default is TRUE.} 91 | 92 | \item{detect_outlier_valley}{Detect the outlier valley within each batch of samples and impute by the neighbor samples' valley location. For outlier detection methods, choose from "MAD" (Median Absolute Deviation) or "IQR" (InterQuartile Range). Recommend trying "MAD" first if needed. The default is FALSE.} 93 | 94 | \item{target_landmark_location}{Align the landmarks to a fixed location or, by default, align to the mean across samples for each landmark. The default value is NULL. Setting it to "fixed" will align the negative peak to 1 and the right-most positive peak to 5. Users can also assign a two-element vector indicating the location of the negative and most positive peaks to be aligned.} 95 | 96 | \item{clean_adt_name}{Clean the ADT marker name. The default is FALSE.} 97 | 98 | \item{customize_landmark}{By setting it to be TRUE, ADTnorm will trigger the interactive landmark tuning function and pop out a shiny application for the user's manual setting of the peaks and valleys location. We recommend using this function after initial rounds of ADTnorm normalization with a few parameter tuning attempts. It is better to narrow down a few ADT markers that need manual tuning and provide the list to marker_to_process, as the interactive function will pop out for every marker being processed. The default is FALSE.} 99 | 100 | \item{override_landmark}{Override the peak and valley locations if prior information is available or the user wants to manually adjust the peak and valley locations for certain markers. Input is in the format of list, i.e., list(CD3 = list(peak_landmark_list = customized_peak_landmark_list, valley_landmark_list = customized_valley_landmark_list), CD4 = list(peak_landmark_list = customized_peak_landmark_list, valley_landmark_list = customized_valley_landmark_list)). "customized_peak_landmark_list" and "customized_valley_landmark_list" are matrices of customized landmark locations with matching sample names as the rownames. The default is NULL.} 101 | 102 | \item{verbose}{Set the verbosity of the function. The default is FALSE.} 103 | } 104 | \value{ 105 | A data frame list containing normalized count for the ADT markers specified to be normalized. To output the peak and valley locations before and after ADTnorm normalization, please set \code{save_landmark} to TRUE, and the landmarks will be saved as an rds file in the "save_outpath" directory. 106 | } 107 | \description{ 108 | This function removes the technical variations such as batch effect, sequencing depth biases, antibody selection differences and antibody concentration differences, etc. The normalized samples are ready for integration across studies. 109 | } 110 | \examples{ 111 | \dontrun{ 112 | ADTnorm( 113 | cell_x_adt = cell_x_adt, 114 | cell_x_feature = cell_x_feature, 115 | save_outpath = save_outpath, 116 | study_name = study_name, 117 | marker_to_process = c("CD3", "CD4", "CD8") 118 | ) 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /man/arcsinh_transform.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/arcsinh_transform.R 3 | \name{arcsinh_transform} 4 | \alias{arcsinh_transform} 5 | \title{arcsinh transformation.} 6 | \usage{ 7 | arcsinh_transform(cell_x_adt = NULL, parameter_list = NULL) 8 | } 9 | \arguments{ 10 | \item{cell_x_adt}{Matrix where rows are cells and columns are ADT markers.} 11 | 12 | \item{parameter_list}{Parameter list for a: positive double that corresponds to a shift about 0; b: positive double that corresponds to a scale factor; c: positive double. By default a = 1, b = 1/5 and c = 0.} 13 | } 14 | \description{ 15 | This function transforms the input cell_x_adt matrix by arcsinh with co-factor 5. The definition of this function is x_new <- asinh(a + b * x) + c) 16 | } 17 | \examples{ 18 | \dontrun{ 19 | arcsinh_transform(cell_x_adt) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/cell_x_adt.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cell_x_adt} 5 | \alias{cell_x_adt} 6 | \title{A matrix of raw count for the cell by ADT markers} 7 | \format{ 8 | A data frame with 422682 rows and 9 variables: 9 | \describe{ 10 | \item{CD3}{CD3 ADT marker raw count across each cell} 11 | \item{CD4}{CD4 ADT marker raw count across each cell} 12 | \item{CD8}{CD8 ADT marker raw count across each cell} 13 | \item{CD14}{CD14 ADT marker raw count across each cell} 14 | \item{CD19}{CD19 ADT marker raw count across each cell} 15 | \item{CD25}{CD25 ADT marker raw count across each cell} 16 | \item{CD45RA}{CD45RA ADT marker raw count across each cell} 17 | \item{CD56}{CD56 ADT marker raw count across each cell} 18 | \item{CD127}{CD127 ADT marker raw count across each cell} 19 | } 20 | } 21 | \source{ 22 | See detailed description in the manuscript 23 | } 24 | \usage{ 25 | cell_x_adt 26 | } 27 | \description{ 28 | A dataset containing 422682 cells and 9 ADT markers for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets. 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/cell_x_feature.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cell_x_feature} 5 | \alias{cell_x_feature} 6 | \title{A matrix of raw count for the cell by features} 7 | \format{ 8 | A data frame with 422682 rows and 7 variables: 9 | \describe{ 10 | \item{sample}{Sample name. In this demo data, the sample name is the same as the study_name, assuming that one study is one sample.} 11 | \item{batch}{Batch ID. In this demo data, the batch ID is the same as the study_name.} 12 | \item{sample_status}{Sample status, i.e., Healthy, MALTtumor, HIV Vaccine, Lupus, B-ALL, AML.} 13 | \item{study_name}{Name of the data set/study.} 14 | \item{ADTseqDepth}{Total UMI per cell.} 15 | \item{cell_type_l1}{Broad level of cell type annotation using manual gating.} 16 | \item{cell_type_l2}{Fine level of cell type annotation using manual gating.} 17 | } 18 | } 19 | \source{ 20 | See detailed description in the manuscript 21 | } 22 | \usage{ 23 | cell_x_feature 24 | } 25 | \description{ 26 | A dataset containing 422682 cells and 7 feature categories for the CITE-seq raw measurement of 13 publicly available CITE-seq datasets. 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /man/clean_adt_name.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clean_adt_name.R 3 | \name{clean_adt_name} 4 | \alias{clean_adt_name} 5 | \title{Clean ADT marker name} 6 | \usage{ 7 | clean_adt_name(adt_name) 8 | } 9 | \arguments{ 10 | \item{adt_name}{A vector of ADT marker name.} 11 | } 12 | \description{ 13 | This function enables the general cleaning of ADT marker names. Regardless, users should try cleaning and unifying their ADT marker name first. Please also ensure there is no "/" in the ADT name, such as "TCRγ/δ". 14 | } 15 | \examples{ 16 | \dontrun{ 17 | clean_adt_name(colnames(cell_x_adt)) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/detect_impute_outlier_valley.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/detect_impute_outlier_valley.R 3 | \name{detect_impute_outlier_valley} 4 | \alias{detect_impute_outlier_valley} 5 | \title{Identify the valley outliers and impute by valley by closet neighbors.} 6 | \usage{ 7 | detect_impute_outlier_valley( 8 | valley_location_res, 9 | adt_marker_select, 10 | cell_x_adt, 11 | cell_x_feature, 12 | scale = 3, 13 | method = "MAD", 14 | nearest_neighbor_n = 3, 15 | nearest_neighbor_threshold = 0.75 16 | ) 17 | } 18 | \arguments{ 19 | \item{valley_location_res}{Matrix of valley landmark locations with rows being samples and columns being the valleys.} 20 | 21 | \item{adt_marker_select}{The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker.} 22 | 23 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.} 24 | 25 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 26 | 27 | \item{scale}{Scale level to defining outlier. A larger scale value corresponds to more severe outliers.} 28 | 29 | \item{method}{Outlier detection methods, choose from "MAD" (Median Absolute Deviation) or "IQR" (InterQuartile Range). The default is MAD.} 30 | 31 | \item{nearest_neighbor_n}{Number of top nearest neighbor samples to detect.} 32 | 33 | \item{nearest_neighbor_threshold}{Threshold to call neighbor samples.} 34 | } 35 | \description{ 36 | This function identifies the valley(s) that tend to be outliers compared to other valley locations and tries to find the closest samples with similar density distribution to impute the valley. If no neighbor sample is detected, the valley will remain as original. 37 | } 38 | \examples{ 39 | \dontrun{ 40 | detect_impute_outlier_valley(valley_location_res, cell_x_feature) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /man/figures/ADTnorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/ADTnorm.png -------------------------------------------------------------------------------- /man/figures/ArcsinhTransform_log10_CD4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/ArcsinhTransform_log10_CD4.png -------------------------------------------------------------------------------- /man/figures/PublicData_samplelevel_adtnorm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/PublicData_samplelevel_adtnorm.png -------------------------------------------------------------------------------- /man/figures/PublicData_samplelevel_raw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/PublicData_samplelevel_raw.png -------------------------------------------------------------------------------- /man/figures/RawCount.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/RawCount.png -------------------------------------------------------------------------------- /man/figures/ShinyR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/ShinyR.png -------------------------------------------------------------------------------- /man/figures/lower_peak_thres.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/lower_peak_thres.png -------------------------------------------------------------------------------- /man/figures/multi_sample_per_batch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/multi_sample_per_batch.png -------------------------------------------------------------------------------- /man/figures/peak_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/peak_type.png -------------------------------------------------------------------------------- /man/figures/pipeline_202208.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/pipeline_202208.png -------------------------------------------------------------------------------- /man/figures/shoulder_valley.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/shoulder_valley.png -------------------------------------------------------------------------------- /man/figures/shoulder_valley_CD3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/shoulder_valley_CD3.png -------------------------------------------------------------------------------- /man/figures/target_landmark_location.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/target_landmark_location.png -------------------------------------------------------------------------------- /man/figures/valley_density_adjust.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/man/figures/valley_density_adjust.png -------------------------------------------------------------------------------- /man/get_customize_landmark.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_customize_landmark.R 3 | \name{get_customize_landmark} 4 | \alias{get_customize_landmark} 5 | \title{Prompt Shiny browser to manually customize peaks and valleys locations.} 6 | \usage{ 7 | get_customize_landmark( 8 | cell_x_adt_sample, 9 | landmark_pos, 10 | bw, 11 | adt_marker_select_name, 12 | brewer_palettes = "Set1" 13 | ) 14 | } 15 | \arguments{ 16 | \item{cell_x_adt_sample}{Matrix of ADT counts of the selected marker, with columns of sample and batch information for each row of cells.} 17 | 18 | \item{landmark_pos}{Matrix of landmark location including peaks and valleys.} 19 | 20 | \item{bw}{Bandwidth for the density plot.} 21 | 22 | \item{adt_marker_select_name}{The ADT marker needed to be manually processed to set the landmarks.} 23 | 24 | \item{brewer_palettes}{Set the color scheme of the color brewer. The default is "Set1".} 25 | } 26 | \description{ 27 | This function will launch a shiny app allowing the user to set the location of peaks and valleys manually. The function will output the landmark positions that the user has set. 28 | } 29 | \examples{ 30 | \dontrun{ 31 | get_customize_landmark( 32 | cell_x_adt_sample, 33 | landmark_pos, 34 | bw, 35 | adt_marker_select_name, 36 | brewer_palettes = "Set1" 37 | )} 38 | } 39 | -------------------------------------------------------------------------------- /man/get_neighbors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_neighbors.R 3 | \name{get_neighbors} 4 | \alias{get_neighbors} 5 | \title{Find the closest neighbors to impute outlier valleys.} 6 | \usage{ 7 | get_neighbors( 8 | target_sample, 9 | adt_marker_select, 10 | cell_x_adt, 11 | cell_x_feature, 12 | nearest_neighbor_n = 3, 13 | nearest_neighbor_threshold = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{target_sample}{The target sample whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample.} 18 | 19 | \item{adt_marker_select}{The marker whose valley needs to be imputed. Find the neighbor samples whose density distribution is close to the target sample of the same ADT marker.} 20 | 21 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.} 22 | 23 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 24 | 25 | \item{nearest_neighbor_n}{Number of top nearest neighbor samples to detect.} 26 | 27 | \item{nearest_neighbor_threshold}{Threshold to call neighbor samples.} 28 | } 29 | \description{ 30 | This function identifies the valley that tends to be outliers compared to other valley locations and tries to find the closest samples that have similar density distribution to input the valley. If no neighbor sample is detected, the valley will remain as original. 31 | } 32 | \examples{ 33 | \dontrun{ 34 | get_neighbors(target_sample, adt_marker_select, cell_x_adt, cell_x_feature) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /man/get_peak_midpoint.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_peak_midpoint.R 3 | \name{get_peak_midpoint} 4 | \alias{get_peak_midpoint} 5 | \title{Get the peak landmarks location using the peak region midpoint} 6 | \usage{ 7 | get_peak_midpoint( 8 | cell_x_adt = NULL, 9 | cell_x_feature = NULL, 10 | adt_marker_select = NULL, 11 | adt_marker_index = NULL, 12 | bwFac_smallest = 1.1, 13 | bimodal_marker_index = NULL, 14 | trimodal_marker_index = NULL, 15 | positive_peak = NULL, 16 | neg_candidate_thres = asinh(10/5 + 1), 17 | lower_peak_thres = 0.001, 18 | arcsinh_transform_flag = TRUE 19 | ) 20 | } 21 | \arguments{ 22 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.} 23 | 24 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 25 | 26 | \item{adt_marker_select}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.} 27 | 28 | \item{adt_marker_index}{Index of the ADT markers that will be normalized. Leaving empty if adt_marker_select is not specified.} 29 | 30 | \item{bwFac_smallest}{The smallest bandwidth parameter value. Recommend 1.1 for general bi-modal ADT markers except for CD3, CD4, and CD8.} 31 | 32 | \item{bimodal_marker_index}{Index of the ADT markers that tend to have two peaks based on researchers' prior knowledge or preliminary observation on particular data to be processed.} 33 | 34 | \item{trimodal_marker_index}{Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation on particular data to be processed.} 35 | 36 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.} 37 | 38 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.} 39 | 40 | \item{lower_peak_thres}{The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid a suspicious positive peak. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19.} 41 | 42 | \item{arcsinh_transform_flag}{The flag indicates if the input is raw count and arcsinh transformation is implemented.} 43 | } 44 | \description{ 45 | This function detects the peak landmark locations for each sample per ADT markers based on the midpoint of the peak region. Using the peak midpoint instead of the peak mode can be more stable across samples and less affected by the bandwidth. 46 | } 47 | \examples{ 48 | \dontrun{ 49 | get_peak_midpoint( 50 | cell_x_adt = cell_x_adt, 51 | cell_x_feature = cell_x_feature, 52 | adt_marker_select = "CD3", 53 | neg_candidate_thres = asinh(6/5+1) 54 | ) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /man/get_peak_mode.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_peak_mode.R 3 | \name{get_peak_mode} 4 | \alias{get_peak_mode} 5 | \title{Get the peak landmarks location using the peak mode} 6 | \usage{ 7 | get_peak_mode( 8 | cell_x_adt = NULL, 9 | cell_x_feature = NULL, 10 | adt_marker_select = NULL, 11 | adt_marker_index = NULL, 12 | bwFac_smallest = 1.1, 13 | bimodal_marker_index = NULL, 14 | trimodal_marker_index = NULL, 15 | positive_peak = NULL, 16 | neg_candidate_thres = asinh(10/5 + 1), 17 | lower_peak_thres = 0.001, 18 | arcsinh_transform_flag = TRUE 19 | ) 20 | } 21 | \arguments{ 22 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.} 23 | 24 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 25 | 26 | \item{adt_marker_select}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.} 27 | 28 | \item{adt_marker_index}{Index of the ADT markers that will be normalized. Leaving empty if adt_marker_select is not specified.} 29 | 30 | \item{bwFac_smallest}{The smallest bandwidth parameter value. Recommend 1.1 for general bi-modal ADT markers except for CD3, CD4, and CD8.} 31 | 32 | \item{bimodal_marker_index}{Index of the ADT markers that tend to have two peaks based on researchers' prior knowledge or preliminary observation on particular data to be processed.} 33 | 34 | \item{trimodal_marker_index}{Index of the ADT markers that tend to have three peaks based on researchers' prior knowledge (e.g., CD4) or preliminary observation on particular data to be processed.} 35 | 36 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.} 37 | 38 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.} 39 | 40 | \item{lower_peak_thres}{The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid suspicious positive peaks. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19.} 41 | 42 | \item{arcsinh_transform_flag}{The flag indicates if the input is raw count and arcsinh transformation is implemented.} 43 | } 44 | \description{ 45 | This function detects the peak mode locations for each sample per ADT markers. Using the peak mode instead of the peak region midpoint can be more accurate in determining the peak location if the bandwidth is generally proper and the local peak density is not too discrete. 46 | } 47 | \examples{ 48 | \dontrun{ 49 | get_peak_mode( 50 | cell_x_adt = cell_x_adt, 51 | cell_x_feature = cell_x_feature, 52 | adt_marker_select = "CD3", 53 | neg_candidate_thres = asinh(6/5+1) 54 | ) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /man/get_valley_location.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_valley_location.R 3 | \name{get_valley_location} 4 | \alias{get_valley_location} 5 | \title{Get the valley landmark locations.} 6 | \usage{ 7 | get_valley_location( 8 | cell_x_adt = NULL, 9 | cell_x_feature = NULL, 10 | adt_marker_select = NULL, 11 | peak_mode_res = NULL, 12 | shoulder_valley = TRUE, 13 | positive_peak = NULL, 14 | multi_sample_per_batch = FALSE, 15 | adjust = 1.5, 16 | min_fc = 20, 17 | shoulder_valley_slope = -1, 18 | lower_peak_thres = 0.01, 19 | neg_candidate_thres = asinh(10/5 + 1), 20 | arcsinh_transform_flag = TRUE 21 | ) 22 | } 23 | \arguments{ 24 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.} 25 | 26 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 27 | 28 | \item{adt_marker_select}{Markers to normalize. Leaving empty to process all the ADT markers in the cell_x_adt matrix.} 29 | 30 | \item{peak_mode_res}{The peak landmark results coming out of \code{get_peak_mode} or \code{get_peak_midpoint} function.} 31 | 32 | \item{shoulder_valley}{Indictor to specify whether a shoulder valley is expected in case of the heavy right tail where the population of cells should be considered as positive population.} 33 | 34 | \item{positive_peak}{A list variable containing a vector of ADT marker(s) and a corresponding vector of sample name(s) in matching order to specify that the uni-peak detected should be aligned to positive peaks. For example, for samples that only contain T cells. The only CD3 peak should be aligned with the positive peaks of other samples.} 35 | 36 | \item{multi_sample_per_batch}{Set it to TRUE to discard the positive peak that only appears in one sample per batch (sample number is >=3 per batch).} 37 | 38 | \item{adjust}{Parameter for \code{density} function: bandwidth used is actually adjust*bw. This makes it easy to specify values like 'half the default' bandwidth.} 39 | 40 | \item{min_fc}{Mimimal fold change between the highest peak density height and candidate valley density height. The default is 20.} 41 | 42 | \item{shoulder_valley_slope}{The slope on the ADT marker density distribution to call shoulder valley.} 43 | 44 | \item{lower_peak_thres}{The minimal ADT marker density height to call it a real peak. Set it to 0.01 to avoid suspicious positive peaks. Set it to 0.001 or smaller to include some small but tend to be real positive peaks, especially for markers like CD19.} 45 | 46 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.} 47 | 48 | \item{arcsinh_transform_flag}{The flag indicates if the input is raw count and arcsinh transformation is implemented.} 49 | } 50 | \description{ 51 | This function detects the valley locations either between every two peak landmarks or cut at the right heavy tails. If the specified positive uni-peak, the valley location will be set at the left side of the uni-peak. 52 | } 53 | \examples{ 54 | \dontrun{ 55 | get_valley_location(cell_x_adt, cell_x_feature, peak_mode_res) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /man/landmark_fill_na.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/landmark_fill_na.R 3 | \name{landmark_fill_na} 4 | \alias{landmark_fill_na} 5 | \title{Merge locations of peak and valley landmarks} 6 | \usage{ 7 | landmark_fill_na( 8 | peak_landmark_list = NULL, 9 | valley_landmark_list = NULL, 10 | landmark_align_type = NULL, 11 | midpoint_type = "valley", 12 | neg_candidate_thres = asinh(10/5 + 1) 13 | ) 14 | } 15 | \arguments{ 16 | \item{peak_landmark_list}{Matrix of peak landmark detection results. Rows are samples, and column(s) are the peak locations.} 17 | 18 | \item{valley_landmark_list}{Matrix of valley landmark detection results. Rows are samples, and column(s) are the valley locations.} 19 | 20 | \item{landmark_align_type}{Algin the peak and valleys using one of the "negPeak", "negPeak_valley", "negPeak_valley_posPeak", and "valley" alignment modes.} 21 | 22 | \item{midpoint_type}{Fill in the missing first valley by the midpoint of two positive peaks ("midpoint") or impute by other valleys ("valley").} 23 | 24 | \item{neg_candidate_thres}{The upper bound for the negative peak. Users can refer to their IgG samples to obtain the minimal upper bound of the IgG sample peak. It can be one of the values of asinh(4/5+1), asinh(6/5+1), or asinh(8/5+1) if the right 95\% quantile of IgG samples is large.} 25 | } 26 | \description{ 27 | This function merges the peak and valley landmarks locations and fills in NA if the landmark is not detected. 28 | } 29 | \examples{ 30 | \dontrun{ 31 | landmark_fill_na( 32 | peak_landmark_list = peak_landmark_list, 33 | valley_landmark_list = valley_landmark_list, 34 | landmark_align_type = "negPeak_valley_posPeak" 35 | ) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /man/load_landmarks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_landmarks.R 3 | \name{load_landmarks} 4 | \alias{load_landmarks} 5 | \title{Load saved landmarks into a list of matrices, as required for input into "override_landmark".} 6 | \usage{ 7 | load_landmarks(dir, append_rds = TRUE) 8 | } 9 | \arguments{ 10 | \item{dir}{Directory where landmark location .rds files are stored} 11 | 12 | \item{append_rds}{Whether to append "/RDS" to the provided directory path. The default is TRUE, which aligns with ADTnorm's default save location.} 13 | } 14 | \description{ 15 | Load saved landmarks into a list of matrices, as required for input into "override_landmark". 16 | } 17 | \examples{ 18 | \dontrun{ 19 | override_landmark = load_landmarks(dir = save_outpath) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/peak_alignment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/peak_alignment.R 3 | \name{peak_alignment} 4 | \alias{peak_alignment} 5 | \title{Align the peak and valley landmarks by the warpset function} 6 | \usage{ 7 | peak_alignment( 8 | cell_x_adt, 9 | cell_x_feature = NULL, 10 | landmark_matrix = NULL, 11 | target_landmark = NULL 12 | ) 13 | } 14 | \arguments{ 15 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.} 16 | 17 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 18 | 19 | \item{landmark_matrix}{Matrix of peak and valley landmarks after filling in NA using the \code{landmark_fill_na} function.} 20 | 21 | \item{target_landmark}{Leave it as NULL to align the landmark to the mean location across samples. Denote it by a vector of the same length as the column number of the landmark to align the negative peak, valley, and positive peak(s) to the specified fixed location.} 22 | } 23 | \description{ 24 | This function monotonously transforms the ADT marker counts to align the landmarks detected in previous steps. By aligning the landmarks, ADTnorm removes the batch effect and allows integration across batches/studies. 25 | } 26 | \examples{ 27 | \dontrun{ 28 | peak_alignment(cell_x_adt, cell_x_feature, landmark_matrix) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/plot_adt_density_each.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_adt_density_each.R 3 | \name{plot_adt_density_each} 4 | \alias{plot_adt_density_each} 5 | \title{Plot the expression density profile for ONE ADT marker} 6 | \usage{ 7 | plot_adt_density_each( 8 | adt_count, 9 | cell_x_feature, 10 | brewer_palettes, 11 | parameter_list = NULL 12 | ) 13 | } 14 | \arguments{ 15 | \item{adt_count}{Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format.} 16 | 17 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 18 | 19 | \item{brewer_palettes}{Set the color scheme of the color brewer.} 20 | 21 | \item{parameter_list}{Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.} 22 | } 23 | \description{ 24 | This function plots the ADT expression density profile for only one ADT marker. Each track is a sample. Color by batch 25 | } 26 | \examples{ 27 | \dontrun{ 28 | plot_adt_density_each( 29 | cell_x_adt, 30 | cell_x_feature, 31 | brewer_palettes = "Set1", 32 | parameter_list = list(bw = 0.1, run_label = "ADTnorm") 33 | ) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /man/plot_adt_density_with_peak_valley.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_adt_density_with_peak_valley.R 3 | \name{plot_adt_density_with_peak_valley} 4 | \alias{plot_adt_density_with_peak_valley} 5 | \title{Plot ADT marker expression density profile with identified peak and valley locations} 6 | \usage{ 7 | plot_adt_density_with_peak_valley( 8 | cell_x_adt, 9 | cell_x_feature, 10 | adt_marker_select = NULL, 11 | peak_landmark_list, 12 | valley_landmark_list, 13 | brewer_palettes = "Set1", 14 | parameter_list = NULL 15 | ) 16 | } 17 | \arguments{ 18 | \item{cell_x_adt}{Matrix of ADT raw counts in cells (rows) by ADT markers (columns) format.} 19 | 20 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 21 | 22 | \item{adt_marker_select}{The target ADT marker(s) that the denstiy plot is about. Leave it NULL will generate figures for all the ADT markers available in cell_x_adt.} 23 | 24 | \item{peak_landmark_list}{Matrix of peak landmark locations with rows being samples and columns being the peaks.} 25 | 26 | \item{valley_landmark_list}{Matrix of valley landmark locations with rows being samples and columns being the valleys.} 27 | 28 | \item{brewer_palettes}{Set the color scheme of the color brewer.} 29 | 30 | \item{parameter_list}{Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.} 31 | } 32 | \description{ 33 | This function plots the ADT expression density profile with identified peak and valley locations. Each panel is an ADT marker, and each track is a sample. Color by batch 34 | } 35 | \examples{ 36 | \dontrun{ 37 | plot_adt_density_with_peak_valley( 38 | cell_x_adt, 39 | cell_x_feature, 40 | adt_marker_select = c("CD3", "CD4", "CD8", "CD19"), 41 | peak_landmark_list = peak_mode_norm_res, 42 | valley_landmark_list = valley_location_norm_res, 43 | brewer_palettes = "Set1", 44 | parameter_list = list(bw = 0.1, run_label = "ADTnorm") 45 | ) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /man/plot_adt_density_with_peak_valley_each.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_adt_density_with_peak_valley_each.R 3 | \name{plot_adt_density_with_peak_valley_each} 4 | \alias{plot_adt_density_with_peak_valley_each} 5 | \title{Plot the expression density profile for ONE ADT marker with identified peak and valley locations} 6 | \usage{ 7 | plot_adt_density_with_peak_valley_each( 8 | adt_count, 9 | cell_x_feature, 10 | peak_landmark_list, 11 | valley_landmark_list, 12 | brewer_palettes = "Set1", 13 | parameter_list = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{adt_count}{Matrix of ADT raw counts in cells (rows) by one target ADT marker (column) format.} 18 | 19 | \item{cell_x_feature}{Matrix of cells (rows) by cell features (columns) such as cell type, sample, and batch-related information.} 20 | 21 | \item{peak_landmark_list}{Matrix of peak landmark locations with rows being samples and columns being the peaks.} 22 | 23 | \item{valley_landmark_list}{Matrix of valley landmark locations with rows being samples and columns being the valleys.} 24 | 25 | \item{brewer_palettes}{Set the color scheme of the color brewer.} 26 | 27 | \item{parameter_list}{Users can specify: "run_label" to give a name for this run; "bw" to adjust the bandwidth of the density plot.} 28 | } 29 | \description{ 30 | This function plots the ADT expression density profile with identified peak and valley locations for only one ADT marker. Each track is a sample. Color by batch 31 | } 32 | \examples{ 33 | \dontrun{ 34 | plot_adt_density_with_peak_valley_each( 35 | cell_x_adt, 36 | cell_x_feature, 37 | peak_landmark_list = peak_mode_norm_res, 38 | valley_landmark_list = valley_location_norm_res, 39 | brewer_palettes = "Set1", 40 | parameter_list = list(bw = 0.1, run_label = "ADTnorm") 41 | ) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ComparativeAnalysis/WNN_integrateRNA_ADT.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | library(Seurat) 5 | library(dplyr) 6 | library(ggplot2) 7 | library(httpgd) 8 | 9 | method <- args[1] #"Arcsinh_b5" 10 | run_name = args[2] #"publicData_CITEseq" 11 | # run_name = "public13Dataset_CITEseq" 12 | 13 | master_path = "./" 14 | in_path = "/publicData_CITEseq/data/" 15 | out_path = paste0(master_path, "manuscript/results/", run_name) 16 | fig_path = paste0(out_path, "/Figures/") 17 | 18 | ## load data directly 19 | marker_list = c("CD3", "CD4", "CD8", "CD14", "CD19", "CD25", "CD45RA", "CD56", "CD127") 20 | adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", run_name, ".rds")) 21 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds")) 22 | 23 | ## RPCA integration methods 24 | # creates a Seurat object based on the CITE-seq data 25 | rna_data = readRDS(file = './results/publicData_CITEseq/RDS/rna_data_common_RawCount_public13Dataset_CITEseq.rds') 26 | citeseq_obj = CreateSeuratObject(counts = t(rna_data)) 27 | citeseq_obj = AddMetaData(citeseq_obj, metadata = adt_feature) 28 | 29 | cite_list = SplitObject(citeseq_obj, split.by = "study_name") 30 | cite_list <- lapply(X = cite_list, FUN = function(x) { 31 | x <- NormalizeData(x) 32 | x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 5000) ## 33 | }) 34 | 35 | # select features that are repeatedly variable across datasets for integration run PCA on each 36 | # dataset using these features 37 | features <- SelectIntegrationFeatures(object.list = cite_list) 38 | cite_list <- lapply(X = cite_list, FUN = function(x) { 39 | x <- ScaleData(x, features = features, verbose = FALSE) 40 | x <- RunPCA(x, features = features, verbose = FALSE) 41 | }) 42 | immune.anchors <- FindIntegrationAnchors(object.list = cite_list, anchor.features = features, reduction = "rpca") ## 43 | # this command creates an 'integrated' data assay 44 | immune.combined <- IntegrateData(anchorset = immune.anchors) 45 | # specify that we will perform downstream analysis on the corrected data note that the 46 | # original unmodified data still resides in the 'RNA' assay 47 | DefaultAssay(immune.combined) <- "integrated" 48 | 49 | # Run the standard workflow for visualization and clustering 50 | immune.combined <- ScaleData(immune.combined, verbose = FALSE) 51 | immune.combined <- RunPCA(immune.combined, npcs = 100, verbose = FALSE) 52 | immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:50) 53 | saveRDS(immune.combined, file = paste0(out_path, "/RDS/citeseq_obj_RPCA_", run_name, ".rds")) 54 | 55 | citeNorm = readRDS(file = paste0(out_path, "/RDS/citeseq_obj_RPCA_", run_name, ".rds")) 56 | 57 | adt_data_adtnorm = readRDS(file = paste0(out_path, "/RDS/adt_data_norm_", method, "_", run_name, ".rds")) 58 | adt_data_adtnorm[is.na(adt_data_adtnorm)] = 0 59 | 60 | rownames(adt_data_adtnorm) = colnames(citeNorm) 61 | adt_assay <- CreateAssay5Object(counts = t(adt_data_adtnorm), data = t(adt_data_adtnorm)) 62 | citeNorm[["ADT"]] <- adt_assay 63 | DefaultAssay(citeNorm) <- 'ADT' 64 | VariableFeatures(citeNorm) <- rownames(citeNorm[["ADT"]]) 65 | ## set scale.data 66 | if(method %in% c("fastMNN_study", "fastMNN_sample", "logCPM", "decontPro")){ 67 | citeNorm = ScaleData(citeNorm) 68 | }else{ 69 | citeNorm <- SetAssayData(citeNorm, assay = "ADT", slot = "scale.data", new.data = t(adt_data_adtnorm)) 70 | } 71 | citeNorm <- RunPCA(citeNorm, reduction.name = 'apca') 72 | 73 | DefaultAssay(citeNorm) <- 'integrated' 74 | citeNorm <- FindMultiModalNeighbors( 75 | citeNorm, reduction.list = list("pca", "apca"), 76 | dims.list = list(1:15, 1:8), modality.weight.name = "RNA.weight", k.nn = 30 77 | ) 78 | 79 | citeNorm <- RunUMAP(citeNorm, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_", min.dist = 0.01) 80 | citeNorm <- FindClusters(citeNorm, graph.name = "wsnn", algorithm = 1, resolution = 0.15, verbose = FALSE) 81 | saveRDS(citeNorm, file = paste0(out_path, "/RDS/citeseq_obj_RPCA_citeNorm_WNN_15_8_withCluster_noScalePCA_nneighbor20_mindist0.01_", run_name, "_", method, ".rds")) 82 | 83 | ans_umap = Embeddings(citeNorm, reduction = "wnn.umap") %>% data.frame 84 | colnames(ans_umap) = c("X1", "X2") 85 | adt_feature$seurat_clusters = citeNorm@meta.data$seurat_clusters 86 | 87 | 88 | pdf(paste0("./manuscript/results/public13Dataset_CITEseq/Figures/WNN/citeseq_obj_RPCA_citeNorm_WNN_15_8_withCluster_noScalePCA_nneighbor20_mindist0.01_", run_name, "_", method, ".pdf"), width = 15, height = 11) 89 | 90 | reindex1 = which(adt_feature$cell_type_l1 == "undefined") 91 | reindex2 = which(adt_feature$cell_type_l1 != "undefined") 92 | reindex = c(reindex1, reindex2) 93 | 94 | # print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 95 | # target_feature = "seurat_clusters", 96 | # color_design = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$seurat_clusters %>% unique)), 97 | # method_label = method 98 | # )) #+ scale_color_brewer(palette = "Dark2") 99 | # ) 100 | 101 | 102 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 103 | target_feature = "sample", 104 | color_design = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$sample %>% unique)), 105 | method_label = method 106 | )) #+ scale_color_brewer(palette = "Dark2") 107 | ) 108 | 109 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 110 | target_feature = "study_name", 111 | color_design = colorRampPalette(brewer.pal(8, "Dark2"))(length(adt_feature$study_name %>% unique)), 112 | method_label = method 113 | ))) 114 | 115 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 116 | target_feature = "sample_status", 117 | color_design = colorRampPalette(brewer.pal(8, "Set1"))(length(adt_feature$sample_status %>% unique)), 118 | method_label = method 119 | ))) 120 | 121 | adt_feature$cell_type_l1 = factor(adt_feature$cell_type_l1, levels = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined")) 122 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 123 | target_feature = "cell_type_l1", 124 | color_design = c("#B00000", "#3F918B", "#896191", "#FF980A", "#226fa7", "#F781BF", "grey"), 125 | #color_design = c(colorRampPalette(brewer.pal(8, "Set1"))(6), "grey"), 126 | color_break = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined"), 127 | method_label = method 128 | ))) 129 | 130 | adt_feature$cell_type_l2 = factor(adt_feature$cell_type_l2, levels = c("naive B", "memory B", 131 | "naive CD4", "memory CD4", "Treg", 132 | "naive CD8", "memory CD8", 133 | "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte", 134 | "CD16- NK", "CD16+ NK", 135 | "myeloid DC", "plasmacytoid DC", "undefined")) 136 | print(plot_umap_raster(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 137 | target_feature = "cell_type_l2", 138 | color_design = c( 139 | "#B00000", "#FF3380", 140 | "#9ef954", "#3F918B", "#03500e", 141 | "#896191", "#350154", 142 | "#FF980A", "#A78300", "#DB6400", 143 | "#1000a0", "#226fa7", 144 | "#F781BF", "#7B0054", "grey"), 145 | #color_design = c(colorRampPalette(brewer.pal(8, "Dark2"))(14), "grey"), 146 | color_break = c( 147 | "naive B", "memory B", 148 | "naive CD4", "memory CD4", "Treg", 149 | "naive CD8", "memory CD8", 150 | "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte", 151 | "CD16- NK", "CD16+ NK", 152 | "myeloid DC", "plasmacytoid DC", "undefined"), 153 | method_label = method 154 | ))) 155 | 156 | dev.off() 157 | 158 | pdf(paste0("./manuscript/results/public13Dataset_CITEseq/Figures/WNN/citeseq_obj_RPCA_citeNorm_WNN_15_8_withCluster_noScalePCA_nneighbor20_mindist0.01_", run_name, "_", method, "_weights.pdf"), width = 18, height = 9) 159 | VlnPlot(citeNorm, features = "ADT.weight", group.by = 'cell_type_l1', sort = TRUE, pt.size = 0) + 160 | scale_fill_manual(values = c("#B00000", "#3F918B", "#896191", "#FF980A", "#226fa7", "#F781BF", "grey"), 161 | breaks = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined")) 162 | VlnPlot(citeNorm, features = "ADT.weight", group.by = 'cell_type_l2', sort = TRUE, pt.size = 0) + 163 | scale_fill_manual(values = c( 164 | "#B00000", "#FF3380", 165 | "#9ef954", "#3F918B", "#03500e", 166 | "#896191", "#350154", 167 | "#FF980A", "#A78300", "#DB6400", 168 | "#1000a0", "#226fa7", 169 | "#F781BF", "#7B0054", "grey"), 170 | breaks = c( 171 | "naive B", "memory B", 172 | "naive CD4", "memory CD4", "Treg", 173 | "naive CD8", "memory CD8", 174 | "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte", 175 | "CD16- NK", "CD16+ NK", 176 | "myeloid DC", "plasmacytoid DC", "undefined")) 177 | VlnPlot(citeNorm, features = "ADT.weight", group.by = 'seurat_clusters', sort = TRUE, pt.size = 0) + 178 | scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$seurat_clusters %>% unique))) 179 | dev.off() 180 | 181 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ComparativeAnalysis/evaluate_norm.R: -------------------------------------------------------------------------------- 1 | ## evaluation using ARI, silhouette score on cell types and batches, LISI on batches 2 | 3 | library(data.table) 4 | library(gridExtra) 5 | library(mclust) 6 | library(cluster) 7 | library(stringr) 8 | library(harmony) 9 | library(Seurat) 10 | library(pdfCluster) 11 | library(dplyr) 12 | library(umap) 13 | # library(lisi) 14 | 15 | mlgFindNeighbors = function(latent){ 16 | rownames(latent) = paste0("cell", 1:nrow(latent)) 17 | NN_graph = FindNeighbors(latent, verbose = FALSE, nn.method = "rann") ## "annoy" 18 | G = NN_graph$snn 19 | return(G) 20 | } 21 | mlgARI = function(embed, cluster_label){ ## louvain clustering using resolution 0.8 22 | cluster = cluster_label 23 | G = mlgFindNeighbors(embed) 24 | mlgc = FindClusters(G, resolution = 0.8, random.seed = 20220525, verbose = FALSE)[[1]] #0.8 25 | mlgARI = adjustedRandIndex(mlgc, cluster) 26 | return(mlgARI) 27 | } 28 | # evaluationLISI = function(embed, summary){ ## run it using conda environment 29 | # if ("batch" %in% colnames(summary)) { 30 | # return(compute_lisi(embed, summary, 'batch')$batch) 31 | # } else { 32 | # return(rep(NA, nrow(summary))) 33 | # } 34 | # } 35 | 36 | 37 | ## latent can be PCA 38 | ## latent_type = "PCA" or "UMAP" 39 | ## evaluation_method = "ARI", "Si" 40 | ## label_type = "cell_type" or "batch" 41 | ## cluster_label = c(......) cell types of batches of each cell 42 | evaluation_norm = function(latent = NULL, latent_type = NULL, evaluation_method = NULL, cluster_label = NULL){ 43 | if(evaluation_method == "ARI"){ 44 | return(mlgARI(latent, cluster_label)) 45 | } 46 | 47 | if(evaluation_method == "Si"){ 48 | if(latent_type %in% c("UMAP", "TSNE")){ 49 | clusterNum = factor(cluster_label, levels = unique(cluster_label), label = 1:length(unique(cluster_label))) %>% as.numeric 50 | score = silhouette(clusterNum, dist(as.matrix(latent), method = "manhattan"))[,3] # 51 | return(score) 52 | }else{ 53 | if(latent_type == "PCA"){ 54 | embed = umap(latent)$layout 55 | clusterNum = factor(cluster_label, levels = unique(cluster_label), label = 1:length(unique(cluster_label))) %>% as.numeric 56 | score = silhouette(clusterNum, dist(as.matrix(embed)))[,3] 57 | return(score) 58 | } 59 | } 60 | 61 | } 62 | 63 | # if(evaluation_method == "LISI"){ 64 | # return(evaluationLISI(latent, data.frame(batch = cluster_label))) 65 | # } 66 | } 67 | 68 | 69 | args = commandArgs(trailingOnly=TRUE) 70 | 71 | method = args[1] 72 | evaluation_type = args[2] 73 | study_name = args[3] #"public13Dataset_CITEseq" 74 | out_path = "./manuscript/results/public13Dataset_CITEseq/" 75 | 76 | # adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", study_name, ".rds")) 77 | if(method %in% c("totalVI_sample_GPU", "totalVI_study_GPU", "sciPENN_sample_GPU", "sciPENN_study_GPU")){ 78 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", method, "_", study_name, ".rds")) 79 | 80 | }else{ 81 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", study_name, ".rds")) 82 | } 83 | 84 | tmp = readRDS( file = paste0(out_path, "/RDS/adt_", method, "_", study_name, "_pca_umap.rds")) 85 | ans_pca = tmp[[1]] 86 | ans_umap = tmp[[2]] 87 | dim(ans_pca) 88 | dim(ans_umap) 89 | 90 | seed_list = c( 91 | 20220524, 20230817, 20200502, 20230801, 20190826, 20220525, 20230818, 20200503, 20230829, 20190827, 92 | 20137910, 20349283, 24918503, 38298493, 150, 2940, 10, 302934245, 3428, 9123784 93 | ) 94 | 95 | # subsample_index = sample(1:nrow(ans_umap), 50000, replace = FALSE) 96 | generate_index = function(seed){ 97 | set.seed(seed) 98 | subsample_index = c() 99 | for(study_each in unique(adt_feature$study_name)){ 100 | study_index = which(adt_feature$study_name == study_each) 101 | if(length(study_index) < 5000){ 102 | subsample_index = c(subsample_index, study_index) 103 | }else( 104 | subsample_index = c(subsample_index, sample(study_index, 5000, replace = FALSE)) 105 | ) 106 | } 107 | print(length(subsample_index)) 108 | 109 | return(subsample_index) 110 | } 111 | 112 | 113 | ## Silhouette score 114 | if(evaluation_type == "Si"){ 115 | print("Silhouette score...") 116 | si_score_sample = c() 117 | si_score_study = c() 118 | si_score_broadCT = c() 119 | si_score_refineCT = c() 120 | 121 | for (seed in seed_list){ 122 | subsample_index = generate_index(seed) 123 | # latent = ans_pca[subsample_index, 1:min(ncol(ans_pca), 50)] #ans_umap[subsample_index, ] 124 | latent = ans_umap[subsample_index, ] #ans_umap[subsample_index, ] 125 | 126 | cluster_label = adt_feature$sample[subsample_index] %>% as.character 127 | si_score_sample = evaluation_norm(latent, "UMAP", "Si", cluster_label) %>% rbind(si_score_sample, .) 128 | 129 | cluster_label = adt_feature$study_name[subsample_index] %>% as.character 130 | si_score_study = evaluation_norm(latent, "UMAP", "Si", cluster_label) %>% rbind(si_score_study, .) 131 | 132 | cluster_label = adt_feature$cell_type_l2[subsample_index] %>% as.character 133 | cell_select = which(cluster_label != "undefined") 134 | cluster_label = cluster_label[cell_select] 135 | si_score_refineCT = evaluation_norm(latent[cell_select, ], "UMAP", "Si", cluster_label) %>% rbind(si_score_refineCT, .) 136 | 137 | cluster_label = adt_feature$cell_type_l1[subsample_index] %>% as.character 138 | cell_select = which(cluster_label != "undefined") 139 | cluster_label = cluster_label[cell_select] 140 | si_score_broadCT = evaluation_norm(latent[cell_select, ], "UMAP", "Si", cluster_label) %>% rbind(si_score_broadCT, .) 141 | saveRDS(list(si_score_sample, si_score_study, si_score_refineCT, si_score_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_umap_si.rds")) 142 | 143 | 144 | } 145 | print(rowMeans(si_score_sample)) 146 | print(rowMeans(si_score_study)) 147 | print(rowMeans(si_score_broadCT)) 148 | print(rowMeans(si_score_refineCT)) 149 | 150 | # saveRDS(list(si_score_sample, si_score_study, si_score_refineCT, si_score_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_umap_si.rds")) 151 | 152 | } 153 | 154 | ## ARI 155 | if(evaluation_type == "ARI"){ 156 | print("ARI score") 157 | 158 | ari_sample = c() 159 | ari_study = c() 160 | ari_refineCT = c() 161 | ari_broadCT = c() 162 | 163 | for(seed in seed_list){ 164 | print(seed) 165 | subsample_index = generate_index(seed) 166 | latent = ans_pca[subsample_index, 1:min(ncol(ans_pca), 50)] 167 | 168 | cluster_label = adt_feature$sample[subsample_index] %>% as.character 169 | ari_sample = c(ari_sample, evaluation_norm(latent, "UMAP", "ARI", cluster_label)) 170 | 171 | cluster_label = adt_feature$study_name[subsample_index] %>% as.character 172 | ari_study = c(ari_study, evaluation_norm(latent, "UMAP", "ARI", cluster_label)) 173 | 174 | cluster_label = adt_feature$cell_type_l2[subsample_index] %>% as.character 175 | ari_refineCT = c(ari_refineCT, evaluation_norm(latent, "UMAP", "ARI", cluster_label)) 176 | 177 | cluster_label = adt_feature$cell_type_l1[subsample_index] %>% as.character 178 | ari_broadCT = c(ari_broadCT, evaluation_norm(latent, "UMAP", "ARI", cluster_label)) 179 | saveRDS(list(ari_sample, ari_study, ari_refineCT, ari_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_ari.rds")) 180 | 181 | } 182 | print(ari_sample) 183 | print(ari_study) 184 | print(ari_refineCT) 185 | print(ari_broadCT) 186 | 187 | saveRDS(list(ari_sample, ari_study, ari_refineCT, ari_broadCT), paste0(out_path, "/RDS/adt_", method, "_", study_name, "_ari.rds")) 188 | } 189 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ComparativeAnalysis/publicData_CITEseq_umap.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | library(dplyr) 5 | library(umap) 6 | library(viridis) 7 | 8 | 9 | method <- args[1] #"Arcsinh_b5" 10 | run_name = args[2] #"publicData_CITEseq" 11 | out_path = "./manuscript/results/public13Dataset_CITEseq" 12 | # adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", run_name, ".rds")) 13 | # adt_feature = readRDS(file = paste0("./results/publicData_CITEseq/RDS/adt_feature_", run_name, ".rds")) 14 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds")) 15 | 16 | ans = readRDS(file = paste0(out_path, "/RDS/adt_data_norm_", method, "_", run_name, ".rds")) 17 | ans[is.na(ans)] = 0 18 | ans_pca = prcomp(ans)$x 19 | print("Start UMAP") 20 | ans_umap = umap(ans_pca[, 1:min(ncol(ans), 50)])$layout %>% data.frame 21 | saveRDS(list(ans_pca, ans_umap), file = paste0(out_path, "/RDS/adt_", method, "_", run_name, "_pca_umap.rds")) 22 | 23 | # tmp = readRDS(file = paste0(out_path, "/RDS/adt_", method, "_", run_name, "_pca_umap.rds")) 24 | # ans_umap = tmp[[2]] 25 | 26 | 27 | print("Start plotting") 28 | pdf(paste0(out_path, "/Figures/adt_", method, "_", run_name, "_umap.pdf"), width = 15, height = 15) 29 | reindex1 = which(adt_feature$cell_type_l1 == "undefined") 30 | reindex2 = which(adt_feature$cell_type_l1 != "undefined") 31 | reindex = c(reindex1, reindex2) 32 | 33 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 34 | target_feature = "sample", 35 | color_design = colorRampPalette(brewer.pal(8, "Spectral"))(length(adt_feature$sample %>% unique)), 36 | method_label = method 37 | )) #+ scale_color_brewer(palette = "Dark2") 38 | ) 39 | 40 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 41 | target_feature = "study_name", 42 | color_design = colorRampPalette(brewer.pal(8, "Dark2"))(length(adt_feature$study_name %>% unique)), 43 | method_label = method 44 | ))) 45 | 46 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 47 | target_feature = "sample_status", 48 | color_design = colorRampPalette(brewer.pal(8, "Set1"))(length(adt_feature$sample_status %>% unique)), 49 | method_label = method 50 | ))) 51 | 52 | adt_feature$cell_type_l1 = factor(adt_feature$cell_type_l1, levels = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined")) 53 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 54 | target_feature = "cell_type_l1", 55 | color_design = c("#B00000", "#3F918B", "#896191", "#FF980A", "#226fa7", "#F781BF", "grey"), 56 | #color_design = c(colorRampPalette(brewer.pal(8, "Set1"))(6), "grey"), 57 | color_break = c("B", "CD4 T", "CD8 T", "monocytes", "NK", "DCs", "undefined"), 58 | method_label = method 59 | ))) 60 | 61 | adt_feature$cell_type_l2 = factor(adt_feature$cell_type_l2, levels = c("naive B", "memory B", 62 | "naive CD4", "memory CD4", "Treg", 63 | "naive CD8", "memory CD8", 64 | "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte", 65 | "CD16- NK", "CD16+ NK", 66 | "myeloid DC", "plasmacytoid DC", "undefined")) 67 | print(plot_umap(ans_umap[reindex, ], adt_feature[reindex, ], point_size = 0.3, parameter_list = list( 68 | target_feature = "cell_type_l2", 69 | color_design = c( 70 | "#B00000", "#FF3380", 71 | "#9ef954", "#3F918B", "#03500e", 72 | "#896191", "#350154", 73 | "#FF980A", "#A78300", "#DB6400", 74 | "#1000a0", "#226fa7", 75 | "#F781BF", "#7B0054", "grey"), 76 | #color_design = c(colorRampPalette(brewer.pal(8, "Dark2"))(14), "grey"), 77 | color_break = c( 78 | "naive B", "memory B", 79 | "naive CD4", "memory CD4", "Treg", 80 | "naive CD8", "memory CD8", 81 | "classical monocyte", "intermediate monocyte", "non-classical CD16+ monocyte", 82 | "CD16- NK", "CD16+ NK", 83 | "myeloid DC", "plasmacytoid DC", "undefined"), 84 | method_label = method 85 | ))) 86 | dev.off() 87 | 88 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ComparativeAnalysis/stain_quality.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(dplyr) 3 | library(RColorBrewer) 4 | library(httpgd) 5 | library(ggpubr) 6 | library(splines) 7 | 8 | run_name = "public13Dataset_CITEseq" 9 | 10 | master_path = "./" 11 | in_path = "./publicData_CITEseq/data/" 12 | out_path = paste0(master_path, "manuscript/results/", run_name) 13 | fcs_path = paste0(out_path, "/FCS/") 14 | fig_path = paste0(out_path, "/Figures/") 15 | 16 | marker_list = c("CD3", "CD4", "CD8", "CD14", "CD19", "CD25", "CD45RA", "CD56", "CD127") 17 | adt_data_full = readRDS(file = paste0(out_path, "/RDS/adt_data_full_RawCount_", run_name, ".rds")) 18 | adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_RawCount_", run_name, ".rds")) 19 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds")) 20 | 21 | cell_x_adt = arcsinh_transform(cell_x_adt = adt_data) 22 | cell_x_feature = adt_feature 23 | 24 | 25 | area_under_curve = function(densityObj, threshold, peak_num){ 26 | 27 | if(peak_num > 1){ 28 | index <- which(densityObj$x >= threshold) 29 | auc <- sum(densityObj$y[index] * diff(c(densityObj$x[index], max(densityObj$x)))) 30 | return(auc) 31 | 32 | }else{ 33 | index <- which(densityObj$x >= threshold) 34 | auc <- sum(densityObj$y[index] * diff(c(densityObj$x[index], max(densityObj$x)))) 35 | return(min(auc, 1-auc)) 36 | } 37 | 38 | } 39 | 40 | valley_deep_scaler = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, adjust = 3, peak_num = NA){ 41 | ind = which(cell_x_feature$sample == each_sample) 42 | # mean_diff = (peak_info[each_sample, ncol(peak_info)] - peak_info[each_sample, 1]) 43 | adt_tmp = cell_x_adt[ind, adt_marker_select] 44 | 45 | density_res = stats::density( 46 | adt_tmp, 47 | adjust = adjust , na.rm = TRUE 48 | ) 49 | x = density_res$x 50 | y = density_res$y 51 | if(peak_num >1){ 52 | x_peakR = which(abs(x - peak_info[each_sample, ncol(peak_info)]) == min(abs(x - peak_info[each_sample, ncol(peak_info)]))) 53 | y_peakR = y[x_peakR] 54 | }else{ 55 | y_peakR = 0 56 | } 57 | 58 | # x_peakL = which(abs(x - peak_info[each_sample, 1]) == min(abs(x - peak_info[each_sample, 1]))) 59 | if(peak_num <=2){ 60 | x_valley = which(abs(x - valley_info[each_sample, 1]) == min(abs(x - valley_info[each_sample, 1]))) 61 | }else{ 62 | x_valley = which(abs(x - valley_info[each_sample, ncol(valley_info)]) == min(abs(x - valley_info[each_sample, ncol(valley_info)]))) 63 | } 64 | # y_peakL = y[x_peakL] 65 | y_valley = y[x_valley] 66 | auc_scaler = area_under_curve(density_res, threshold = valley_info[each_sample, 1], peak_num = peak_num) 67 | deep_scaler = (1 + y_peakR - y_valley) * (1 + auc_scaler) 68 | return(deep_scaler) 69 | } 70 | 71 | two_peak_stain_quality = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = 2){ 72 | 73 | ind = which(cell_x_feature$sample == each_sample) 74 | mean_diff = (peak_info[each_sample, ncol(peak_info)] - peak_info[each_sample, 1]) 75 | adt_tmp = cell_x_adt[ind, adt_marker_select] 76 | within_peak_sd = sqrt((sum((adt_tmp[which(adt_tmp < valley_info[each_sample, 1])] - peak_info[each_sample, 1])^2) + sum((adt_tmp[which(adt_tmp > valley_info[each_sample, 1])] - peak_info[each_sample, ncol(peak_info)])^2))/length(adt_tmp)) 77 | 78 | deep_scaler = valley_deep_scaler(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num) 79 | # return(mean_diff/within_peak_sd) 80 | return((mean_diff*deep_scaler)/within_peak_sd) 81 | } 82 | 83 | multi_peak_stain_quality = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = 3){ 84 | 85 | ind = which(cell_x_feature$sample == each_sample) 86 | mean_diff = (peak_info[each_sample, ncol(peak_info)] - peak_info[each_sample, 1]) 87 | adt_tmp = cell_x_adt[ind, adt_marker_select] 88 | within_peak_var = 0 89 | for(col_index in 1:ncol(peak_info)){ 90 | if(col_index == 1){ 91 | ## first peak 92 | adt_tmp_select = which(adt_tmp < valley_info[each_sample, col_index]) 93 | within_peak_var = within_peak_var + sum((adt_tmp[adt_tmp_select] - peak_info[each_sample, col_index])^2) 94 | }else if(col_index == ncol(peak_info)){ 95 | ## last peak 96 | adt_tmp_select = which(adt_tmp > valley_info[each_sample, col_index-1]) 97 | within_peak_var = within_peak_var + sum((adt_tmp[adt_tmp_select] - peak_info[each_sample, col_index])^2) 98 | }else{ 99 | ## middle peak 100 | adt_tmp_select = which(adt_tmp < valley_info[each_sample, col_index] & adt_tmp > valley_info[each_sample, col_index-1]) 101 | within_peak_var = within_peak_var + sum((adt_tmp[adt_tmp_select] - peak_info[each_sample, col_index])^2) 102 | } 103 | } 104 | within_peak_sd = sqrt(within_peak_var/length(adt_tmp)) 105 | 106 | deep_scaler = valley_deep_scaler(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num) 107 | # return(mean_diff/within_peak_sd) 108 | return((mean_diff*deep_scaler)/within_peak_sd) 109 | } 110 | 111 | one_peak_stain_quality = function(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = 1){ 112 | 113 | ind = which(cell_x_feature$sample == each_sample) 114 | adt_tmp = cell_x_adt[ind, adt_marker_select] 115 | 116 | # median_pos = median(adt_tmp[which(adt_tmp > valley_info[each_sample, 1])]) 117 | if(length(peak_info[each_sample,]) == 1){ 118 | mean_diff = abs(valley_info[each_sample, 1] - peak_info[each_sample, 1]) 119 | }else if(is.na(peak_info[each_sample, 1])){ 120 | mean_diff = abs(valley_info[each_sample, 1] - peak_info[each_sample, ncol(peak_info)]) 121 | }else{ 122 | mean_diff = abs(valley_info[each_sample, 1] - peak_info[each_sample, 1]) 123 | } 124 | within_peak_sd = sqrt(sd(adt_tmp)) 125 | deep_scaler = valley_deep_scaler(cell_x_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num) 126 | # return(mean_diff/within_peak_sd) 127 | return((mean_diff*deep_scaler)/within_peak_sd) 128 | 129 | 130 | } 131 | 132 | ## use valley and peak 133 | peak_num_summary = c() 134 | peak_sep_summary = c() 135 | 136 | for(adt_marker_select in colnames(cell_x_adt)){ ## 137 | 138 | peak_valley_density = readRDS(paste0(out_path, "/RDS/peak_valley_locations_", adt_marker_select, "_ADTnorm_sample_manual_keepZero.rds")) 139 | 140 | peak_info = peak_valley_density$peak_landmark_list 141 | valley_info = peak_valley_density$valley_landmark_list 142 | 143 | for(each_sample in unique(adt_feature$sample)){ 144 | batch_info = unique(adt_feature$study_name[which(adt_feature$sample == each_sample)]) %>% as.vector 145 | each_peak_info = peak_info[each_sample, ] 146 | peak_num = sum(is.na(each_peak_info) == FALSE) 147 | peak_num_summary = data.frame( 148 | peak_num = peak_num, 149 | batch = batch_info, 150 | sample = each_sample, adt_marker = adt_marker_select) %>% 151 | rbind(peak_num_summary, .) 152 | if(peak_num == 1){ 153 | peak_sep_summary = data.frame( 154 | sep_power = one_peak_stain_quality(adt_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num), 155 | adt_marker = adt_marker_select, sample = each_sample, batch = batch_info, peak_num = paste0("# of peak: ", peak_num)) %>% rbind(peak_sep_summary, .) 156 | } 157 | if(peak_num == 2){ 158 | peak_sep_summary = data.frame( 159 | sep_power = two_peak_stain_quality(adt_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num), 160 | adt_marker = adt_marker_select, sample = each_sample, batch = batch_info, peak_num = paste0("# of peak: ", peak_num)) %>% rbind(peak_sep_summary, .) 161 | } 162 | if(peak_num > 2){ 163 | peak_sep_summary = data.frame( 164 | sep_power = multi_peak_stain_quality(adt_feature, cell_x_adt, adt_marker_select, each_sample, peak_info, valley_info, peak_num = peak_num), 165 | adt_marker = adt_marker_select, sample = each_sample, batch = batch_info, peak_num = paste0("# of peak: ", peak_num)) %>% rbind(peak_sep_summary, .) 166 | } 167 | } 168 | } 169 | 170 | file_list <- c( 171 | "10X_pbmc_10k", "10X_pbmc_1k", "10X_pbmc_5k_v3", "10X_pbmc_5k_nextgem", "10X_malt_10k", 172 | "stuart_2019", "granja_2019_pbmc", "granja_2019_bmmc", "hao_2020", 173 | "kotliarov_2020", "witkowski_2020", "triana_2021", "buus_2021_T" 174 | ) 175 | 176 | peak_sep_summary$batch = factor(peak_sep_summary$batch, levels = file_list) 177 | peak_sep_summary$sample = factor(peak_sep_summary$sample, levels = unique(cell_x_feature$sample)) 178 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_centered_scaled_transform.R: -------------------------------------------------------------------------------- 1 | ## Arcsinh centered scaled transformation 2 | require(dplyr) 3 | require(Seurat) 4 | 5 | arcsinh_centered_scaled_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){ 6 | ## parameters 7 | a = 1 8 | b = 1/5 9 | c = 0 10 | if(!is.null(parameter_list)){ 11 | if("a" %in% names(parameter_list)){ 12 | a = parameter_list[["a"]] 13 | } 14 | if("b" %in% names(parameter_list)){ 15 | b = parameter_list[["b"]] 16 | } 17 | if("c" %in% names(parameter_list)){ 18 | c = parameter_list[["c"]] 19 | } 20 | } 21 | ## transformation 22 | asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c) 23 | 24 | ## output 25 | out <- ((cell_x_adt %>% 26 | asinhTrans() %>% 27 | NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) %>% 28 | exp()) - 1) * 3 29 | return(out) 30 | } -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_centered_transform.R: -------------------------------------------------------------------------------- 1 | ## Arcsinh centered transformation 2 | require(dplyr) 3 | require(Seurat) 4 | 5 | arcsinh_centered_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){ 6 | ## parameters 7 | a = 1 8 | b = 1/5 9 | c = 0 10 | if(!is.null(parameter_list)){ 11 | if("a" %in% names(parameter_list)){ 12 | a = parameter_list[["a"]] 13 | } 14 | if("b" %in% names(parameter_list)){ 15 | b = parameter_list[["b"]] 16 | } 17 | if("c" %in% names(parameter_list)){ 18 | c = parameter_list[["c"]] 19 | } 20 | } 21 | ## transformation 22 | asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c) 23 | 24 | ## output 25 | out <- (cell_x_adt %>% 26 | asinhTrans() %>% 27 | NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) %>% 28 | exp()) - 1 29 | return(out) 30 | } -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_clr_transform.R: -------------------------------------------------------------------------------- 1 | ## Arcsinh CLR transformation 2 | require(dplyr) 3 | require(Seurat) 4 | 5 | arcsinh_clr_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){ 6 | ## parameters 7 | a = 1 8 | b = 1/5 9 | c = 0 10 | if(!is.null(parameter_list)){ 11 | if("a" %in% names(parameter_list)){ 12 | a = parameter_list[["a"]] 13 | } 14 | if("b" %in% names(parameter_list)){ 15 | b = parameter_list[["b"]] 16 | } 17 | if("c" %in% names(parameter_list)){ 18 | c = parameter_list[["c"]] 19 | } 20 | } 21 | ## transformation 22 | asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c) 23 | 24 | ## output 25 | out <- cell_x_adt %>% 26 | asinhTrans %>% 27 | NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) 28 | return(out) 29 | } -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/arcsinh_transform.R: -------------------------------------------------------------------------------- 1 | ## Arcsinh transformation 2 | arcsinh_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){ 3 | ## parameters 4 | a = 1 5 | b = 1/5 6 | c = 0 7 | if(!is.null(parameter_list)){ 8 | if("a" %in% names(parameter_list)){ 9 | a = parameter_list[["a"]] 10 | } 11 | if("b" %in% names(parameter_list)){ 12 | b = parameter_list[["b"]] 13 | } 14 | if("c" %in% names(parameter_list)){ 15 | c = parameter_list[["c"]] 16 | } 17 | } 18 | ## transformation 19 | asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c) 20 | 21 | ## output 22 | out <- asinhTrans(cell_x_adt) 23 | return(out) 24 | } -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/centered_arcsinh_transform.R: -------------------------------------------------------------------------------- 1 | ## centered arcsinh transformation 2 | require(dplyr) 3 | require(Seurat) 4 | 5 | centered_arcsinh_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){ 6 | 7 | ## parameters 8 | a = 1 9 | b = 1/5 10 | c = 0 11 | if(!is.null(parameter_list)){ 12 | if("a" %in% names(parameter_list)){ 13 | a = parameter_list[["a"]] 14 | } 15 | if("b" %in% names(parameter_list)){ 16 | b = parameter_list[["b"]] 17 | } 18 | if("c" %in% names(parameter_list)){ 19 | c = parameter_list[["c"]] 20 | } 21 | } 22 | ## transformation 23 | asinhTrans <- arcsinhTransform(transformationId = "ln-transformation", a = a, b = b, c = c) 24 | 25 | ## output 26 | out <- ((cell_x_adt %>% 27 | NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) %>% 28 | exp()) - 1) %>% 29 | asinhTrans() 30 | return(out) 31 | } -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/clr_transformation.R: -------------------------------------------------------------------------------- 1 | ## CLR transformation 2 | require(dplyr) 3 | require(Seurat) 4 | 5 | clr_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){ 6 | 7 | ## output 8 | out <- cell_x_adt %>% 9 | NormalizeData(normalization.method = "CLR", scale.factor = 1000000, margin = 1) 10 | return(out) 11 | } -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/cytofruv_transform.R: -------------------------------------------------------------------------------- 1 | ## CytofRUV normalization 2 | require(writexl) 3 | require(CATALYST) 4 | 5 | cytofruv_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) { 6 | 7 | ## read parameters 8 | tmp_path <- parameter_list$tmp_path 9 | fcs_path <- parameter_list$fcs_path 10 | condition <- parameter_list$condition 11 | patient_id <- parameter_list$patient_id 12 | batch <- parameter_list$batch 13 | clusters_nb <- parameter_list$clusters_nb 14 | start_adt_method <- parameter_list$start_adt_method 15 | 16 | ## change column name to ensure all end with number 17 | ori_col_name <- colnames(cell_x_adt) 18 | colnames(cell_x_adt) <- paste0(colnames(cell_x_adt), "0") 19 | 20 | for (each_sample in cell_x_feature$sample %>% levels()) { 21 | if (!dir.exists(paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV"))) { 22 | dir.create(paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV")) 23 | } 24 | fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/", each_sample, ".fcs") 25 | if (!file.exists(fcs_file_name)) { 26 | sample_ind <- which(cell_x_feature$sample == each_sample) 27 | fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix() 28 | fcs <- flowFrame(fcs_count) 29 | write.FCS(fcs, filename = fcs_file_name) 30 | } 31 | } 32 | md <- data.frame( 33 | file_name = paste0(levels(cell_x_feature$sample), ".fcs"), 34 | sample_id = levels(cell_x_feature$sample), 35 | condition = condition, 36 | patient_id = patient_id, 37 | batch = batch 38 | ) 39 | 40 | panel <- data.frame( 41 | fcs_colname = colnames(cell_x_adt), 42 | antigen = colnames(cell_x_adt), 43 | marker_class = "type" 44 | ) 45 | write_xlsx(x = md, path = paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/", "/Metadata.xlsx")) 46 | write_xlsx(x = panel, path = paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/", "/Panel.xlsx")) 47 | 48 | cytof_data <- load_data(paste0(fcs_path, "/", start_adt_method, "_for_CytofRUV/"), metadata_filename = "Metadata.xlsx", panel_filename = "Panel.xlsx") 49 | cytof_data$daf <- cluster_data( 50 | cytof_data$daf, 51 | seed = 12345, 52 | markers_to_use = cytof_data$daf %>% rownames(), 53 | clusters_nb = clusters_nb) 54 | cytof_data$lineage_markers <- cytof_data$daf %>% rownames() 55 | 56 | dir_name_norm_data <- "CytofRUV" 57 | raw_data <- data.frame( 58 | sample = cytof_data$daf$sample_id, 59 | cluster = cluster_ids(cytof_data$daf), t(SummarizedExperiment::assay(cytof_data$daf, "exprs")) 60 | ) 61 | rep_samples <- levels(cell_x_feature$sample) %>% list() # list(file_list) 62 | cluster_list_rep_samples <- list(seq(1, clusters_nb)) 63 | k_value <- 5 64 | seed <- 1234 65 | 66 | normalise_data( 67 | data = cytof_data, 68 | raw_data = raw_data, 69 | rep_samples = rep_samples, 70 | norm_clusters = cluster_list_rep_samples, 71 | k = k_value, 72 | num_clusters = clusters_nb, 73 | wd_data = fcs_path, 74 | dir_norm_data = dir_name_norm_data 75 | ) 76 | out <- c() 77 | for (each_sample in levels(cell_x_feature$sample)) { 78 | tmpNorm <- read.FCS(filename = paste0(fcs_path, dir_name_norm_data, "/Norm_RUVIII_k", k_value, "_", each_sample, ".fcs")) 79 | 80 | out <- exprs(tmpNorm) %>% 81 | round(3) %>% 82 | data.frame() %>% 83 | rbind(out, .) 84 | } 85 | colnames(out) <- ori_col_name 86 | return(out) 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/decontPro_transform.R: -------------------------------------------------------------------------------- 1 | ## decontPro transformation 2 | require(decontX) 3 | require(Seurat) 4 | require(dplyr) 5 | 6 | decontPro_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL){ 7 | 8 | if(!is.null(parameter_list)){ 9 | delta_sd = parameter_list$delta_sd, 10 | background_sd = parameter_list$background_sd 11 | }else{ 12 | ## give a default value 13 | delta_sd = 2e-5 14 | background_sd = 2e-6 15 | } 16 | adt_seurat <- CreateSeuratObject(t(cell_x_adt), assay = "ADT") 17 | adt_seurat <- NormalizeData(adt_seurat, normalization.method = "CLR", margin = 2) %>% 18 | ScaleData(assay = "ADT") %>% 19 | RunPCA(assay = "ADT", features = rownames(adt_seurat), npcs = min(ncol(cell_x_adt), 10), reduction.name = "pca_adt") 20 | 21 | adt_seurat = FindNeighbors(adt_seurat, dims = 1:min(ncol(cell_x_adt)-1, 10), assay = "ADT", reduction = "pca_adt") %>% 22 | FindClusters(resolution = 0.5) 23 | 24 | clusters <- as.integer(Idents(adt_seurat)) 25 | 26 | rm_cell_index = which(rowSums(cell_x_adt) == 0) 27 | out <- decontPro(t(cell_x_adt[-rm_cell_index, ]), 28 | clusters[-rm_cell_index], 29 | delta_sd = delta_sd, 30 | background_sd = background_sd) 31 | 32 | decontaminated_counts <- out$decontaminated_counts 33 | norm_counts <- matrix(0, nrow(decontaminated_counts), nrow(cell_x_adt)) 34 | norm_counts[, -rm_cell_index] <- decontaminated_counts 35 | 36 | return(t(norm_counts)) 37 | } 38 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/fdanorm_transform.R: -------------------------------------------------------------------------------- 1 | ## fdaNorm transformation 2 | require(flowStats) 3 | 4 | fdanorm_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) { 5 | 6 | ## read in parameters 7 | parameter_list_name <- names(parameter_list) 8 | if ("fcs_path" %in% parameter_list_name) { 9 | fcs_path <- parameter_list$fcs_path 10 | } else { 11 | return("fcs_path is not provided in the parameter_list!") 12 | } 13 | 14 | if ("start_adt_method" %in% parameter_list_name) { 15 | start_adt_method <- parameter_list$start_adt_method 16 | } else { 17 | return("start_adt_method is not provided in the paramter_list!") 18 | } 19 | 20 | ## write out fcs files 21 | for (each_sample in cell_x_feature$sample %>% levels()) { 22 | if (!dir.exists(paste0(fcs_path, "/", start_adt_method))) { 23 | dir.create(paste0(fcs_path, "/", start_adt_method)) 24 | } 25 | fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "/", each_sample, ".fcs") 26 | if (!file.exists(fcs_file_name)) { 27 | sample_ind <- which(cell_x_feature$sample == each_sample) 28 | fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix() 29 | fcs <- flowFrame(fcs_count) 30 | write.FCS(fcs, filename = fcs_file_name) 31 | } 32 | } 33 | 34 | ## fdanorm 35 | file_fcs <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs")) 36 | adt_fdanorm <- warpSet(file_fcs, colnames(cell_x_adt), monwrd = TRUE) %>% as.flowSet() 37 | 38 | out <- c() 39 | for (i in 1:length(adt_fdanorm)) { 40 | out <- adt_fdanorm[[i]] %>% 41 | exprs() %>% 42 | rbind(out, .) 43 | } 44 | 45 | return(out) 46 | } 47 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/gaussnorm_transform.R: -------------------------------------------------------------------------------- 1 | ## gaussNorm transformation 2 | require(flowStats) 3 | 4 | gaussnorm_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) { 5 | 6 | ## read in parameters 7 | parameter_list_name <- names(parameter_list) 8 | if ("fcs_path" %in% parameter_list_name) { 9 | fcs_path <- parameter_list$fcs_path 10 | } else { 11 | return("fcs_path is not provided in the parameter_list!") 12 | } 13 | 14 | if ("start_adt_method" %in% parameter_list_name) { 15 | start_adt_method <- parameter_list$start_adt_method 16 | } else { 17 | return("start_adt_method is not provided in the paramter_list!") 18 | } 19 | 20 | ## write out fcs files 21 | for (each_sample in cell_x_feature$sample %>% levels()) { 22 | if (!dir.exists(paste0(fcs_path, "/", start_adt_method))) { 23 | dir.create(paste0(fcs_path, "/", start_adt_method)) 24 | } 25 | fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "/", each_sample, ".fcs") 26 | if (!file.exists(fcs_file_name)) { 27 | sample_ind <- which(cell_x_feature$sample == each_sample) 28 | fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix() 29 | fcs <- flowFrame(fcs_count) 30 | write.FCS(fcs, filename = fcs_file_name) 31 | } 32 | } 33 | 34 | ## gaussnorm 35 | file_fcs <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs")) 36 | adt_gaussnorm <- gaussNorm(file_fcs, colnames(cell_x_adt))$flowset 37 | 38 | out <- c() 39 | for (i in 1:length(adt_gaussnorm)) { 40 | out <- adt_gaussnorm[[i]] %>% 41 | exprs() %>% 42 | rbind(out, .) 43 | } 44 | 45 | return(out) 46 | } 47 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/get_peak_mode.R: -------------------------------------------------------------------------------- 1 | ## get peak location 2 | require(flowStats) 3 | require(dplyr) 4 | 5 | get_peak_mode <- function(cell_x_adt = NULL, cell_x_feature = NULL, adt_marker_select = NULL, parameter_list = NULL) { 6 | 7 | ## get parameters 8 | user_define_marker <- NULL ## known marker that tend to have bimodal 9 | user_define_peak <- NULL ## known peak that tend to be positive if only one peak detected 10 | bwFac <- 1.2 11 | border <- 0.01 12 | peakNr <- NULL 13 | densities <- NULL 14 | n <- 201 15 | indices <- FALSE 16 | lower_peak_thres <- 0.001 17 | 18 | start_adt_method <- parameter_list$start_adt_method 19 | fcs_path <- parameter_list$fcs_path 20 | parameter_list_name <- names(parameter_list) 21 | 22 | if ("user_define_marker" %in% parameter_list_name) { 23 | user_define_marker <- parameter_list$user_define_marker ## known marker that tend to have bimodal 24 | } 25 | if ("user_define_peak" %in% parameter_list_name) { 26 | user_define_peak <- parameter_list$user_define_peak ## known peak that tend to be positive if only one peak detected 27 | } 28 | if ("bwFac_smallest" %in% parameter_list_name) { 29 | bwFac_smallest <- parameter_list$bwFac_smallest 30 | } else { 31 | bwFac_smallest <- 1.5 32 | } 33 | 34 | ## write out fcs files 35 | for (each_sample in cell_x_feature$sample %>% levels()) { 36 | if (!dir.exists(paste0(fcs_path, "/", start_adt_method))) { 37 | dir.create(paste0(fcs_path, "/", start_adt_method)) 38 | } 39 | fcs_file_name <- paste0(fcs_path, "/", start_adt_method, "/", each_sample, ".fcs") 40 | if (!file.exists(fcs_file_name)) { 41 | sample_ind <- which(cell_x_feature$sample == each_sample) 42 | fcs_count <- cell_x_adt[sample_ind, ] %>% as.matrix() 43 | fcs <- flowFrame(fcs_count) 44 | write.FCS(fcs, filename = fcs_file_name) 45 | } 46 | } 47 | 48 | dat <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs")) 49 | 50 | ranges <- fsApply(dat, range) 51 | from <- min(sapply(ranges, function(z) z[1, adt_marker_select] - diff(z[, adt_marker_select]) * 0.15), na.rm = TRUE) 52 | to <- max(sapply(ranges, function(z) z[2, adt_marker_select] + diff(z[, adt_marker_select]) * 0.15), na.rm = TRUE) 53 | 54 | peak_num <- 0 55 | peak_mode <- list() 56 | peak_region <- list() 57 | zero_prop_list <- list() 58 | 59 | for (sample_name in sampleNames(dat)) { 60 | try_out <- tryCatch(filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 2)), error = function(e) { 61 | c(1, 2) 62 | }) 63 | if (length(try_out) == 1) { 64 | fres1 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 2)) 65 | fres2 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 3)) 66 | fres3 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = 3.1)) 67 | 68 | cell_ind <- which(cell_x_feature$sample == gsub(".fcs", "", sample_name)) 69 | zero_prop <- sum(cell_x_adt[cell_ind, adt_marker_select] < 2) / length(cell_x_adt[cell_ind, adt_marker_select]) 70 | zero_prop_list[[gsub(".fcs", "", sample_name)]] <- zero_prop 71 | 72 | ## different bandwidth w.r.t the zero proportion. 73 | if (zero_prop > 0.5) { 74 | fres <- fres3 75 | } else if (zero_prop > 0.3) { 76 | fres <- fres2 77 | } else { 78 | fres <- fres1 79 | } 80 | 81 | if (adt_marker_select == "CD4") { 82 | fres <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = bwFac_smallest)) 83 | } 84 | peak_info <- flowStats:::curvPeaks( 85 | x = fres[[sample_name]], 86 | dat = exprs(dat[[sample_name]])[, adt_marker_select], 87 | borderQuant = border, 88 | from = from, 89 | to = to 90 | ) 91 | # peak_info$midpoints = peak_info$peaks[, "x"] 92 | 93 | ## User defined the marker that is known to usually have multiple peaks (n = 2) 94 | if (adt_marker_select %in% user_define_marker) { 95 | if (length(peak_info$midpoints) == 2) { 96 | ## 2 peaks, perfect! 97 | res <- peak_info$midpoints 98 | res_region <- peak_info$regions 99 | } else if (length(peak_info$midpoints) > 2) { 100 | ## more than 2 peaks, consider filtering out very low density peaks 101 | peak_ind <- peak_info$peaks[, "y"] > lower_peak_thres 102 | res <- peak_info$midpoints[peak_ind] 103 | res_region <- peak_info$regions[peak_ind, ] 104 | } else if (zero_prop > 0.3 && length(peak_info$midpoints) < 2) { 105 | ## less than 2 peaks and zero proportion is larger than 0.3, use finer bandwidth:fres1 instead of fres2 106 | peak_info <- flowStats:::curvPeaks( 107 | x = fres1[[sample_name]], 108 | dat = exprs(dat[[sample_name]])[, adt_marker_select], 109 | borderQuant = 0, 110 | from = from, 111 | to = to 112 | ) 113 | # peak_info$midpoints = peak_info$peaks[, "x"] 114 | if (length(peak_info$midpoints) <= 2) { 115 | ## peak number <=2 output results. 116 | res <- peak_info$midpoints 117 | res_region <- peak_info$regions 118 | } else if (length(peak_info$midpoints > 2)) { 119 | ## using new bandwidth, too many peaks, consider filtering out very low density peaks 120 | res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres] 121 | res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ] 122 | } 123 | } else if (zero_prop <= 0.3 && length(peak_info$midpoints) < 2) { 124 | ## less than 2 peaks and small zero proportion, user finer bandwidth: fres0 instead of fres1 125 | fres0 <- filter(dat[sample_name], curv1Filter(adt_marker_select, bwFac = bwFac_smallest)) ## 1.5 126 | peak_info <- flowStats:::curvPeaks( 127 | x = fres0[[sample_name]], 128 | dat = exprs(dat[[sample_name]])[, adt_marker_select], 129 | borderQuant = 0, 130 | from = from, 131 | to = to 132 | ) 133 | # peak_info$midpoints = peak_info$peaks[, "x"] 134 | if (length(peak_info$midpoints) <= 2) { 135 | res <- peak_info$midpoints 136 | res_region <- peak_info$regions 137 | } else if (length(peak_info$midpoints > 2)) { 138 | res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres] 139 | res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ] 140 | } 141 | } else { 142 | ## no other cases? 143 | res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres] 144 | res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ] 145 | } 146 | } else { 147 | ## not in user defined marker list, can have 1 peaks. Filter very low density peaks 148 | res <- peak_info$midpoints[peak_info$peaks[, "y"] > lower_peak_thres] 149 | res_region <- peak_info$regions[peak_info$peaks[, "y"] > lower_peak_thres, ] 150 | } 151 | 152 | 153 | ## all the multiple peaks are around 0 154 | if (length(res) > 1 && zero_prop <= 0.3 && (sum(res < 2) == length(res))) { 155 | ## use broader bandwidth to merge multiple peaks around 0. Use fres2 instead fres1 156 | peak_infoTmp <- flowStats:::curvPeaks( 157 | x = fres2[[sample_name]], 158 | dat = exprs(dat[[sample_name]])[, adt_marker_select], 159 | borderQuant = border, 160 | from = from, 161 | to = to 162 | ) 163 | # peak_infoTmp$midpoints = peak_infoTmp$peaks[, "x"] 164 | 165 | if (adt_marker_select %in% user_define_marker) { 166 | ## if user define this marker to have 2 peaks. 167 | if (length(peak_infoTmp$midpoints) == 2) { 168 | resTmp <- peak_infoTmp$midpoints 169 | res_regionTmp <- peak_infoTmp$regions 170 | } else { 171 | resTmp <- peak_infoTmp$midpoints[peak_infoTmp$peaks[, "y"] > lower_peak_thres] 172 | res_regionTmp <- peak_infoTmp$regions[peak_infoTmp$peaks[, "y"] > lower_peak_thres, ] 173 | } 174 | } else { 175 | resTmp <- peak_infoTmp$midpoints[peak_infoTmp$peaks[, "y"] > lower_peak_thres] 176 | res_regionTmp <- peak_infoTmp$regions[peak_infoTmp$peaks[, "y"] > lower_peak_thres, ] 177 | } 178 | 179 | indTmp <- which(!is.na(resTmp)) 180 | resTmp <- resTmp[indTmp] 181 | if (is.null(nrow(res_regionTmp))) { 182 | res_regionTmp <- res_regionTmp %>% 183 | as.matrix() %>% 184 | t() 185 | } 186 | res_regionTmp <- res_regionTmp[indTmp, ] 187 | if (length(resTmp) > 1 && (sum(resTmp < 2) < length(resTmp))) { 188 | res <- resTmp 189 | res_region <- res_regionTmp 190 | } 191 | } 192 | 193 | ## remove small negative peak around 0 194 | if (length(res) > 1 && zero_prop < 0.3 && (sum(res < 2) < length(res))) { 195 | if (peak_info$peaks[1, "x"] < 0.9 && peak_info$peaks[1, "y"] < 1 && peak_info$peaks[2, "x"] > 2 && peak_info$peaks[2, "y"] / peak_info$peaks[1, "y"] > 5) { 196 | res <- res[-1] 197 | res_region <- res_region[-1, ] 198 | } 199 | } 200 | 201 | ## all the peaks around 2 and zero proportion very large. Highly likely to have only one peak. 202 | if (length(res) > 1 && zero_prop > 0.5 && (sum(res < 2) == length(res))) { 203 | res <- peak_info$midpoints[which(peak_info$peaks[, "y"] == max(peak_info$peak[, "y"]))] 204 | res_region <- peak_info$regions[which(peak_info$peaks[, "y"] == max(peak_info$peak[, "y"])), ] 205 | } 206 | 207 | 208 | 209 | peak_num <- max(peak_num, length(res)) 210 | peak_mode[[sample_name]] <- res 211 | peak_region[[sample_name]] <- matrix(NA, ncol = 2, nrow = length(res)) 212 | peak_region[[sample_name]][1:length(res), ] <- res_region 213 | } else { 214 | ## only one value for this marker 215 | peak_mode[[sample_name]] <- NA 216 | peak_region[[sample_name]] <- matrix(NA, ncol = 2, nrow = 1) 217 | print(paste0(i, "-Single Value!")) 218 | } 219 | } 220 | 221 | landmark <- matrix(NA, ncol = peak_num, nrow = length(peak_mode)) 222 | landmarkRegion <- list() 223 | for (i in 1:peak_num) { 224 | landmarkRegion[[i]] <- matrix(NA, ncol = 2, nrow = length(peak_mode)) 225 | } 226 | for (i in 1:length(peak_mode)) { 227 | if (!is.na(peak_mode[[i]][1])) { 228 | peak_modeNum <- length(peak_mode[[i]]) 229 | if (peak_modeNum == 1) { 230 | if (paste0(names(peak_mode)[i], ">", adt_marker_select) %in% user_define_peak) { 231 | landmark[i, min(2, peak_num)] <- peak_mode[[i]] 232 | landmarkRegion[[min(2, peak_num)]][i, ] <- peak_region[[i]] 233 | } else { 234 | landmark[i, 1] <- peak_mode[[i]] 235 | landmarkRegion[[1]][i, ] <- peak_region[[i]] 236 | } 237 | } else if (peak_modeNum == 2) { 238 | landmark[i, c(1, max(2, peak_num))] <- peak_mode[[i]] 239 | 240 | landmarkRegion[[1]][i, ] <- peak_region[[i]][1, ] 241 | landmarkRegion[[max(2, peak_num)]][i, ] <- peak_region[[i]][2, ] 242 | } else if (peak_modeNum == 3) { 243 | landmark[i, c(1, 2, max(3, peak_num))] <- peak_mode[[i]] 244 | landmarkRegion[[1]][i, ] <- peak_region[[i]][1, ] 245 | landmarkRegion[[2]][i, ] <- peak_region[[i]][2, ] 246 | landmarkRegion[[max(3, peak_num)]][i, ] <- peak_region[[i]][3, ] 247 | } else { 248 | landmark[i, 1:peak_modeNum] <- peak_mode[[i]] 249 | for (k in 1:peak_modeNum) { 250 | landmarkRegion[[k]][i, ] <- peak_region[[i]][k, ] 251 | } 252 | } 253 | } 254 | } 255 | 256 | ## if all the peaks are within 1 - highly likely that there is only one negative peak 257 | if (max(landmark[!is.na(landmark)]) < 2) { 258 | landmark_new <- matrix(NA, ncol = 1, nrow = nrow(landmark)) 259 | landmarkAllMedian <- median(landmark[!is.na(landmark)]) 260 | for (i in 1:nrow(landmark)) { 261 | landmark_nonNA <- landmark[i, !is.na(landmark[i, ])] 262 | if (length(landmark_nonNA) > 0) { 263 | landmark_new[i, ] <- landmark[i, which.min(abs(landmark_nonNA - landmarkAllMedian))] 264 | } else { 265 | landmark_new[i, ] <- NA 266 | } 267 | } 268 | landmark <- landmark_new 269 | } 270 | 271 | rownames(landmark) <- levels(cell_x_feature$sample) 272 | return(list(peak_landmark_list = landmark, zero_prop_list = zero_prop_list)) 273 | } 274 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/get_valley.R: -------------------------------------------------------------------------------- 1 | ## get valley location 2 | require(dplyr) 3 | 4 | get_valley_location <- function(cell_x_adt = NULL, cell_x_feature = NULL, adt_marker_select = NULL, parameter_list = NULL, ....) { 5 | adjust <- parameter_list$adjust 6 | peak_landmark_list <- parameter_list$peak_mode_res$peak_landmark_list 7 | zero_prop_list <- parameter_list$peak_mode_res$zero_prop_list 8 | user_define_peak <- parameter_list$user_define_peak 9 | 10 | parameter_list_name <- names(parameter_list) 11 | 12 | if ("min_fc" %in% parameter_list_name) { 13 | min_fc <- parameter_list$min_fc 14 | } 15 | valley_location_list <- matrix(NA, nrow = nrow(peak_landmark_list), ncol = max(1, ncol(peak_landmark_list) - 1)) 16 | rownames(valley_location_list) <- cell_x_feature$sample %>% levels() 17 | for (sample_name in cell_x_feature$sample %>% levels()) { 18 | peak_landmark <- peak_landmark_list[sample_name, ] 19 | zero_prop <- zero_prop_list[[sample_name]] 20 | 21 | ## check if user define single peak to be positive peak 22 | if (paste0(sample_name, ">", adt_marker_select) %in% user_define_peak) { 23 | lower_valley <- TRUE 24 | } else { 25 | lower_valley <- FALSE 26 | } 27 | 28 | density_res <- density( 29 | cell_x_adt[which(cell_x_feature$sample == sample_name), adt_marker_select], 30 | adjust = adjust 31 | ) 32 | x <- density_res$x 33 | y <- density_res$y 34 | 35 | sign_diff <- sign(diff(y)) 36 | diff_sign_diff <- diff(sign_diff) 37 | # peak <- which(diff_sign_diff == -2) + 1 38 | valley <- which(diff_sign_diff == 2) + 1 39 | 40 | # yp <- y[peak] 41 | x_valley <- x[valley] 42 | y_valley <- y[valley] 43 | # stdy <- std(yp) 44 | real_peak <- peak_landmark[!is.na(peak_landmark)] # peak 45 | np <- length(real_peak) 46 | 47 | 48 | if (np > 1) { 49 | real_valley <- c() 50 | for (i in 1:(np - 1)) { 51 | tmp_valley <- x_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])] 52 | real_valley <- c(real_valley, tmp_valley[which.min(y_valley[(x_valley > real_peak[i]) & (x_valley < real_peak[i + 1])])]) 53 | } 54 | } else { 55 | if (lower_valley == FALSE) { 56 | real_valley <- x[which(y < max(y) / min_fc)[which(y < max(y) / min_fc) > max(which(y == max(y)), which(x > real_peak[1]) %>% min())] %>% min()] # x_valley[x_valley > real_peak[1]][1] 57 | if (zero_prop > 0.8) { 58 | real_valley <- max(2, real_valley) 59 | } 60 | } else { 61 | real_valley <- x[which(y < max(y) / min_fc)[which(y < max(y) / min_fc) < min(which(y == max(y)), which(x < real_peak[1]) %>% max())] %>% max()] 62 | } 63 | } 64 | ## check if no valley is detected due to shoulder peak 65 | if (length(real_valley) == 0) { 66 | real_valley <- (real_peak[1] + real_peak[2]) / 2 67 | } 68 | 69 | valley_location_list[sample_name, 1:length(real_valley)] <- real_valley 70 | } 71 | 72 | 73 | ## rownames(valley_location_list) <- cell_x_feature$sample %>% levels() 74 | return(valley_location_list) 75 | } 76 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/harmony_transform.R: -------------------------------------------------------------------------------- 1 | ## Harmony remove batch effect 2 | require(harmony) 3 | require(dplyr) 4 | 5 | harmony_transform <- function(cell_x_adt = NULL, cell_x_feature = NULL, parameter_list = NULL) { 6 | out <- HarmonyMatrix( 7 | data_mat = cell_x_adt, 8 | meta_data = cell_x_feature$batch, 9 | do_pca = FALSE 10 | ) 11 | return(out) 12 | } 13 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/landmark_fill_na.R: -------------------------------------------------------------------------------- 1 | ## merge peak and valley and fill in NA 2 | 3 | landmark_fill_na <- function(peak_landmark_list = NULL, valley_landmark_list = NULL, landmark_align_type = NULL){ 4 | if(!landmark_align_type %in% c("negPeak", "negPeak_valley", "negPeak_valley_posPeak", "valley")){ 5 | return("Please provide one of the landmark_align_type from: negPeak, negPeak_valley, negPeak_valley_posPeak, valley") 6 | } 7 | if(landmark_align_type == "valley"){ 8 | ## only use the first valley to align 9 | landmark_matrix <- valley_landmark_list[, 1] %>% t %>% t 10 | landmark_matrix[is.na(landmark_matrix), 1] <- 2 ## fill in na by background level 2 after arcinsh_b5_a1 transformation 11 | }else{ 12 | ## involve negative peaks in landmark alignment 13 | if(ncol(peak_landmark_list) == 1){ 14 | ## only have negative peaks 15 | landmark_matrix <- cbind( 16 | peak_landmark_list[, 1], 17 | valley_landmark_list[, 1] 18 | ) 19 | ## fill in na 20 | landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- median(landmark_matrix[!is.na(landmark_matrix[, 1]), 1]) 21 | landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- 2 22 | }else{ 23 | ## have positive peaks 24 | landmark_matrix <- cbind( 25 | peak_landmark_list[, 1], 26 | valley_landmark_list[, 1], 27 | peak_landmark_list[, ncol(peak_landmark_list)] 28 | ) 29 | 30 | ## fill in na 31 | ## fill in valley first 32 | landmark_matrix[is.na(landmark_matrix[, 2]), 2] <- 2 33 | ## due to user_define_peak where unique peak is deemed positive. 34 | ## fill in either 0.5 or half of the first valley 35 | landmark_matrix[is.na(landmark_matrix[, 1]), 1] <- landmark_matrix[is.na(landmark_matrix[, 1]), 2]/2 #0.5 36 | ## fill in the last positive peak: add on the valley using the median distance from the last positive peak to the first valley 37 | landmark_matrix[is.na(landmark_matrix[, 3]), 3] <- landmark_matrix[is.na(landmark_matrix[, 3]), 2] + median(landmark_matrix[!is.na(landmark_matrix[, 3]), 3] - landmark_matrix[!is.na(landmark_matrix[, 3]), 2]) 38 | 39 | } 40 | } 41 | 42 | ## only provide negative peak location 43 | if (landmark_align_type == "negPeak") { 44 | return(landmark_matrix[, 1] %>% t %>% t) 45 | } 46 | 47 | ## only provide negative peak and valley location 48 | if (landmark_align_type == "negPeak_valley") { 49 | return(landmark_matrix[, 1:2]) 50 | } 51 | 52 | ## provide negative peak, first valley and last postiive peak location 53 | return(landmark_matrix) 54 | } 55 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/sciPENN_PublicDataset.py: -------------------------------------------------------------------------------- 1 | ## This the following python 2 | ## /app/software/Python/3.8.2-GCCcore-9.3.0/bin/python 3 | ## 4 | import numpy as np 5 | from matplotlib import pyplot 6 | import os 7 | from copy import deepcopy 8 | 9 | from time import time 10 | 11 | from math import ceil 12 | from scipy.stats import spearmanr, gamma, poisson 13 | 14 | from anndata import AnnData, read_h5ad 15 | import scanpy as sc 16 | from scanpy import read 17 | import pandas as pd 18 | 19 | from torch.utils.data import DataLoader, TensorDataset 20 | from torch import tensor 21 | from torch.cuda import is_available 22 | 23 | from sciPENN.sciPENN_API import sciPENN_API 24 | import anndata 25 | import json 26 | import psutil 27 | 28 | ## original tutorial 29 | # """Read in the data""" 30 | # data_path = './manuscript/scripts/sciPENN_tutorial/data/' 31 | 32 | # adata_gene = sc.read(data_path + "pbmc_gene.h5ad") 33 | # adata_protein = sc.read(data_path + "pbmc_protein.h5ad") 34 | 35 | # doublet_bool = (adata_gene.obs['celltype.l2'] != 'Doublet') 36 | 37 | # adata_gene = adata_gene[doublet_bool].copy() 38 | # adata_protein = adata_protein[doublet_bool].copy() 39 | 40 | 41 | # """Create training and test""" 42 | 43 | # train_bool = [x in ['P1', 'P3', 'P4', 'P7'] for x in adata_gene.obs['donor']] 44 | 45 | # adata_gene_set1 = adata_gene[train_bool].copy() 46 | # adata_protein_set1 = adata_protein[train_bool].copy() 47 | # adata_gene_set2 = adata_gene[np.invert(train_bool)].copy() 48 | # adata_protein_set2 = adata_protein[np.invert(train_bool)].copy() 49 | 50 | # common_proteins = adata_protein_set1.var.index 51 | # set1only_proteins = np.random.choice(common_proteins, len(common_proteins)//3, False) 52 | # common_proteins = np.setdiff1d(common_proteins, set1only_proteins) 53 | # set2only_proteins = np.random.choice(common_proteins, len(common_proteins)//2, False) 54 | 55 | # set1only_proteins = set(set1only_proteins) 56 | # set2only_proteins = set(set2only_proteins) 57 | 58 | # keep_set1 = [x not in set2only_proteins for x in adata_protein_set1.var.index] 59 | # keep_set2 = [x not in set1only_proteins for x in adata_protein_set1.var.index] 60 | 61 | # adata_protein_set1 = adata_protein_set1[:, keep_set1].copy() 62 | # adata_protein_set2 = adata_protein_set2[:, keep_set2].copy() 63 | 64 | 65 | # sciPENN = sciPENN_API(gene_trainsets = [adata_gene_set1, adata_gene_set2], 66 | # protein_trainsets = [adata_protein_set1, adata_protein_set2], 67 | # train_batchkeys = ['donor', 'donor']) 68 | 69 | 70 | # sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, 71 | # decay_step = 0.1, lr = 10**(-3), weights_dir = "pbmc_to_pbmcINTEGRATE", load = True) 72 | 73 | 74 | # imputed_test = sciPENN.impute() 75 | 76 | 77 | # imputed_test.X 78 | 79 | # imputed_test.obs['Dataset'] 80 | # imputed_test.obs['batch'] 81 | 82 | 83 | 84 | # proteins = imputed_test.var.index 85 | 86 | # proteins1 = proteins[imputed_test.var['Dataset 1']] #get proteins sequenced in Dataset 1 87 | # proteins2 = proteins[imputed_test.var['Dataset 2']] #get proteins sequenced in Dataset 1 88 | 89 | 90 | # ds1_cells = imputed_test.obs['Dataset'] == 'Dataset 1' 91 | # ds2_cells = imputed_test.obs['Dataset'] == 'Dataset 2' 92 | 93 | # ds1_pred, ds1_seq = np.invert(imputed_test.var['Dataset 1']), imputed_test.var['Dataset 1'] 94 | # ds2_pred, ds2_seq = np.invert(imputed_test.var['Dataset 2']), imputed_test.var['Dataset 2'] 95 | 96 | # pred1 = imputed_test[ds1_cells, ds1_pred] #imputed protein array in dataset 1 97 | # sequenced1 = imputed_test[ds1_cells, ds1_seq] #sequenced protein array in dataset 1 98 | # pred2 = imputed_test[ds2_cells, ds2_pred] #imputed protein array in dataset 2 99 | # sequenced2 = imputed_test[ds2_cells, ds2_seq] #sequenced protein array in dataset 2 100 | 101 | # embedding = sciPENN.embed() 102 | # embedding.obs['batch'] 103 | 104 | # q10_pred = imputed_test[ds1_cells, ds1_pred].layers['q10'] #get q10 for imputed proteins from reference 1 105 | # q10_truth = imputed_test[ds1_cells, ds1_seq].layers['q10'] #get q10 for sequenced proteins from reference 1, not useful 106 | 107 | 108 | 109 | # ## rewrite according to my scenario 110 | # """Read in the data""" 111 | # data_path = './manuscript/scripts/sciPENN_tutorial/data/' 112 | # adata_gene = sc.read(data_path + "pbmc_gene.h5ad") 113 | # adata_protein = sc.read(data_path + "pbmc_protein.h5ad") 114 | # doublet_bool = (adata_gene.obs['celltype.l2'] != 'Doublet') 115 | # adata_gene = adata_gene[doublet_bool].copy() 116 | # adata_protein = adata_protein[doublet_bool].copy() 117 | 118 | # sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 119 | # protein_trainsets = [adata_protein], 120 | # train_batchkeys = ['donor']) 121 | 122 | 123 | # sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, 124 | # decay_step = 0.1, lr = 10**(-3), weights_dir = "pbmc_to_pbmcINTEGRATE", load = True) 125 | 126 | # sciPENN.train(n_epochs = 10000, ES_max = 12, decay_max = 6, 127 | # decay_step = 0.1, lr = 10**(-3), weights_dir = "pbmc_to_pbmcINTEGRATE") 128 | 129 | # embedding = sciPENN.embed() 130 | 131 | # embedding.obs['batch'] 132 | 133 | # imputed_test = sciPENN.impute() 134 | 135 | ## define memory usage 136 | def log_memory_usage(file): 137 | memory = psutil.virtual_memory() 138 | file.write(f'Total memory: {memory.total / (1024.0 ** 3)} GB\n') 139 | file.write(f'Used memory: {memory.used / (1024.0 ** 3)} GB\n') 140 | file.write(f'Available memory: {memory.available / (1024.0 ** 3)} GB\n') 141 | 142 | 143 | ## Public Data 13datasets 144 | out_path = './manuscript/results/public13Dataset_CITEseq/CSV/' 145 | 146 | 147 | run_name = 'public13Dataset_CITEseq' 148 | adts = pd.read_csv('./results/publicData_CITEseq/CSV/adt_data_common_RawCount_'+ run_name + '.csv', index_col=0) 149 | gex = pd.read_csv('./results/publicData_CITEseq/CSV/rna_data_common_RawCount_'+ run_name + '.csv', index_col=0) 150 | obs = pd.read_csv('./results/publicData_CITEseq/CSV/adt_feature_common_RawCount_'+ run_name + '.csv', index_col=0) 151 | # obs.reset_index(drop=True, inplace=True) 152 | # gex.reset_index(drop=True, inplace=True) 153 | # adts.reset_index(drop=True, inplace=True) 154 | 155 | adts.index = obs.index 156 | gex.index = obs.index 157 | 158 | adata_gene = anndata.AnnData(gex, obs) 159 | adata_gene.index = adata_gene.obs_names 160 | adata_protein = anndata.AnnData(adts, obs) 161 | adata_protein.index = adata_protein.obs_names 162 | 163 | batch_key = 'study_name' 164 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f: 165 | log_memory_usage(f) 166 | start = time() 167 | 168 | sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 169 | protein_trainsets = [adata_protein], 170 | train_batchkeys = [batch_key], use_gpu = True, min_cells = False, min_genes = False) 171 | log_memory_usage(f) 172 | sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_study_name_GPU", load = True) 173 | embedding = sciPENN.embed() 174 | imputed_test = sciPENN.impute() 175 | log_memory_usage(f) 176 | end = time() 177 | print(f'Seconds to complete: {round(end - start,0)}') 178 | train_time = round(end - start, 0) 179 | 180 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f: 181 | json.dump(train_time, f) 182 | 183 | imputed_res = pd.DataFrame(imputed_test.X, index=imputed_test.obs_names, columns=imputed_test.var_names) 184 | imputed_res.to_csv(out_path + 'sciPENN_Imputed_Protein_' + run_name + '_' + batch_key + '.csv') 185 | embedding_res = pd.DataFrame(embedding.X, index=embedding.obs_names, columns=embedding.var_names) 186 | embedding_res.to_csv(out_path + 'sciPENN_Embedding_' + run_name + '_' + batch_key + '.csv') 187 | embedding_obs_res = pd.DataFrame(embedding.obs) 188 | embedding_obs_res.to_csv(out_path + 'sciPENN_Embedding_DataFeature_' + run_name + '_' + batch_key + '.csv') 189 | 190 | batch_key = 'sample' 191 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f: 192 | log_memory_usage(f) 193 | start = time() 194 | sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 195 | protein_trainsets = [adata_protein], 196 | train_batchkeys = [batch_key], use_gpu = False, min_cells = False, min_genes = False) 197 | log_memory_usage(f) 198 | sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_sample_CPU", load = True) 199 | embedding = sciPENN.embed() 200 | imputed_test = sciPENN.impute() 201 | log_memory_usage(f) 202 | end = time() 203 | print(f'Seconds to complete: {round(end - start,0)}') 204 | train_time = round(end - start, 0) 205 | 206 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f: 207 | json.dump(train_time, f) 208 | 209 | 210 | batch_key = 'sample' 211 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f: 212 | log_memory_usage(f) 213 | start = time() 214 | sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 215 | protein_trainsets = [adata_protein], 216 | train_batchkeys = [batch_key], use_gpu = True, min_cells = False, min_genes = False) 217 | log_memory_usage(f) 218 | sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_sample_GPU", load = True) 219 | embedding = sciPENN.embed() 220 | imputed_test = sciPENN.impute() 221 | log_memory_usage(f) 222 | end = time() 223 | print(f'Seconds to complete: {round(end - start,0)}') 224 | train_time = round(end - start, 0) 225 | 226 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f: 227 | json.dump(train_time, f) 228 | 229 | imputed_res = pd.DataFrame(imputed_test.X, index=imputed_test.obs_names, columns=imputed_test.var_names) 230 | imputed_res.to_csv(out_path + 'sciPENN_Imputed_Protein_' + run_name + '_' + batch_key + '.csv') 231 | embedding_res = pd.DataFrame(embedding.X, index=embedding.obs_names, columns=embedding.var_names) 232 | embedding_res.to_csv(out_path + 'sciPENN_Embedding_' + run_name + '_' + batch_key + '.csv') 233 | embedding_obs_res = pd.DataFrame(embedding.obs) 234 | embedding_obs_res.to_csv(out_path + 'sciPENN_Embedding_DataFeature_' + run_name + '_' + batch_key + '.csv') 235 | 236 | 237 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f: 238 | log_memory_usage(f) 239 | start = time.time() 240 | 241 | sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 242 | protein_trainsets = [adata_protein], 243 | train_batchkeys = [batch_key], use_gpu = False, min_cells = False, min_genes = False) 244 | sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public13Dataset_CITEseq_sample_CPU", load = True) 245 | embedding = sciPENN.embed() 246 | imputed_test = sciPENN.impute() 247 | end = time.time() 248 | print(f'Seconds to complete: {round(end - start,0)}') 249 | train_time = round(end - start, 0) 250 | 251 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_CPU.csv', 'w') as f: 252 | json.dump(train_time, f) 253 | 254 | 255 | 256 | ## T cell dataset 257 | run_name = 'public13Dataset_CITEseq_Tcelllargeprop' #'public13Dataset_CITEseq_Tcelllargeprop_cd8T' 258 | out_path = './manuscript/results/' + run_name + '/CSV/' 259 | 260 | adts = pd.read_parquet('./manuscript/results/' + run_name + '/CSV/adt_data_common_RawCount_'+ run_name + '.parquet') 261 | gex = pd.read_parquet('./manuscript/results/' + run_name + '/CSV/rna_data_common_RawCount_'+ run_name + '.parquet') 262 | obs = pd.read_parquet('./manuscript/results/' + run_name + '/CSV/adt_feature_common_RawCount_'+ run_name + '.parquet') 263 | 264 | adts.index = obs.index 265 | gex.index = obs.index 266 | 267 | adata_gene = anndata.AnnData(gex, obs) 268 | adata_gene.index = adata_gene.obs_names 269 | adata_protein = anndata.AnnData(adts, obs) 270 | adata_protein.index = adata_protein.obs_names 271 | 272 | batch_key = 'sample' # 'sample' #'study_name' # 273 | with open(out_path + 'sciPENN_memory_usage_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f: 274 | log_memory_usage(f) 275 | start = time() 276 | sciPENN = sciPENN_API(gene_trainsets = [adata_gene], 277 | protein_trainsets = [adata_protein], 278 | train_batchkeys = [batch_key], use_gpu = True, min_cells = False, min_genes = False) 279 | log_memory_usage(f) 280 | sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 10000, ES_max = 12, decay_max = 6, decay_step = 0.1, lr = 10**(-3), weights_dir = "sciPENN_public12Dataset_CITEseq_notriana_study_name", load = True) 281 | log_memory_usage(f) 282 | embedding = sciPENN.embed() 283 | imputed_test = sciPENN.impute() 284 | log_memory_usage(f) 285 | end = time() 286 | print(f'Seconds to complete: {round(end - start,0)}') 287 | train_time = round(end - start, 0) 288 | 289 | with open(out_path + 'sciPENN_runtime_' + run_name + '_' + batch_key + '_GPU.csv', 'w') as f: 290 | json.dump(train_time, f) 291 | 292 | 293 | imputed_res = pd.DataFrame(imputed_test.X, index=imputed_test.obs_names, columns=imputed_test.var_names) 294 | imputed_res.to_csv(out_path + 'sciPENN_Imputed_Protein_' + run_name + '_' + batch_key + '.csv') 295 | 296 | embedding_res = pd.DataFrame(embedding.X, index=embedding.obs_names, columns=embedding.var_names) 297 | embedding_res.to_csv(out_path + 'sciPENN_Embedding_' + run_name + '_' + batch_key + '.csv') 298 | 299 | embedding_obs_res = pd.DataFrame(embedding.obs) 300 | embedding_obs_res.to_csv(out_path + 'sciPENN_Embedding_DataFeature_' + run_name + '_' + batch_key + '.csv') 301 | -------------------------------------------------------------------------------- /manuscript/BenchmarkingAnalysis/ImplementMethods/warpset_transform.R: -------------------------------------------------------------------------------- 1 | ## warpset peak valley alignment 2 | require(dplyr) 3 | require(flowStats) 4 | warpset_transform <- function(landmark_matrix = NULL, cell_x_feature = NULL, adt_marker_select = NULL, parameter_list = NULL) { 5 | ## get parameters 6 | grouping <- NULL 7 | monwrd <- TRUE 8 | subsample <- NULL 9 | peakNr <- NULL 10 | clipRange <- 0.01 11 | nbreaks <- 11 12 | bwFac <- 2 13 | warpFuns <- FALSE 14 | chunksinze <- 10 15 | newNcFile <- NULL 16 | z <- NULL 17 | nb <- 1001 18 | eps <- .Machine$double.eps 19 | target <- NULL 20 | 21 | 22 | start_adt_method <- parameter_list$start_adt_method 23 | fcs_path <- parameter_list$fcs_path 24 | 25 | ## read in original adt expression matrix 26 | exp_data <- read.ncdfFlowSet(paste0(fcs_path, "/", start_adt_method, "/", levels(cell_x_feature$sample), ".fcs")) %>% as.flowSet() 27 | 28 | ## check if target marker is available 29 | adt_marker_flag <- adt_marker_select %in% colnames(exp_data) 30 | if (!all(adt_marker_flag)) { 31 | stop( 32 | "Invalid stain(s) not matching the flowSet:\n ", 33 | paste(adt_marker_select[!adt_marker_flag], collapse = ", ") 34 | ) 35 | } 36 | 37 | ## expression range and sample name list 38 | ranges <- fsApply(exp_data, range) 39 | samples <- sampleNames(exp_data) 40 | 41 | ## set up fda parameters 42 | extend <- 0.15 43 | from <- min(sapply(ranges, function(z) z[1, adt_marker_select] - diff(z[, adt_marker_select]) * extend), na.rm = TRUE) 44 | to <- max(sapply(ranges, function(z) z[2, adt_marker_select] + diff(z[, adt_marker_select]) * extend), na.rm = TRUE) 45 | wbasis <- create.bspline.basis( 46 | rangeval = c(from, to), 47 | norder = 4, breaks = seq(from, to, len = nbreaks) 48 | ) 49 | Wfd0 <- fd(matrix(0, wbasis$nbasis, 1), wbasis) 50 | WfdPar <- fdPar(Wfd0, 1, 1e-4) 51 | 52 | density_y <- t(fsApply(exp_data, function(exp_data_each) { # t(fsApply(thisX, function(z){ 53 | exp_data_each_range <- range(exp_data_each)[, adt_marker_select] 54 | exp_data_each <- exprs(exp_data_each) 55 | exp_data_each <- exp_data_each[exp_data_each[, adt_marker_select] > exp_data_each_range[1] + eps & exp_data_each[, adt_marker_select] < exp_data_each_range[2] - eps, adt_marker_select] 56 | density(exp_data_each, from = from, to = to, n = nb, na.rm = TRUE)$y 57 | })) 58 | arg_vals <- seq(from, to, len = nb) 59 | fdobj <- Data2fd(arg_vals, density_y, wbasis) 60 | 61 | if (ncol(landmark_matrix) == 1) { ## only one peak: offset 62 | if (is.null(target)) { 63 | offsets <- landmark_matrix - median(landmark_matrix) 64 | names(offsets) <- sampleNames(exp_data) 65 | } else { 66 | offsets <- landmark_matrix - landmark_matrix[sampleNames(exp_data) %in% target] 67 | names(offsets) <- sampleNames(exp_data) 68 | } 69 | funs <- funsBack <- vector("list", length(landmark_matrix)) 70 | names(funs) <- samples 71 | names(funsBack) <- samples 72 | for (j in seq_along(funs)) { 73 | funs[[samples[[j]]]] <- function(x) x - z 74 | e1 <- new.env(hash = TRUE) 75 | e1$z <- offsets[samples[[j]]] 76 | environment(funs[[samples[[j]]]]) <- e1 77 | funsBack[[samples[[j]]]] <- function(x) x + z 78 | e2 <- new.env(hash = TRUE) 79 | e2$z <- offsets[samples[[j]]] 80 | environment(funsBack[[samples[[j]]]]) <- e2 81 | } 82 | } else { ## multiple peaks: warping 83 | if (is.null(target)) { 84 | capture.output(regDens <- landmarkreg(fdobj, landmark_matrix, WfdPar = WfdPar, monwrd = monwrd)) 85 | } else { 86 | capture.output(regDens <- landmarkreg(fdobj, landmark_matrix, x0marks = apply(landmark_matrix, 2, jitter)[rownames(landmark_matrix) %in% target, ], WfdPar = WfdPar, monwrd = monwrd)) 87 | } 88 | warpfdobj <- regDens$warpfd 89 | warpedX <- eval.fd(warpfdobj, arg_vals) 90 | warpedX[1, ] <- head(arg_vals, 1) 91 | warpedX[nrow(warpedX), ] <- tail(arg_vals, 1) 92 | ## compute warping functions 93 | ## funs <- apply(warpedX, 2, function(y) approxfun(arg_vals, y)) 94 | funs <- apply(warpedX, 2, approxfun, arg_vals) 95 | funsBack <- apply(warpedX, 2, function(a, b) approxfun(b, a), arg_vals) 96 | } 97 | 98 | 99 | names(funs) <- names(funsBack) <- samples # sampleNames(thisX) 100 | 101 | warped_landmark_matrix <- landmark_matrix 102 | leftBoard <- rightBoard <- vector("list", length(funs)) 103 | # chunkleftBoard<-chunkrightBoard<-rep(list(length(funs)),max(1:length(funs)%/%chunksize)+1) 104 | newRange <- c(Inf, -Inf) 105 | 106 | ## transform the raw data using the warping functions 107 | for (i in seq_along(funs)) { 108 | thisDat <- exprs(exp_data[[i]][, adt_marker_select]) 109 | lb <- thisDat < ranges[[i]][1, adt_marker_select] + eps 110 | lb[is.na(lb)] <- TRUE 111 | leftBoard[[i]] <- lb 112 | rb <- thisDat > ranges[[i]][2, adt_marker_select] - eps 113 | rb[is.na(rb)] <- TRUE 114 | rightBoard[[i]] <- rb 115 | # Include ALL data, none of this thresholding crap at borders. 116 | sel <- leftBoard[[i]] | rightBoard[[i]] 117 | # sel<-rep(FALSE,length(thisDat)) 118 | # browser(); 119 | newDat <- as.matrix(funs[[i]](thisDat[!sel, ])) 120 | newDat[is.na(newDat)] <- thisDat[!sel, ][is.na(newDat)] 121 | exprs(exp_data[[i]])[!sel, adt_marker_select] <- newDat 122 | warped_landmark_matrix[i, ] <- funs[[i]](landmark_matrix[i, ]) 123 | newRange[1] <- min(newRange[1], min(exprs(exp_data[[i]])[, adt_marker_select], na.rm = TRUE)) 124 | newRange[2] <- max(newRange[2], max(exprs(exp_data[[i]])[, adt_marker_select], na.rm = TRUE)) 125 | } 126 | ## make sure that edge envents are set to the extreme values 127 | ## of the warped data range and update the parameters slot 128 | ## accordingly 129 | for (i in seq_along(funs)) { 130 | minSel <- leftBoard[[i]] 131 | maxSel <- rightBoard[[i]] 132 | exprs(exp_data[[i]])[minSel, adt_marker_select] <- as.matrix(rep( 133 | newRange[1], 134 | sum(minSel, na.rm = TRUE) 135 | ), 136 | ncol = 1 137 | ) 138 | exprs(exp_data[[i]])[maxSel, adt_marker_select] <- as.matrix(rep( 139 | newRange[2], 140 | sum(maxSel, na.rm = TRUE) 141 | ), 142 | ncol = 1 143 | ) 144 | ip <- match(adt_marker_select, pData(parameters(exp_data[[i]]))$name) 145 | tmp <- parameters(exp_data[[i]]) 146 | oldRanges <- unlist(range(exp_data[[i]])[, adt_marker_select]) 147 | pData(tmp)[ip, c("minRange", "maxRange")] <- c( 148 | min(oldRanges[1], newRange[1]), 149 | max(oldRanges[2], newRange[2]) 150 | ) 151 | exp_data[[i]]@parameters <- tmp 152 | } 153 | 154 | 155 | # exp_data 156 | exp_data_aligned <- as(exp_data, "flowSet") 157 | phenoData(exp_data_aligned) <- phenoData(exp_data) 158 | exp_data_aligned <- exp_data_aligned[sampleNames(exp_data)] 159 | 160 | ## convert into matrix 161 | cell_x_adt_aligned <- c() 162 | for(sample_name in paste0(levels(cell_x_feature$sample), ".fcs")){ 163 | cell_x_adt_aligned <- rbind(cell_x_adt_aligned, exp_data_aligned[[sample_name]] %>% exprs) 164 | } 165 | return(cell_x_adt_aligned %>% data.frame) 166 | } 167 | -------------------------------------------------------------------------------- /manuscript/COVID19/ADTnorm_parallel.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | args = commandArgs(trailingOnly=TRUE) 3 | 4 | library(pacman) 5 | library(ggplot2) 6 | library(viridis) 7 | library(data.table) 8 | library(tidyr) 9 | library(gridExtra) 10 | library(magrittr) 11 | library(umap) 12 | library(ggpubr) 13 | library(SingleCellExperiment) 14 | library(RColorBrewer) 15 | library(ggridges) 16 | library(dplyr) 17 | library(CytofRUV) 18 | library(shapr) 19 | library(fda) 20 | library(pryr) 21 | library(Seurat) 22 | library(systemfonts) 23 | library(shiny) 24 | library(httpgd) 25 | library(ggrepel) 26 | require(magrittr) 27 | require(ggplot2) 28 | require(RColorBrewer) 29 | require(tidyr) 30 | require(ggridges) 31 | require(cytotidyr) 32 | library(harmony) 33 | library(zellkonverter) 34 | library(ADTnorm) 35 | # p_load(flowStats, flowCore, FlowSOM, ncdfFlow, flowViz, pdfCluster, cluster) 36 | 37 | ## ===================== 38 | ## study name and paths 39 | ## ===================== 40 | run_name = "COVID19" 41 | master_path = "./" 42 | out_path = paste0(master_path, "manuscript/results/", run_name) 43 | # fcs_path = paste0(out_path, "/FCS/") 44 | fig_path = paste0(out_path, "/Figures/", run_name) 45 | 46 | adt_data = readRDS(file = paste0(out_path, "/RDS/adt_data_", run_name, ".rds")) 47 | adt_feature = readRDS(file = paste0(out_path, "/RDS/adt_feature_", run_name, ".rds")) %>% data.frame 48 | 49 | arcsinh_param <- list(a = 1, b = 1 / 5, c = 0) 50 | 51 | method = "ADTnorm_sample_manual" 52 | adt_feature$sample = factor(adt_feature$site_sample) 53 | res_norm = ADTnorm( 54 | cell_x_adt = adt_data, 55 | cell_x_feature = adt_feature, 56 | marker_to_process = args[1], 57 | customize_landmark = FALSE, 58 | save_fig = TRUE, 59 | save_landmark = TRUE, 60 | save_outpath = out_path, 61 | study_name = run_name, 62 | trimodal_marker = c("CD4", "CD45RA", "CD49f", "CD62L"), 63 | bw_smallest_bi = 1.1, 64 | bw_smallest_tri = 1.1, 65 | bw_smallest_adjustments = list(CD3 = 1.1, CD4 = 1.1, CD8 = 1.1), 66 | shoulder_valley_slope = -1, 67 | exclude_zeroes = FALSE, 68 | bimodal_marker = NULL, 69 | positive_peak = NULL, 70 | quantile_clip = 1, 71 | peak_type = "mode", 72 | multi_sample_per_batch = TRUE, 73 | shoulder_valley = TRUE, 74 | valley_density_adjust = 3, 75 | landmark_align_type = "negPeak_valley_posPeak", 76 | midpoint_type = "valley", 77 | neg_candidate_thres = asinh(3/5 + 1), 78 | lower_peak_thres = 0.005, 79 | brewer_palettes = "Set1", 80 | detect_outlier_valley = FALSE, 81 | target_landmark_location = NULL, 82 | clean_adt_name = FALSE, 83 | override_landmark = NULL, 84 | verbose = TRUE) 85 | 86 | 87 | saveRDS(res_norm, file = paste0(out_path, "/RDS/adt_data_norm_", method, "_", args[1], "_", run_name, ".rds")) 88 | 89 | -------------------------------------------------------------------------------- /manuscript/COVID19/RNA_Mono_DE.R: -------------------------------------------------------------------------------- 1 | rna_data = readRDS(file = paste0(out_path, "/RDS/rna_data_", run_name, ".rds")) 2 | rm_ind = readRDS(file = paste0(out_path, "/RDS/rm_ind1_ind2_", run_name, ".rds")) 3 | 4 | sce = readH5AD("./scripts/COVID19/data/covid_portal_210320_with_raw.h5ad") 5 | 6 | rownames(rna_data) = colnames(sce@assays@data@listData$X[, -rm_ind]) 7 | colnames(rna_data) = rownames(sce@assays@data@listData$X)[1:24737] 8 | 9 | > which(colnames(rna_data) == "CD38") 10 | [1] 5476 11 | > which(colnames(rna_data) == "FCGR1A") 12 | [1] 1327 13 | > which(colnames(rna_data) == "SIGLEC1") 14 | [1] 21457 15 | 16 | 17 | rna_data[cell_hd, which(colnames(rna_data) == "CD38")] %>% summary 18 | rna_data[cell_pt, which(colnames(rna_data) == "CD38")] %>% summary 19 | 20 | rna_data[cell_hd, which(colnames(rna_data) == "FCGR1A")] %>% summary 21 | rna_data[cell_pt, which(colnames(rna_data) == "FCGR1A")] %>% summary 22 | 23 | rna_data[cell_hd, which(colnames(rna_data) == "SIGLEC1")] %>% summary 24 | rna_data[cell_pt, which(colnames(rna_data) == "SIGLEC1")] %>% summary 25 | 26 | 27 | rna_norm = sce@assays@data@listData$X[1:24737, -rm_ind] %>% t() 28 | 29 | ## for each mono cell type 30 | rna_mono_summary = c() 31 | for(cell_type_each in c("CD16_mono", "CD14_mono", "CD83_CD14_mono")){ 32 | cell_hd = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary == "Healthy") 33 | cell_pt = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary %in% c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical")) 34 | 35 | rna_mono_summary = data.frame( 36 | gene_exp = rna_norm[cell_hd, which(colnames(rna_norm) == "CD38")], 37 | cell_type = cell_type_each, 38 | status = "Healthy", 39 | gene = "CD38") %>% rbind(rna_mono_summary) 40 | 41 | rna_mono_summary = data.frame( 42 | gene_exp = rna_norm[cell_pt, which(colnames(rna_norm) == "CD38")], 43 | cell_type = cell_type_each, 44 | status = "Patient", 45 | gene = "CD38") %>% rbind(rna_mono_summary) 46 | 47 | rna_mono_summary = data.frame( 48 | gene_exp = rna_norm[cell_hd, which(colnames(rna_norm) == "FCGR1A")], 49 | cell_type = cell_type_each, 50 | status = "Healthy", 51 | gene = "FCGR1A") %>% rbind(rna_mono_summary) 52 | 53 | rna_mono_summary = data.frame( 54 | gene_exp = rna_norm[cell_pt, which(colnames(rna_norm) == "FCGR1A")], 55 | cell_type = cell_type_each, 56 | status = "Patient", 57 | gene = "FCGR1A") %>% rbind(rna_mono_summary) 58 | 59 | rna_mono_summary = data.frame( 60 | gene_exp = rna_norm[cell_hd, which(colnames(rna_norm) == "SIGLEC1")], 61 | cell_type = cell_type_each, 62 | status = "Healthy", 63 | gene = "SIGLEC1") %>% rbind(rna_mono_summary) 64 | 65 | rna_mono_summary = data.frame( 66 | gene_exp = rna_norm[cell_pt, which(colnames(rna_norm) == "SIGLEC1")], 67 | cell_type = cell_type_each, 68 | status = "Patient", 69 | gene = "SIGLEC1") %>% rbind(rna_mono_summary) 70 | 71 | } 72 | rna_mono_summary$status = factor(rna_mono_summary$status, levels = c("Patient", "Healthy")) 73 | 74 | rna_mono_summary %>% ggplot(aes(x = gene, y = gene_exp, fill = status)) + 75 | geom_boxplot() + 76 | facet_wrap(~cell_type, scales = "free_y") + 77 | theme_bw(base_size = 25) + 78 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 79 | scale_fill_brewer(palette = "Set1") + 80 | xlab("") + 81 | ylab("Normalized Gene Expression") 82 | 83 | pdf("./results/COVID19/Figures/RNA_Mono_DE.pdf", width = 17, height = 8) 84 | rna_mono_summary %>% dplyr::group_by(cell_type, status, gene) %>% dplyr::summarize(mean_gene_exp = mean(gene_exp), prop_pos_cell = sum(gene_exp > 0)/length(gene_exp) * 100) %>% ggplot(aes(x = gene, y = status, fill = mean_gene_exp, size = prop_pos_cell)) + 85 | geom_point(shape = 21) + 86 | facet_wrap(~cell_type) + 87 | theme_bw(base_size = 25) + 88 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 89 | scale_fill_gradient(low = "white", high = "red") + 90 | xlab("") + 91 | ylab("") + 92 | scale_size_continuous(range = c(1, 40)) + 93 | theme(legend.position = "top") 94 | dev.off() 95 | 96 | 97 | 98 | ## for each mono cell type 99 | rna_mono_summary = c() 100 | for(cell_type_each in c("CD16_mono", "CD14_mono", "CD83_CD14_mono")){ 101 | cell_hd = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary == "Healthy") 102 | cell_pt = which(adt_feature$full_clustering == cell_type_each & adt_feature$Status_on_day_collection_summary %in% c("Asymptomatic", "Mild", "Moderate", "Severe", "Critical")) 103 | 104 | rna_mono_summary = data.frame( 105 | gene_exp = rna_data[cell_hd, which(colnames(rna_data) == "CD38")], 106 | cell_type = cell_type_each, 107 | status = "Healthy", 108 | gene = "CD38") %>% rbind(rna_mono_summary) 109 | 110 | rna_mono_summary = data.frame( 111 | gene_exp = rna_data[cell_pt, which(colnames(rna_data) == "CD38")], 112 | cell_type = cell_type_each, 113 | status = "Patient", 114 | gene = "CD38") %>% rbind(rna_mono_summary) 115 | 116 | rna_mono_summary = data.frame( 117 | gene_exp = rna_data[cell_hd, which(colnames(rna_data) == "FCGR1A")], 118 | cell_type = cell_type_each, 119 | status = "Healthy", 120 | gene = "FCGR1A") %>% rbind(rna_mono_summary) 121 | 122 | rna_mono_summary = data.frame( 123 | gene_exp = rna_data[cell_pt, which(colnames(rna_data) == "FCGR1A")], 124 | cell_type = cell_type_each, 125 | status = "Patient", 126 | gene = "FCGR1A") %>% rbind(rna_mono_summary) 127 | 128 | rna_mono_summary = data.frame( 129 | gene_exp = rna_data[cell_hd, which(colnames(rna_data) == "SIGLEC1")], 130 | cell_type = cell_type_each, 131 | status = "Healthy", 132 | gene = "SIGLEC1") %>% rbind(rna_mono_summary) 133 | 134 | rna_mono_summary = data.frame( 135 | gene_exp = rna_data[cell_pt, which(colnames(rna_data) == "SIGLEC1")], 136 | cell_type = cell_type_each, 137 | status = "Patient", 138 | gene = "SIGLEC1") %>% rbind(rna_mono_summary) 139 | 140 | } 141 | rna_mono_summary$status = factor(rna_mono_summary$status, levels = c("Patient", "Healthy")) 142 | rna_mono_summary %>% ggplot(aes(x = gene, y = gene_exp, fill = status)) + 143 | geom_boxplot() + 144 | facet_wrap(~cell_type, scales = "free_y") + 145 | theme_bw(base_size = 25) + 146 | theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 147 | scale_fill_brewer(palette = "Set1") + 148 | xlab("") + 149 | ylab("Raw Gene Expression") 150 | 151 | wilcox.test(rna_data[cell_hd, which(colnames(rna_data) == "CD38")], 152 | rna_data[cell_pt, which(colnames(rna_data) == "CD38")]) 153 | 154 | rna_data[cell_hd, which(colnames(rna_data) == "FCGR1A")] %>% summary 155 | rna_data[cell_pt, which(colnames(rna_data) == "FCGR1A")] %>% summary 156 | 157 | rna_data[cell_hd, which(colnames(rna_data) == "SIGLEC1")] %>% summary 158 | rna_data[cell_pt, which(colnames(rna_data) == "SIGLEC1")] %>% summary 159 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(ADTnorm) 11 | 12 | test_check("ADTnorm") 13 | -------------------------------------------------------------------------------- /tests/testthat/test-ADTnorm.R: -------------------------------------------------------------------------------- 1 | test_that("ADTnorm works", { 2 | data(cell_x_adt) 3 | data(cell_x_feature) 4 | save_outpath <- tempdir() 5 | study_name <- "ADTnorm_demoRun" 6 | 7 | suppressWarnings({ 8 | res <- ADTnorm( 9 | cell_x_adt = cell_x_adt, 10 | cell_x_feature = cell_x_feature, 11 | save_outpath = save_outpath, 12 | study_name = study_name, 13 | marker_to_process = c("CD3", "CD4", "CD8") 14 | ) 15 | }) 16 | 17 | expect_type(res, "list") 18 | expect_equal(nrow(res), 422682) 19 | expect_equal(ncol(res), 3) 20 | }) 21 | -------------------------------------------------------------------------------- /vignettes/.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yezhengSTAT/ADTnorm/02321b18ccec3ec5ebfcbbb40c28c3a3edb2ec0e/vignettes/.RData --------------------------------------------------------------------------------