├── .Rbuildignore ├── .gitattributes ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .lintr ├── DESCRIPTION ├── FAQs.md ├── InSituType.Rproj ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── RcppExports.R ├── chooseClusterNumber.R ├── colorCellTypes.R ├── data.R ├── fastCohorting.R ├── find_anchor_cells.R ├── flightpath_layout.R ├── gen_profiles_protein.R ├── geoSketch.R ├── getProfiles.R ├── getSpatialContext.R ├── insitutype.R ├── insitutypeML.R ├── nbclust.R ├── refineClusters.R ├── rescaleProfiles.R ├── spatialUpdate.R └── utilities.R ├── README.md ├── azure-pipelines.yml ├── data ├── human_signature.RData ├── iocolors.RData ├── ioprofiles.RData ├── mini_nsclc.RData ├── mouse_signature.RData ├── tonsil_annotation.RData ├── tonsil_protein.RData └── tonsil_reference_profile.RData ├── man ├── Estep.Rd ├── Mstep.Rd ├── alignGenes.Rd ├── chooseClusterNumber.Rd ├── choose_anchors_from_stats.Rd ├── colorCellTypes.Rd ├── estimateBackground.Rd ├── estimatePlatformEffects.Rd ├── fastCohorting.Rd ├── find_anchor_cells.Rd ├── flightpath_layout.Rd ├── flightpath_plot.Rd ├── gen_profiles_protein_annotation.Rd ├── gen_profiles_protein_expression.Rd ├── geoSketch.Rd ├── geoSketch_get_plaid.Rd ├── geoSketch_sample_from_plaids.Rd ├── getMeanClusterConfidence.Rd ├── getProteinParameters.Rd ├── getRNAprofiles.Rd ├── getSpatialContext.Rd ├── get_anchor_stats.Rd ├── get_neighborhood_expression.Rd ├── human_signature.Rd ├── insitutype.Rd ├── insitutypeML.Rd ├── iocolors.Rd ├── ioprofiles.Rd ├── ismax.Rd ├── lldist.Rd ├── lls_protein.Rd ├── lls_rna.Rd ├── logliks2probs.Rd ├── mini_nsclc.Rd ├── mouse_signature.Rd ├── nbclust.Rd ├── nearestNeighborGraph.Rd ├── neighbor_colMeans.Rd ├── neighbor_colSums.Rd ├── numCores.Rd ├── prepDataForSketching.Rd ├── probs2logliks.Rd ├── radiusBasedGraph.Rd ├── refineAnchors.Rd ├── refineClusters.Rd ├── spatialUpdate.Rd ├── tonsil_annotation.Rd ├── tonsil_protein.Rd ├── tonsil_reference_profile.Rd ├── updateProfilesFromAnchors.Rd ├── 
updateReferenceProfiles.Rd └── update_logliks_with_cohort_freqs.Rd ├── reqs.md ├── specs.md ├── src ├── Makevars ├── Makevars.win ├── RcppExports.cpp └── rcpparma_dnbinom_sparse.cpp ├── tests ├── testthat.R └── testthat │ ├── test-colorCellTypes.R │ ├── test_RCppExports.R │ ├── test_flightpath.R │ ├── test_getProfiles.R │ ├── test_getSpatialContext.R │ ├── test_insitutype_Protein.R │ ├── test_insitutype_RNA.R │ ├── test_refinecells_cell_merging_logic.R │ └── test_spatialUpdate.R └── vignettes ├── NSCLC-RNA_InsituType-vignette.Rmd ├── NSCLC-RNA_InsituType-vignette.html ├── NSCLC-clustering-SingleCellExperiment-vignette.Rmd ├── NSCLC-clustering-SingleCellExperiment-vignette.html ├── NSCLC-clustering-vignette.Rmd ├── NSCLC-clustering-vignette.html ├── NSCLC-semi-supervised-cell-typing-vignette.Rmd ├── NSCLC-semi-supervised-cell-typing-vignette.html ├── NSCLC-supervised-cell-typing-vignette.Rmd ├── NSCLC-supervised-cell-typing-vignette.html ├── TONSIL_Protein_Insitu_Cell_Typing-vignette.Rmd └── TONSIL_Protein_Insitu_Cell_Typing-vignette.html /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^azure-pipelines\.yml$ 4 | ^\.vscode$ 5 | ^reqs\.md$ 6 | ^specs\.md$ 7 | ^LICENSE\.md$ 8 | ^\.lintr$ 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | on: 2 | issues: 3 | types: [opened] 4 | 5 | 6 | jobs: 7 | welcome: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: EddieHubCommunity/gh-action-community/src/welcome@main 11 | with: 12 | github-token: ${{ secrets.GITHUB_TOKEN }} 13 | issue-message: 
"Thank you for contacting us about our tools! To receive assistance, kindly email support.spatial@bruker.com with detailed information about your issue. If applicable, attach a screenshot of any encountered errors and include a copy of the modified script in Notepad. Our customer support team will help facilitate a review and resolution of the issue." 14 | footer: "Thank you for choosing Bruker Spatial Biology,\nBruker Spatial Biology Dev Team" 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | *.Rproj 3 | .Rhistory 4 | .RData 5 | .Ruserdata 6 | .DS_Store 7 | *.o 8 | *.so 9 | *.dll 10 | inst/doc 11 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | linters: linters_with_defaults( 2 | line_length_linter(120), 3 | object_name_linter("camelCase") 4 | ) 5 | encoding: "UTF-8" 6 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: InSituType 2 | Type: Package 3 | Title: An R package for performing cell typing in SMI and other single cell data 4 | Version: 2.0 5 | Authors@R: c(person("Patrick", "Danaher", email = "pdanaher@nanostring.com", role = c("aut")), 6 | person("Sangsoon", "Woo", email = "sawoo@nanostring.com", role = c("aut")), 7 | person("Zhi", "Yang", email = "zyang@nanostring.com", role = c("aut")), 8 | person("David", "Ross", email = "dross@nanostring.com", role = c("aut", "cre")), 9 | person("Lidan", "Wu", email = "lwu@nanostring.com", role = c("aut")), 10 | person("Yongfang", "Lu", email = "ylu@nanostring.com", role = c("aut"))) 11 | Description: Insitutype is an algorithm for performing cell typing in single cell 12 | spatial transcriptomics data, such 
as is generated by the CosMx platform. 13 | It can perform supervised cell typing from reference profiles, unsupervised clustering, 14 | or semi-supervised cell typing in which both reference cell types and de novo 15 | clusters are fit. 16 | Imports: 17 | data.table, 18 | dplyr, 19 | fastglm, 20 | ggplot2, 21 | graphics, 22 | grDevices, 23 | irlba, 24 | lsa, 25 | magrittr, 26 | Matrix, 27 | mclust, 28 | methods, 29 | Rcpp (>= 1.0.9), 30 | rlang, 31 | scales, 32 | SingleCellExperiment, 33 | sparseMatrixStats, 34 | spatstat.geom, 35 | stats, 36 | SummarizedExperiment, 37 | tibble, 38 | umap, 39 | utils, 40 | uwot 41 | License: NanoString Technologies, Inc. Software License Agreement for Non-Commercial Use 42 | Encoding: UTF-8 43 | LazyData: true 44 | Suggests: 45 | rmarkdown, 46 | knitr, 47 | testthat 48 | VignetteBuilder: knitr 49 | Depends: 50 | R (>= 3.5.0) 51 | RoxygenNote: 7.3.1 52 | LinkingTo: Rcpp, RcppArmadillo 53 | -------------------------------------------------------------------------------- /FAQs.md: -------------------------------------------------------------------------------- 1 | # FAQs and advanced methods 2 | 3 | #### Topics 4 | 5 | - [Workflow overview](#workflow-overview) 6 | - [Choosing the n_clust argument](#choosing-nclust) 7 | - [Updating reference profiles](#updating-reference-profiles) 8 | - [On confidence scores](#confidence-scores) 9 | - [Which genes to use](#which-genes-to-use) 10 | - [Interpreting clustering results](#interpreting-clustering-results) 11 | - [Targeted subclustering](#targeted-subclustering) 12 | 13 | ## Workflow overview 14 | The broad Insitutype workflow is as follows: 15 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/45d89004-dc46-40a1-bde8-33d204e0f0b8) 16 | 17 | 18 | ## Unsupervised vs. Supervised vs. Semi-supervised cell typing 19 | InSituType runs in 3 modes: 20 | - Supervised: call only cell types defined in reference profiles. Set `nclust = 0` to run in fully supervised mode. 
21 | - Unsupervised: de novo clustering, with no reference cell types. Set `reference_profiles = NULL` to run in unsupervised mode. 22 | - Semi-supervised: find new clusters while also calling reference cell types. 23 | 24 | Considerations for choosing a workflow: 25 | - Supervised is most convenient if you are confident that your reference profiles contain all the cell types in your dataset. 26 | However, many reference profiles from scRNA-seq don't fit spatial data well, so using reference profiles can be challenging. 27 | - Semi-supervised mode is the most powerful but most challenging workflow. We use this in >80% of analyses. 28 | Success hinges on how well the reference profiles are calibrated to spatial data. InSituType tries to 29 | perform this calibration using anchor cells, but this does not always succeed. 30 | - We recommend trying semi-supervised cell typing first, assuming there are new clusters you expect to discover. 31 | - Unsupervised has no difficulty with poorly-calibrated reference profiles, but it requires you to name each cluster, 32 | which can be onerous. It may also fail to define distinctions that are important to you. 33 | 34 | ## Choosing reference profiles 35 | Keep in mind the following when selecting reference profiles: 36 | - Quality of scRNA-seq references varies greatly. Finding mis-annotated cell types is not uncommon, 37 | and for smaller datasets, profiles of rare cell types will be noisy. Exercise some skepticism. 38 | - Large platform effects separate scRNA-seq and spatial platforms. When possible, use a reference from the same platform as your data. 
39 | - A large collection of single cell references can be found here: https://github.com/Nanostring-Biostats/cellprofilelibrary 40 | - A growing collection of CosMx references is here: https://github.com/Nanostring-Biostats/CosMx-Cell-Profiles 41 | 42 | 43 | ## Choosing nclust 44 | We recommend choosing a slightly generous value of `nclust`, then using `refineClusters` to condense the resulting clusters. For example, if you're running semi-supervised cell typing and you expect to find 5 new clusters, set `nclust = 8`. Or for unsupervised clustering with an expectation of 12 cell types, set `nclust = 16`. 45 | It's generally easy to tell when two clusters come from the same cell type: they'll be adjacent in UMAP space, and the flightpath plot will show them frequently confused with each other. 46 | 47 | Final note: Insitutype splits big clusters with higher counts more aggressively than other clusters. For example, in a tumor study, it will subcluster tumor cells many times before it subclusters e.g. fibroblasts. The simplest solution is to increase nclust as needed, then condense the over-clustered cell type as desired. 48 | 49 | 50 | ## Updating reference profiles 51 | 52 | Cell typing's biggest challenge is using a reference dataset from a different platform. Platform effects between scRNA-seq and spatial platforms can be profound. 53 | Insitutype has 3 treatments for reference profiles: 54 | 1. Use the reference profile matrix as-is 55 | 2. Choose anchor cells, then rescale genes based on estimated platform effects. (Less aggressive, only fits gene-level effects.) 56 | 3. Choose anchor cells, then refit the reference profiles entirely. (Most aggressive, fits a new value for every gene x cell type.) 
57 | 58 | We suggest using the below flowchart to choose from among these options: 59 | 60 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/824dec47-2221-4fe8-92a0-15693c749d55) 61 | 62 | For more on starting with a coarse reference then subclustering, see the "Targeted subclustering" discussion further on. 63 | 64 | ## Confidence Scores 65 | Insitutype returns a posterior probability for each cell type call. In practice, we have found these probabilities to be overconfident. 66 | Below is an image from the preprint demonstrating this phenomenon. For various posterior probability bins, it shows the accuracy rate actually achieved (with a confidence interval). 67 | 68 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/f02df11d-405b-411d-8049-4ab3d021d0a4) 69 | 70 | So 100% confident probabilities appear to be accurate, but lower probabilities are overconfident. 71 | Also, remember that these probabilities are based on all the information available to the model. They don't consider that the model might be missing cell types, or that the reference profiles could be incorrect. 72 | 73 | In short, the posterior probabilities are useful for differentiating strong from weak cell typing calls, but you should be conservative when choosing a threshold. We often use a threshold of 80%, calling cells below that confidence as "unclassified". 74 | 75 | ## Which genes to use 76 | 77 | Insitutype was designed using 1000-plex CosMx data, where we found it most powerful to use all genes in the panel. 78 | In our new 6000-plex data, it's worth considering using Insitutype on a well-chosen subset of genes. As a rule of thumb, genes should be retained if either of the following applies: 79 | 1. They have solidly above-background expression in the CosMx data 80 | 2. 
They have moderate-to-high expression in at least one reference profile 81 | 82 | For typical 6000plex experiments, we speculate that cell typing using somewhere between 3000-5000 genes would be optimal. 83 | 84 | 85 | ## Interpreting clustering results 86 | 87 | Once Insitutype has run, take time to scrutinize the results. You'll need to: 88 | 1. Confirm cell types from the reference profiles are correct 89 | 2. Interpret new clusters 90 | 91 | First, we recommend the following QC plots: 92 | 93 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/aa2c47ba-8c4e-412d-b790-5205ae9739fc) 94 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/f1f1694c-c0df-41fe-a823-ca34a16d553b) 95 | 96 | Example code for generating the above profiles heatmap: 97 | ``` 98 | pdf("", height = 20, width = 6) 99 | mat <- res$profiles # ("res" is the insitutype output) 100 | mat <- sweep(mat, 1, pmax(apply(mat, 1 ,max), 0.1), "/") 101 | pheatmap(mat, col = colorRampPalette(c("white", "darkblue"))(100), 102 | fontsize_row = 5) 103 | dev.off() 104 | ``` 105 | 106 | We have found the below workflows to be effective and efficient: 107 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/3adda877-53e7-48ca-8781-927e77739943) 108 | 109 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/24a28e1b-e1bf-4be1-bf38-0c4ebeb574d4) 110 | 111 | 112 | 113 | ## Targeted subclustering 114 | 115 | This is an advanced method. Sometimes it can be hard to subcluster a cell type if many of its genes are impacted by contamination from segmentation errors. Immune cells in the context of tumors are a good example. 116 | To subcluster, say, T-cells in a tumor, you might initially call a single T-cell cluster. Then, considering just these cells and just the genes unlikely to be contaminated in T-cells (genes with high T-cell expression or with low expression in surrounding cell types), run unsupervised Insitutype. 
117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /InSituType.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | NanoString Technologies, Inc. 2 | Software License Agreement for Non-Commercial Use 3 | By downloading, installing, accessing, modifying or otherwise making use of the Program (defined below), you agree to be bound by the terms and conditions of this Software License Agreement for Non-Commercial Use (this “License”). 4 | 1. DEFINITIONS 5 | 1.1. “Affiliate” means, with respect to an individual or entity, another individual or entity: (i) on whose behalf such individual or entity is acting, or (ii) that exercises control, is controlled by, or is under common control with such individual or entity. For the purposes of this definition, the term “control” means the right, whether by ownership, exercise of voting rights, contract, or otherwise, to direct the actions of an individual or entity. 6 | 1.2. “Distribute” means to distribute, share, make available, or otherwise provide the Program or Modified Program, as applicable, or access thereto (including via a computer network) to any third party. 7 | 1.3. “Licensor” means the individual or entity licensing the rights granted in this License. 8 | 1.4. 
“Licensee” or “you” means the individual or entity receiving or exercising the rights granted under this License, provided that the individual or entity is not a NanoString Competitor. 9 | 1.5. “Non-Commercial Use” means any use where profit or other commercial benefit is not a direct or indirect motive or intended result. 10 | 1.6. “Modified Program” means a derivative work of, or a work that is based on, uses or incorporates, the Program (whether or not in combination with other works, materials or content). 11 | 1.7. “NanoString” means NanoString Technologies, Inc. 12 | 1.8. “NanoString Competitor” means any individual or entity that directly or indirectly competes with NanoString or any of NanoString’s Affiliates or whose Affiliate directly or indirectly competes with NanoString or any of NanoString’s Affiliates. 13 | 1.9. “Program” means the copyrightable work of authorship, program, code, or software licensed under this License. 14 | 2. LICENSE 15 | 2.1. Grant. Subject to the terms and conditions of this License, Licensor hereby grants to Licensee a worldwide, royalty-free, non-exclusive, revocable license to: (a) use, Distribute, and reproduce the Program, and (b) use, create, Distribute, and reproduce Modified Programs, in each case, solely for your internal, Non-Commercial Use. No rights are granted to NanoString Competitors. 16 | 2.2. No Endorsement. Nothing in this License may be construed as permission to assert or imply that Licensor, NanoString, or other contributors to the Program sponsors, endorses, or is otherwise connected with the Licensee or the entity or institution that Licensee represents. 17 | 2.3. Trademarks. Trademark rights are not licensed to you under this License. 18 | 2.4. Grant of Patent License. 
Subject to the terms and conditions of this License, NanoString hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, import, and otherwise transfer the Program, where such license applies only to those patent claims licensable by NanoString that are necessarily infringed by Licensee alone or by combination of its modification(s) to the Program or Modified Program to which such modification(s) was submitted. If you institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program, Modified Program, or a modification incorporated within the Program or a Modified Program constitutes direct or contributory patent infringement, then any patent licenses granted to you under this License for the Program or any such Modified Program shall terminate as of the date such litigation is filed. 19 | 3. CONDITIONS TO THE RIGHT TO DISTRIBUTE 20 | 3.1. Notices. If you Distribute the Program or a Modified Program in any form, you must also provide to the recipient: 21 | 3.1.1. a copy of this License; and 22 | 3.1.2. for Modified Programs, prominent notices identifying the portions of the Modified Program that have been modified, stating that you have modified the Program. 23 | 3.2. Attribution. Except as otherwise expressly permitted under this License, you must keep intact, and you may not modify or remove, any notices, disclaimers, or attributions included in or provided with the Program. In addition, you must also include a prominent hypertext link back to NanoString’s website at www.nanostring.com. 24 | 3.3. License. You may only Distribute the Program or the Modified Program under the terms of this License (or any later version, at your election). 
You may not offer or impose any additional or different terms or conditions that, or take any measures to, restrict the exercise of the rights granted under this License. 25 | 4. NO REPRESENTATIONS OR WARRANTIES; LIMITATIONS OF LIABILITY 26 | 4.1. Disclaimer. UNLESS OTHERWISE AGREED BY LICENSOR IN WRITING, TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, LICENSOR OFFERS THE PROGRAM AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND WITH REGARD TO THE PROGRAM, WHETHER EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. THE LICENSOR DOES NOT REPRESENT OR WARRANT THAT THE PROGRAM WILL BE ERROR FREE AND DOES NOT PROMISE THAT ANY SUCH ERRORS WILL BE CORRECTED. 27 | SOME JURISDICTIONS DO NOT ALLOW FOR THE EXCLUSION OF IMPLIED WARRANTIES, SO THE FOREGOING MAY NOT APPLY TO YOU. 28 | 4.2. Limitation of Liability. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL THE LICENSOR OR NANOSTRING BE LIABLE TO YOU UNDER ANY LEGAL THEORY FOR ANY DAMAGES OF ANY KIND, INCLUDING ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF OR RELATED TO THE PROGRAM OR USE THEREOF, EVEN IF LICENSOR OR NANOSTRING HAS BEEN ADVISED OF THE POSSIBILITY OR LIKELIHOOD OF SUCH DAMAGES. 29 | 5. MISCELLANEOUS 30 | 5.1. Right to Enforce. NanoString is an express third-party beneficiary of this License and will be entitled to enforce the provisions of this License as if it were a party hereto. 31 | 5.2. Waiver; Amendment. No term or provision hereof will be considered waived by the Licensor, and no breach excused by Licensor, unless such waiver or consent is in writing and signed by an authorized representative of Licensor. 
The waiver by Licensor of, or consent by Licensor to, a breach of any provision of this License by the Licensee, will not constitute, operate or be construed as a waiver of, consent to, or excuse of any other or subsequent breach by Licensee. This License may be amended or modified only by an agreement in writing signed by an authorized representative of each of Licensor and Licensee. 32 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(Estep) 4 | export(Mstep) 5 | export(chooseClusterNumber) 6 | export(choose_anchors_from_stats) 7 | export(colorCellTypes) 8 | export(estimatePlatformEffects) 9 | export(fastCohorting) 10 | export(find_anchor_cells) 11 | export(flightpath_layout) 12 | export(flightpath_plot) 13 | export(getProteinParameters) 14 | export(getRNAprofiles) 15 | export(getSpatialContext) 16 | export(get_anchor_stats) 17 | export(insitutype) 18 | export(insitutypeML) 19 | export(lls_protein) 20 | export(lls_rna) 21 | export(numCores) 22 | export(refineAnchors) 23 | export(refineClusters) 24 | export(spatialUpdate) 25 | export(updateProfilesFromAnchors) 26 | export(updateReferenceProfiles) 27 | exportMethods(insitutype) 28 | exportMethods(insitutypeML) 29 | exportPattern("^[[:alpha:]]+") 30 | import(ggplot2) 31 | importFrom(Matrix,colSums) 32 | importFrom(Matrix,rowMeans) 33 | importFrom(Matrix,rowSums) 34 | importFrom(Matrix,sparseMatrix) 35 | importFrom(Matrix,t) 36 | importFrom(Rcpp,evalCpp) 37 | importFrom(SingleCellExperiment,SingleCellExperiment) 38 | importFrom(SummarizedExperiment,assay) 39 | importFrom(data.table,data.table) 40 | importFrom(data.table,melt) 41 | importFrom(data.table,rbindlist) 42 | importFrom(dplyr,filter) 43 | importFrom(dplyr,group_by) 44 | importFrom(dplyr,summarise_all) 45 | importFrom(grDevices,col2rgb) 46 | importFrom(grDevices,colors) 47 
| importFrom(graphics,lines) 48 | importFrom(graphics,par) 49 | importFrom(graphics,plot) 50 | importFrom(irlba,irlba) 51 | importFrom(irlba,prcomp_irlba) 52 | importFrom(lsa,cosine) 53 | importFrom(magrittr,"%>%") 54 | importFrom(mclust,Mclust) 55 | importFrom(mclust,mclustBIC) 56 | importFrom(mclust,predict.Mclust) 57 | importFrom(methods,as) 58 | importFrom(methods,is) 59 | importFrom(rlang,.data) 60 | importFrom(scales,alpha) 61 | importFrom(spatstat.geom,closepairs) 62 | importFrom(spatstat.geom,nncross) 63 | importFrom(spatstat.geom,nndist) 64 | importFrom(spatstat.geom,nnwhich) 65 | importFrom(spatstat.geom,ppp) 66 | importFrom(stats,dnbinom) 67 | importFrom(stats,lm) 68 | importFrom(stats,qnorm) 69 | importFrom(stats,rnorm) 70 | importFrom(tibble,column_to_rownames) 71 | importFrom(tibble,rownames_to_column) 72 | importFrom(umap,umap) 73 | importFrom(utils,data) 74 | importFrom(uwot,umap_transform) 75 | useDynLib(InSituType, .registration = TRUE) 76 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # InSituType 2.0.0 2 | 3 | * Enable use in protein datasets via the assay_type argument. This required a major overhaul under the hood, but has little impact on existing RNA workflows. 4 | * More advanced methods for updating reference profiles via anchor cells, implemented in `updateReferenceProfiles`. 5 | * New function `spatialUpdate` for using alternative data types (e.g. space or immunofluorescence) and the Insitutype likelihood framework to update cell typing results from any method. 6 | * New function `getSpatialContext` for conveniently calculating cells' spatial contexts / neighborhood expression. 7 | * New functions `getRNAprofiles` and `getProteinParameters`, which serve as user-facing tools for getting profile matrices. 
8 | 9 | # InSituType 1.2.3 10 | 11 | * handle collinearity issues with fastCohorting: 12 | Reduce to 2 PC's. 13 | If this fails, then try successively smaller # of cohorts with the 2 pc's. 14 | 15 | # InSituType 1.2.2 16 | 17 | * Add Compatibility of assay_type and platform effect correction 18 | 19 | # InSituType 1.2.1 20 | 21 | * Create "undefined" profile for cells with zero counts 22 | 23 | # InSituType 1.2.0 24 | 25 | * Also cluster continuous data from protein assay 26 | 27 | # InSituType 1.1.1 28 | 29 | * Support platform effect correction 30 | * Support anchor refinement via UMAP projection 31 | 32 | # InSituType 1.1.0 33 | 34 | * Support matrices with more than 4B elements 35 | 36 | # InSituType 1.0.0 37 | 38 | * License updated 39 | * lldist parallelized with OpenMP 40 | 41 | # InSituType 0.99.4 42 | 43 | * Re-submission to Bioconductor 44 | 45 | # InSituType 0.99.3 46 | 47 | * Merge subclustering fix 48 | 49 | # InSituType 0.99.2 50 | 51 | * Optionally use SingleCellExperiment class 52 | 53 | # InSituType 0.99.1 54 | 55 | * Added reference to CosMx paper and dataset 56 | 57 | # InSituType 0.99.0 58 | 59 | * Submission to Bioconductor 3.16 60 | 61 | # InSituType 1.1.1 62 | 63 | * Updated `flightpath_layout.R` to save the plot in a temp folder in the current work directory 64 | 65 | # InSituType 1.1.0 66 | 67 | * Fix several places counts matrix was being converted to dense to calculate a statistic 68 | * Revert conversion to `sparse matrix` of `dense` `mu` matrix and result from `dnbinom` 69 | 70 | # InSituType 1.0.0 71 | 72 | * Integrated rcpp support for the package 73 | * Added `dnbinom` for `sparse matrices` 74 | * Updated the `unit tests` 75 | * Removed `.o` files in `src` folder 76 | 77 | # InSituType 0.1.2 78 | 79 | * Updated package dependencies 80 | 81 | # InSituType 0.1.1 82 | 83 | * Added `lsa`, `SpatialDecon`, `irlba`, `mclust`, `rmarkdown` to the `DESCRIPTION` file 84 | * Fixed a Roxygen example for the R function `geoSketch` where it was 
trying to use `Ptolemy` and `Giotto` packages that are not being used within the package 85 | 86 | # InSituType 0.1.0 87 | 88 | * Added a `NEWS.md` file to track changes to the package. 89 | * Added BioConductor package dependencies (notably SpatialDecon and lsa) 90 | * Renamed vignettes to allow for compilation 91 | * Deleted old vignettes (labelled OLD) 92 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' Sum from negative binomial density function 5 | #' 6 | #' Probability density function of the negative binomial distribution (written in C++). This R function is a thin wrapper that forwards its five arguments, unchanged and in order, to the registered native routine `_InSituType_lls_rna` via `.Call`. 7 | #' 8 | #' @param mat dgCMatrix expression counts 9 | #' @param bgsub vector of background expression per cell 10 | #' @param x numeric expression for reference profiles 11 | #' @param bg numeric background level 12 | #' @param size_dnb int Dispersion parameter 13 | #' 14 | #' @return rowSums for matrix of densities, exactly as returned by the native routine 15 | #' @useDynLib InSituType, .registration = TRUE 16 | #' @importFrom Rcpp evalCpp 17 | #' @exportPattern "^[[:alpha:]]+" 18 | #' @export 19 | lls_rna <- function(mat, bgsub, x, bg, size_dnb) { 20 | .Call(`_InSituType_lls_rna`, mat, bgsub, x, bg, size_dnb) 21 | } 22 | 23 | #' Sum from Gaussian density function 24 | #' 25 | #' Probability density function of the Gaussian distribution (written in C++). This R function is a thin wrapper that forwards its four arguments, unchanged and in order, to the registered native routine `_InSituType_lls_protein` via `.Call`. 26 | #' 27 | #' @param mat dgCMatrix expression matrix 28 | #' @param bgsub vector of background expression per cell 29 | #' @param x numeric expression for reference profiles 30 | #' @param xsd numeric expression for reference SD profiles 31 | #' 32 | #' @return rowSums for matrix of densities, exactly as returned by the native routine 33 | #' @useDynLib InSituType, .registration = TRUE 34 | #' @importFrom Rcpp evalCpp 35 | #' @exportPattern "^[[:alpha:]]+" 36 | #' @export 37 | 
lls_protein <- function(mat, bgsub, x, xsd) { 38 | .Call(`_InSituType_lls_protein`, mat, bgsub, x, xsd) 39 | } 40 | 41 | -------------------------------------------------------------------------------- /R/chooseClusterNumber.R: -------------------------------------------------------------------------------- 1 | #' Estimate the correct number of clusters using a subset of the data 2 | #' 3 | #' For a subset of the data, perform clustering under a range of cluster numbers. 4 | #' Report on loglikelihood vs. number of clusters, and suggest a best choice. 5 | #' @param counts Counts matrix, cells * genes. 6 | #' @param neg Vector of mean negprobe counts per cell 7 | #' @param assay_type Assay type, either "rna" or "protein", matched case-insensitively (default = "rna") 8 | #' @param bg Expected background 9 | #' @param fixed_profiles Matrix of cluster profiles to hold unchanged throughout iterations. 10 | #' @param fixed_sds Matrix of SDs expression of genes x cell types, to hold unchanged throughout iterations. Only for assay_type of protein 11 | #' @param cohort Vector of cells' cohort assignments. 12 | #' @param init_clust Vector of initial cluster assignments. 13 | #' @param n_clusts Vector giving a range of cluster numbers to consider. 14 | #' @param max_iters Number of iterations in each clustering attempt. Recommended to choose 15 | #' a smaller number for a quicker, approximate clustering. 16 | #' @param subset_size Number of cells to include in clustering. 17 | #' @param align_genes Logical, for whether to align the genes in fixed_profiles with the colnames in counts 18 | #' @param plotresults Logical, for whether to plot the results. 19 | #' @param nb_size The size parameter to assume for the NB distribution. 20 | #' @param pct_drop the decrease in percentage of cell types with a valid switchover to 21 | #' another cell type compared to the last iteration. Default value: 0.005. 
#' A valid
#' switchover is only applicable when a cell has changed the assigned cell type with its
#' highest cell type probability increased by min_prob_increase.
#' @param min_prob_increase the threshold of probability used to determine a valid cell
#'  type switchover
#' @param ... Additional arguments. NOTE(review): these are accepted but are not
#'  currently forwarded to nbclust(); confirm whether forwarding is intended.
#' @export
#'
#' @importFrom graphics plot
#' @importFrom graphics lines
#' @importFrom graphics par
#' @importFrom stats lm
#'
#' @return A list, with the following elements:
#' \itemize{
#' \item best_clust_number: the element of n_clusts with the lowest AIC
#' \item n_clusts: the cluster numbers that were evaluated
#' \item loglik: total log-likelihood obtained under each cluster number
#' \item aic: AIC under each cluster number
#' \item bic: BIC under each cluster number
#' }
#' @examples
#' data("mini_nsclc")
#' chooseClusterNumber(mini_nsclc$counts, Matrix::rowMeans(mini_nsclc$neg), assay_type="RNA",
#'  n_clusts = 2:5)
chooseClusterNumber <-
  function(counts,
           neg,
           assay_type = c("rna", "protein"),
           bg = NULL,
           fixed_profiles = NULL,
           fixed_sds = NULL,
           cohort = NULL,
           init_clust = NULL,
           n_clusts = 2:12,
           max_iters = 10,
           subset_size = 1000,
           align_genes = TRUE,
           plotresults = FALSE,
           nb_size = 10,
           pct_drop = 0.005,
           min_prob_increase = 0.05,
           ...) {
    assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))

    # validate the requested cluster numbers before doing any work:
    if (length(n_clusts) == 0) {
      stop("n_clusts needs to contain at least one value.")
    } else if (!is.numeric(n_clusts) || anyNA(n_clusts) ||
               any(n_clusts <= 0) || any(n_clusts != round(n_clusts))) {
      stop("n_clusts need to be a vector of positive integers.")
    }

    # infer bg if not provided: assume background is proportional to the scaling factor s
    s <- rowSums(counts)
    if (is.null(bg)) {
      bgmod <- stats::lm(neg ~ s - 1)
      bg <- bgmod$fitted.values
    }
    # a single background value applies to every cell; expand it so that the
    # per-cell subsetting below does not produce NAs:
    if (length(bg) == 1) {
      bg <- rep(bg, nrow(counts))
    }

    # subset the data (never ask for more cells than are available):
    n_cells <- nrow(counts)
    use <- sample(seq_len(n_cells), min(subset_size, n_cells))
    counts <- counts[use, , drop = FALSE]
    neg <- neg[use]
    bg <- bg[use]
    # every per-cell vector must follow the same subset, otherwise it no longer
    # lines up with the rows of counts:
    if (!is.null(cohort)) {
      cohort <- cohort[use]
    }
    # NOTE: init_clust is accepted for interface compatibility but is not used;
    # each run below starts from its own evenly sized initial clusters.

    # align genes in fixed_profiles:
    if (align_genes && !is.null(fixed_profiles)) {
      sharedgenes <- intersect(rownames(fixed_profiles), colnames(counts))
      counts <- counts[, sharedgenes, drop = FALSE]
      fixed_profiles <- fixed_profiles[sharedgenes, , drop = FALSE]
      if (!is.null(fixed_sds)) {
        fixed_sds <- fixed_sds[sharedgenes, , drop = FALSE]
      }
    }

    # cluster under each value of n_clusts, and save loglik:
    totallogliks <- vapply(n_clusts, function(x) {

      # initial clusters: x equally sized consecutive groups. Letters are used
      # as names while they last; beyond 26 clusters fall back to numbered names.
      clust_names <- if (x <= length(letters)) {
        letters[seq_len(x)]
      } else {
        paste0("clust", seq_len(x))
      }
      tempinit <- rep(clust_names, each = ceiling(nrow(counts) / x))[
        seq_len(nrow(counts))]

      # run nbclust:
      message(sprintf("Clustering with n_clust = %s", x))
      tempclust <- nbclust(
        counts = counts,
        neg = neg,
        bg = bg,
        fixed_profiles = fixed_profiles,
        fixed_sds = fixed_sds,
        cohort = cohort,
        init_clust = tempinit,
        nb_size = nb_size,
        assay_type = assay_type,
        pct_drop = pct_drop,
        min_prob_increase = min_prob_increase,
        max_iters = max_iters)

      # get the loglik of the clustering result:
      loglik_thisclust <- lldist(x = tempclust$profiles,
                                 mat = counts,
                                 xsd = tempclust$sds,
                                 bg = bg,
                                 size = nb_size,
                                 assay_type = assay_type)

      # each cell contributes the log-likelihood of its best cluster:
      sum(apply(loglik_thisclust, 1, max))
    }, numeric(1))

    # report goodness-of-fit
    n_parameters <- n_clusts * ncol(counts)
    aic <- n_parameters * 2 - 2 * totallogliks
    bic <- n_parameters * log(nrow(counts)) - 2 * totallogliks

    best_clust_number <- n_clusts[order(aic)[1]]

    if (plotresults) {
      # par() returns the previous settings; restore them even if plotting fails
      original_par <- graphics::par(mfrow = c(2, 1))
      on.exit(graphics::par(original_par), add = TRUE)
      graphics::plot(n_clusts, totallogliks, xlab = "Number of clusters", ylab = "Log-likelihood")
      graphics::lines(n_clusts, totallogliks)
      graphics::plot(n_clusts, aic, xlab = "Number of clusters", ylab = "AIC")
      graphics::lines(n_clusts, aic)
    }

    out <- list(best_clust_number = best_clust_number,
                n_clusts = n_clusts,
                loglik = totallogliks,
                aic = aic,
                bic = bic)
    return(out)
  }

# --------------------------------------------------------------------------------
# /R/colorCellTypes.R:
# --------------------------------------------------------------------------------
#' Function to choose colors for cell types
#'
#' Uses Giotto::getDistinctColors to begin with. Orders colors so the most
#' common cell types get the lightest colors. Removes colors that are too light
#' (sum of rgb values > 600)
#' @param names Vector of cell type names
#' @param freqs Optional, named vector of cell type abundance (e.g. c(T = 1000,
#'   tumor = 15000...))
#' @param init_colors Optional, a named vector of cell colors. This will be used
#'   for all cell types in the "names" vector that match names(init_colors).
#'   Intended for use with the iocolors vector (found in the Ptolemy package
#'   data).
#' @param max_sum_rgb Don't return any colors with total rgb values above this
#'   level. (Removes excessively light colors.)
#' @param palette One of "tableau20", "brewers" or "earthplus".
#' @return A named color vector
#' @importFrom grDevices col2rgb colors
#' @export
#' @examples
#' data("mini_nsclc")
#' unsup <- insitutype(
#'  x = mini_nsclc$counts,
#'  neg = Matrix::rowMeans(mini_nsclc$neg),
#'  n_clusts = 8,
#'  n_phase1 = 200,
#'  n_phase2 = 500,
#'  n_phase3 = 2000,
#'  n_starts = 1,
#'  max_iters = 5,
#'  assay_type="RNA"
#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
#' colorCellTypes(freqs = table(unsup$clust), palette = "brewers")
colorCellTypes <- function(names = NULL, freqs = NULL, init_colors = NULL, max_sum_rgb = 600,
                           palette = "earthplus") {

  if (is.null(freqs) && is.null(names)) {
    stop("must specify either names or freqs")
  }

  # fail early, with a readable message, on an unsupported palette:
  palette <- match.arg(palette, c("earthplus", "brewers", "tableau20"))

  if (is.null(freqs) && palette == "earthplus") {
    warning("this palette is best used when cell frequencies are known.")
  }

  if (is.null(freqs)) {
    # format names into freqs, then work with freqs henceforth
    freqs <- rep(1, length(names))
    names(freqs) <- names
  }

  if (palette == "brewers") {
    ### "brewers" version: increasingly bright Rcolorbrewer palettes:
    # start with R colorbrewer palettes, then add a ton of filler colors:
    cols <- c('#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462','#B3DE69','#FCCDE5',
              '#D9D9D9','#BC80BD','#CCEBC5','#FFED6F','#66C2A5','#FC8D62','#8DA0CB','#E78AC3',
              '#A6D854','#FFD92F','#E5C494','#B3B3B3','#E41A1C','#377EB8','#4DAF4A','#984EA3',
              '#FF7F00','#FFFF33','#A65628','#F781BF','#999999','firebrick','darkorange2','tan3',
              'magenta','wheat4','palevioletred2','dodgerblue4','tomato3','mediumspringgreen',
              'grey26','antiquewhite4','red1','blue2','olivedrab4','lightyellow1','rosybrown3',
              'lightsteelblue4','rosybrown','rosybrown2','snow1','pink4','ghostwhite','ivory4',
              'lightgoldenrod','royalblue1','deeppink1','white','violetred2','hotpink2',
              'lightblue3','chartreuse4','azure2','plum','springgreen2','lemonchiffon1',
              'goldenrod2','grey6','darkorchid','palevioletred4','green4','lightsalmon1',
              'saddlebrown','rosybrown1','antiquewhite1','whitesmoke','plum4','cyan2',
              'forestgreen','burlywood3','lightyellow4','firebrick1','khaki3','salmon3',
              'sienna2','coral1','tan1','mediumvioletred','springgreen1','lemonchiffon',
              'lightgoldenrod4','darkred','navajowhite1','lightcoral','mediumturquoise',
              'lavenderblush','mistyrose1','indianred2','darkgoldenrod4','lightgoldenrod1',
              'lightsalmon3','lavender','magenta4','tomato2','seashell3','purple','tan2',
              'palevioletred3','coral3','lightblue1','darkorange4','orange1','darkolivegreen',
              'maroon1','skyblue3','cadetblue2','mediumorchid3','gold3','violetred1',
              'ivory2','snow4','aquamarine','darkgrey','darkolivegreen3','turquoise4',
              'sienna4','springgreen4','peachpuff4','seashell','violet','turquoise',
              'bisque2','lightsteelblue2','honeydew','lightsteelblue3','lawngreen',
              'tomato4','lightsalmon4','chocolate2','black','lightpink4','deepskyblue4',
              'aquamarine3','dodgerblue1','salmon1','yellow3','wheat','skyblue4','navajowhite4',
              'purple2','lavenderblush1','darkorange1','khaki2','aquamarine1','honeydew2',
              'cornsilk','lightskyblue4','mediumpurple2','paleturquoise1','seashell1',
              'darkcyan','orchid','royalblue','darkseagreen2','seagreen4','darkmagenta',
              'lightblue','mediumblue','chocolate3','yellow','darkgoldenrod2','mediumorchid4',
              'palegreen2','olivedrab','darkslateblue','chocolate1','maroon2','grey36',
              'orangered','goldenrod1','bisque3','deeppink3','peachpuff3','darkgreen',
              'royalblue4','darkgoldenrod1','blanchedalmond','mistyrose4','turquoise2',
              'ivory3','orchid1','limegreen','mediumpurple1','darkorange3','lemonchiffon4',
              'palevioletred1','magenta2','blue4','cyan1','thistle4','peru','grey56','cornsilk4',
              'mediumorchid2','green2','lightblue4','salmon4','burlywood4','burlywood1','orange',
              'burlywood','purple4','plum1','violetred3','khaki4','lightgoldenrodyellow',
              'lavenderblush3','lightpink3','azure4','orangered4','yellow2','mistyrose2',
              'deepskyblue2','mediumaquamarine','slateblue1','orange2','coral2','darkorchid4',
              'lightsalmon','gold2','darkseagreen')
    cols <- cols[!duplicated(cols)]

    # remove colors that are too light:
    sum_rgb <- colSums(grDevices::col2rgb(cols))
    too_light <- sum_rgb >= max_sum_rgb
    cols <- cols[!too_light]
    # add more colors if needed:
    n_removed <- sum(too_light)
    if (n_removed > 0) {
      newcols <- sample(grDevices::colors()[!grepl("grey", grDevices::colors())],
                        length(freqs) * 2)[length(freqs) + seq_len(length(freqs))]
      newcols <- newcols[colSums(grDevices::col2rgb(newcols)) < max_sum_rgb]
      cols <- c(cols, newcols[seq_len(n_removed)])
    }

    # order so the most common cells have lighter colors:
    cols <- cols[seq_along(freqs)]
    names(cols) <- names(freqs)[order(freqs, decreasing = TRUE)]

  } else if (palette == "tableau20") {
    ### "tableau20" palette: start with the tableau20 colors:
    tab20 <- c('#aec7e8','#ffbb78','#98df8a','#ff9896','#c5b0d5','#c7c7c7',
               '#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b',
               '#e377c2','#17becf','#7f7f7f',
               '#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462',
               '#B3DE69','#FCCDE5','#D9D9D9','#BC80BD','#CCEBC5','#FFED6F',
               sample(grDevices::colors()[!grepl("grey", grDevices::colors())], 200, replace = FALSE))
    cols <- tab20[seq_along(freqs)]
    names(cols) <- names(freqs)[order(freqs)]

  } else {
    ### "earthplus" palette: earthtones for common cells, radiant colors for rare cells.
    # The 1% / 10% cutoffs below are fractions of all cells, so abundances given
    # as counts (e.g. a table of cluster calls) are converted to proportions first.
    total <- sum(freqs)
    props <- if (is.finite(total) && total > 0) freqs / total else freqs

    # step 1: top least common cells, as long as <1% freq, get "radiant" colors:
    radiantcolors <-
      c(
        "#FF0000",
        "#00CCFF",
        "#00FF00",
        "#FFFF00",
        "#FF00CC",
        "#00FFFF",
        "#FF3300",
        "#CC00FF",
        "#CCFF00",
        "#66FF33"
      )
    richcolors <- c("#660099", "#006600", "#000000", "#000066")
    nlow <- min(sum(props < 0.01, na.rm = TRUE), 14)
    lowcols <- c(radiantcolors, richcolors)[seq_len(nlow)]

    # step 2: most common cells, as long as >10% freq, get "earth" colors:
    earthtones <- c('#D9AF6B','#AF6458','#526A83','#68855C','#9C9C5E','#855C75')
    nhigh <- min(sum(props > 0.1, na.rm = TRUE), length(earthtones))
    highcols <- earthtones[seq_len(nhigh)]

    # step 3: remainder get mid-range colors
    moderatecolors <- c('#1D6996','#73AF48','#E17C05','#94346E','#EDAD08','#38A6A5',
                        '#CC503E','#0F8554','#5F4690',
                        '#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462',
                        '#B3DE69','#FCCDE5','#D9D9D9','#BC80BD','#CCEBC5','#FFED6F',
                        sample(grDevices::colors()[!grepl("grey", grDevices::colors())], 200, replace = FALSE))
    nmid <- length(freqs) - nlow - nhigh
    if (nmid < length(moderatecolors)) {
      midcols <- moderatecolors[seq_len(nmid)]
    } else {
      stop("too many cell types")
    }
    # colors run from rarest to most common, matching the ascending order of freqs:
    cols <- c(lowcols, midcols, highcols)
    names(cols) <- names(freqs)[order(freqs)]
  }

  # if init_colors are provided, use them when possible:
  if (!is.null(init_colors)) {
    overlap <- intersect(names(cols), names(init_colors))
    cols[overlap] <- init_colors[overlap]
  }

  return(cols)
}

# --------------------------------------------------------------------------------
# /R/data.R:
# --------------------------------------------------------------------------------

#' Small example SMI data from a NSCLC tumor
#'
#' A 2000-cell excerpt from a 1000-plex SMI study of a NSCLC tumor.
#'
#' @format A list with the following elements:
#'  \itemize{
#'  \item counts A matrix of raw counts, with cells in rows and genes in columns
#'  \item neg A matrix of negprobe counts, with cells in rows and negprobes in columns
#'  \item x x positions
#'  \item y y positions
#'  \item umap umap projection
#'  }
"mini_nsclc"



#' Matrix of immune cell profiles
#'
#' A matrix of gene * cell type expected expression values
#'
#' @format A matrix of 27161 genes x 16 cell types.
"ioprofiles"

#' Default colors for the cell types in the ioprofiles matrix
#'
#' A named vector of colors, giving colors for the cell types of the ioprofiles
#' matrix.
#'
#' @format A named vector
"iocolors"


#' Small example SMI protein data from a tonsil tissue
#'
#' A 21844-cell excerpt from a 68-plex SMI study of a tonsil tissue.
#'
#' @format A list with the following elements:
#'  \itemize{
#'  \item counts A matrix of raw counts, with cells in rows and proteins in columns
#'  \item negs A matrix of IgG counts, with cells in rows and IgGs in columns
#'  \item xy_coord x and y positions
#'  \item UMAP umap projection
#'  }
"tonsil_protein"



#' Reference profile examples from a tonsil tissue
#'
#' @format A list with the following elements:
#'  \itemize{
#'  \item tonsil_reference_profile A matrix of raw counts, with cells in rows and proteins in columns
#'  \item counts A matrix of IgG counts, with cells in rows and IgGs in columns
#'  \item xy_coord x and y positions
#'  \item UMAP umap projection
#'  }
"tonsil_reference_profile"


#' Matrix of anchor cells' annotation file
#'
#' A matrix including cell_ID and cellType for anchor cells
#'
#' matrix.
#'
#' @format A matrix of 11844 cells and 2 columns
"tonsil_annotation"


#' Example human marker proteins
#'
#' For input into \code{gen_profiles_protein_expression()}
#'
#' data frame
#'
#' @format A matrix of 11844 cells and 2 columns
"human_signature"


#' Example mouse marker proteins
#'
#' For input into \code{gen_profiles_protein_expression()}
#'
#' data frame
#'
#' @format A matrix of 11844 cells and 2 columns
"mouse_signature"

# --------------------------------------------------------------------------------
# /R/fastCohorting.R:
# --------------------------------------------------------------------------------
#' Quickly split cells into cohorts
#'
#' Quickly split cells into cohorts using non-RNA data like spatial context and immunofluorescence values.
#' Rule of thumb: include any variables that might be informative for cell typing,
#' *except* variables you'll want to analyze later. For example, if you'll later
#' perform differential expression as a function of spatial context, then it's
#' safer to exclude spatial context from the cell typing exercise (and therefore
#' from this function).
#' @param mat Matrix of variables to be used in cohorting, cells in rows, and variables in columns.
#'  Recommended to use < 20 variables.
#' @param n_cohorts Number of clusters to divide cells into
#' @param gaussian_transform Whether to map each variable onto the quantiles of a normal distribution.
#' @return A vector of cohort assignments.
#' @export
#' @importFrom mclust Mclust
#' @importFrom mclust predict.Mclust
#' @importFrom mclust mclustBIC
#' @importFrom stats qnorm
#' @examples
#' data("mini_nsclc")
#' ## simulate immunofluorescence data:
#' immunofluordata <- matrix(rpois(n = nrow(mini_nsclc$counts) * 4, lambda = 100),
#'                           nrow(mini_nsclc$counts))
#' cohort <- fastCohorting(immunofluordata, gaussian_transform = TRUE)
#' table(cohort)
fastCohorting <- function(mat, n_cohorts = NULL, gaussian_transform = TRUE) {

  if (any(is.na(mat))) {
    stop("NA's detected in mat. fastCohorting needs complete data.")
  }

  # gaussian transform if called for:
  if (gaussian_transform) {
    for (i in seq_len(ncol(mat))) {
      mat[, i] <- stats::qnorm(rank(mat[, i]) / (nrow(mat) + 1))
    }
  }

  # choose number of cohorts:
  if (is.null(n_cohorts)) {
    n_cohorts <- 3
    if (nrow(mat) > 10000) n_cohorts <- 10
    if (nrow(mat) > 50000) n_cohorts <- 25
    if (nrow(mat) > 100000) n_cohorts <- 50
    if (nrow(mat) > 200000) n_cohorts <- 100
  }

  # cluster in a subsample:
  sub <- sample(seq_len(nrow(mat)), min(20000, nrow(mat)))

  # Both the first attempt and the fallback hand their classification back as
  # the value of tryCatch(); nothing is assigned outside this function.
  cohort <- tryCatch({
    mc <- mclust::Mclust(data = mat[sub, ], G = n_cohorts, modelNames = "EEE")
    # a NULL fit is treated as a failure so that the fallback below is triggered
    if (is.null(mc)) stop("Cohorting failed with ", n_cohorts, " groups. Results in NULL mclust::Mclust object.")
    # classify all cells:
    mclust::predict.Mclust(object = mc, newdata = mat)$classification
  }, error = function(e) {
    message("First attempt at autocohorting failed, possibly due to high collinearity of biomarkers. User should consider manually cohorting.")
    message("Automatically attempting to cohort in 2-PC space:")
    message(paste0("Projecting data to a lower dimensional 2-PC space for cohorting."))
    message(paste0("Error in cohorting with ", n_cohorts, " groups."))
    message("Underlying error: ", conditionMessage(e))

    ### project to 2-d pca space
    pc2 <- irlba::prcomp_irlba(mat, n = min(ncol(mat), 2))
    # candidate cohort numbers, largest first, never exceeding the requested number:
    n_cohorts_try <- rev(c(2, 3, 10, 25, 50, 100))
    n_cohorts_try <- n_cohorts_try[n_cohorts_try <= n_cohorts]

    mc <- NULL
    for (ii in seq_along(n_cohorts_try)) {
      mc <- tryCatch({
        # cluster in a subsample:
        fit <- mclust::Mclust(data = pc2$x[sub, ], G = n_cohorts_try[ii], modelNames = "EEE")
        if (is.null(fit)) stop("Cohorting PC's with ", n_cohorts_try[ii], " groups results in NULL mclust::Mclust object.")
        fit
      }, error = function(e2) {
        if (ii == length(n_cohorts_try)) stop("All attempts at cohorting have failed. Please take a look at biomarkers used for cohorting to diagnose potential issues.")
        message(paste0("Error in cohorting with ", n_cohorts_try[ii], " groups."))
        message(paste0("Retrying with ", n_cohorts_try[ii + 1], " groups."))
        NULL
      })
      if (!is.null(mc)) break
    }
    # also reached when there was no candidate cohort number to try at all:
    if (is.null(mc)) {
      stop("All attempts at cohorting have failed. Please take a look at biomarkers used for cohorting to diagnose potential issues.")
    }
    # classify all cells:
    mclust::predict.Mclust(object = mc, newdata = pc2$x)$classification
  })

  names(cohort) <- rownames(mat)
  return(cohort)
}

# --------------------------------------------------------------------------------
# /R/flightpath_layout.R:
# --------------------------------------------------------------------------------


#' "Flightpath" (umap-like) plot of clustering results
#'
#' Arrays cells in 2d space based on their probability of belonging to a given
#' cluster.
#' @param logliks Matrix of cells' log-likelihoods under each cluster. Must
#'   provide this or probs argument.
#' @param probs Matrix of cells' probabilities of belonging to each cluster.
#'   Must provide this or logliks argument.
#' @param profiles Matrix of cell type mean expression profiles. If provided,
#'   profiles rather than probs will be used to lay out the centroids. When its
#'   column names include every column name of probs, its columns are put in
#'   the same order as probs before the layout is computed.
#' @param cluster_xpos Vector of cluster centroids' x positions (i.e. where you
#'   want each cell type to appear in the plot)
#' @param cluster_ypos Vector of cluster centroids' y positions
#' @return A list with four elements: \enumerate{ \item clustpos: a matrix of
#'   cluster centroids * x,y positions in the flightpath plot \item cellpos: A
#'   matrix of cells * x,y positions in the flightpath plot \item clust: each
#'   cell's most probable cluster \item meanconfidence: the mean confidence of
#'   each cluster, as returned by getMeanClusterConfidence }
#' @importFrom umap umap
#' @importFrom stats rnorm
#' @export
#' @examples
#' data("mini_nsclc")
#' unsup <- insitutype(
#'  x = mini_nsclc$counts,
#'  neg = Matrix::rowMeans(mini_nsclc$neg),
#'  assay_type = "RNA",
#'  n_clusts = 8,
#'  n_phase1 = 200,
#'  n_phase2 = 500,
#'  n_phase3 = 2000,
#'  n_starts = 1,
#'  max_iters = 5
#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
#' flightpath_layout(logliks = unsup$logliks, profiles = unsup$profiles)
flightpath_layout <- function(logliks = NULL, probs = NULL, profiles = NULL, cluster_xpos = NULL, cluster_ypos = NULL) {

  if (is.null(probs) && is.null(logliks)) {
    stop("Must provide either probs or logliks.")
  }
  if (is.null(probs)) {
    probs <- logliks2probs(logliks)
  }
  # force NA probs to 0:
  probs <- replace(probs, is.na(probs), 0)

  # get cluster centroid positions if not pre-specified:
  if (is.null(cluster_xpos) || is.null(cluster_ypos)) {
    # controls for a umap-based layout:
    conf <- umap::umap.defaults
    conf$min_dist <- 3
    conf$spread <- conf$min_dist * 1.1
    conf$n_neighbors <- ncol(probs)
    if (!is.null(profiles)) {
      # The centroids are combined with probs column by column further down, so
      # the profiles must list the clusters in the same order as probs does.
      clustnames <- colnames(probs)
      if (!is.null(clustnames) && !is.null(colnames(profiles)) &&
          all(clustnames %in% colnames(profiles))) {
        profiles <- profiles[, clustnames, drop = FALSE]
      }
      clustum <- umap::umap(t(sqrt(profiles)), config = conf)$layout
    } else {
      clustum <- umap::umap(t(probs), config = conf)$layout
    }

    cluster_xpos <- clustum[, 1]
    cluster_ypos <- clustum[, 2]
  }

  if (length(cluster_xpos) != ncol(probs) || length(cluster_ypos) != ncol(probs)) {
    stop("The number of cluster centroid positions must equal the number of clusters (columns of probs).")
  }

  # get cell xy positions as a weighted average of the umap positions
  ux <- probs %*% cluster_xpos
  uy <- probs %*% cluster_ypos

  # jitter the xy positions, jittering widely for prob = 1 cells and minimally for prob < 0.5 cells:
  jitterrange <- 0.01 * c(0.0005, 0.9) * max(diff(range(ux)), diff(range(uy)))
  jitteramount <- jitterrange[1] + pmax((2 * apply(probs, 1, max) - 1), 0) * jitterrange[2]
  ux <- ux + stats::rnorm(length(ux), mean = 0, sd = jitteramount)
  uy <- uy + stats::rnorm(length(uy), mean = 0, sd = jitteramount)

  out <- list(clustpos = cbind(cluster_xpos, cluster_ypos),
              cellpos = cbind(ux, uy),
              clust = colnames(probs)[apply(probs, 1, which.max)])
  colnames(out$clustpos) <- c("x", "y")
  colnames(out$cellpos) <- c("x", "y")

  # get clusters' mean confidence:
  out$meanconfidence <- getMeanClusterConfidence(probs)
  return(out)
}




#'Plot
#' flightpath results
#'
#' Besides returning the plot, this function saves it as a PNG file in the
#' folder "./NBClust-Plots" (created if needed) under the working directory.
#'
#'@param flightpath_result The list output by the flightpath_layout function.
#'  The elements clustpos, cellpos, clust and meanconfidence are used. Must
#'  provide either this or insitutype_result.
#'@param insitutype_result The list output by insitutype or insitutypeML. Must
#'  provide either this or flightpath_result.
#'@param col Optional, a vector of cell colors, with length equal to the number
#'  of individual cells.
#'@param showclusterconfidence Logical, for whether to label clusters with the
#'  average posterior probability of the cells within them. Gives a readout of
#'  how distinct a cluster is from the others.
#'@importFrom utils data
#'@importFrom scales alpha
#'@import ggplot2
#'@importFrom grDevices colors
#'@importFrom rlang .data
#'@return a ggplot object
#'
#'@export
#'@examples
#' data("mini_nsclc")
#' unsup <- insitutype(
#'  x = mini_nsclc$counts,
#'  neg = Matrix::rowMeans(mini_nsclc$neg),
#'  n_clusts = 8,
#'  n_phase1 = 200,
#'  n_phase2 = 500,
#'  n_phase3 = 2000,
#'  n_starts = 1,
#'  max_iters = 5,
#'  assay_type="RNA"
#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
#' flightpath_plot(insitutype_result = unsup)
flightpath_plot <- function(flightpath_result = NULL, insitutype_result = NULL, col = NULL, showclusterconfidence = TRUE){

  # get the flightpath results to use
  if (!is.null(flightpath_result) && !is.null(insitutype_result)) {
    warning("flightpath_result and insitutype_result were both provided. Using only flightpath_result.")
    insitutype_result <- NULL
  }
  if (is.null(flightpath_result) && is.null(insitutype_result)) {
    stop("Must provide either flightpath_result or insitutype_result.")
  }
  if (is.null(flightpath_result)) {
    flightpath_result <- flightpath_layout(logliks = insitutype_result$logliks, profiles = insitutype_result$profiles)
  }

  # create color scheme if needed:
  if (is.null(col)) {
    utils::data("iocolors", package = "InSituType", envir = environment())
    scols <- c('#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462','#B3DE69','#FCCDE5','#D9D9D9','#BC80BD',
               '#CCEBC5','#FFED6F','#E41A1C','#377EB8','#4DAF4A','#984EA3','#FF7F00','#FFFF33','#A65628','#F781BF','#999999',
               sample(grDevices::colors()[!grepl("grey", grDevices::colors())], 100))[seq_along(unique(flightpath_result$clust))]
    names(scols) <- unique(flightpath_result$clust)
    # cell types that have a default color in iocolors keep that color:
    iotypespresent <- intersect(names(environment()[['iocolors']]), names(scols))
    scols[iotypespresent] <- environment()[['iocolors']][iotypespresent]
    col <- scols[flightpath_result$clust]
  }

  # prep data for plotting:
  df <-
    data.frame(
      x = flightpath_result$cellpos[, 1],
      y = flightpath_result$cellpos[, 2],
      col = scales::alpha(col, 0.7)
    )
  # cluster labels: centroid rownames when present, otherwise the names carried
  # by the mean-confidence vector, so that labels are never silently dropped
  clustlabels <- rownames(flightpath_result$clustpos)
  if (is.null(clustlabels)) {
    clustlabels <- names(flightpath_result$meanconfidence)
  }
  if (is.null(clustlabels)) {
    clustlabels <- as.character(seq_len(nrow(flightpath_result$clustpos)))
  }
  df_text <- data.frame(x = flightpath_result$clustpos[, 1],
                        y = flightpath_result$clustpos[, 2],
                        group = clustlabels,
                        col = "black")

  if (showclusterconfidence) {
    # mean confidences at or below confthresh get the first color; values
    # between confthresh and 1 are spread linearly over the ten colors
    confthresh <- 0.8
    confidencecolors <- c('#FEB24C','#FD9D43','#FC863A','#FC6330','#F64226',
                          '#E8251F','#D2111F','#B60224','#620015','#000000')
    df_text$col <- confidencecolors[
      1 + round(9 * (pmax(flightpath_result$meanconfidence, confthresh) - confthresh) / (1 - confthresh))]

    df_text$group <- paste0(df_text$group, "(", round(flightpath_result$meanconfidence, 2), ")")
  }
  p <- ggplot2::ggplot() +
    ggplot2::geom_point(data = df,
                        mapping = ggplot2::aes(x = .data$x,
                                               y = .data$y,
                                               color = I(.data$col),
                                               size = I(0.1))) +
    ggplot2::scale_color_identity() +
    ggplot2::geom_text(data = df_text,
                       mapping = ggplot2::aes(x = .data$x, y = .data$y, label = .data$group, col = I(.data$col)),
                       size = 3) +
    ggplot2::xlab("") +
    ggplot2::ylab("") +
    ggplot2::theme_bw() +
    ggplot2::theme(legend.position = "none",
                   panel.grid = ggplot2::element_blank(),
                   axis.text = ggplot2::element_blank())

  # NOTE(review): every call writes a PNG below the working directory; consider
  # making this optional or defaulting to tempdir().
  flightpath_plot_folder <- "./NBClust-Plots" # tempdir()
  if (!dir.exists(flightpath_plot_folder)) dir.create(flightpath_plot_folder, showWarnings = FALSE, recursive = TRUE)
  flightpath_plot_filename <- paste(format(Sys.time(), "%Y-%m-%d_%H-%M-%S-%Z"), "flightpath_plot.png", sep = "-")
  flightpath_plot_file <- paste(flightpath_plot_folder, flightpath_plot_filename, sep = "/")
  message("Saving flightpath_plot to: ", flightpath_plot_file)
  ggplot2::ggsave(filename = flightpath_plot_filename, plot = p, device = "png", path = flightpath_plot_folder,
                  width = 7,
                  height = 7,
                  units = "in")

  return(p)
}


#' Summarize clusters' mean confidence
#'
#' Calculate the mean confidence of the cell calls from each cluster. A cell
#' belongs to a cluster when that cluster's probability equals the cell's
#' highest probability; missing probabilities are ignored.
#' @param probs Matrix of probabilities, cells in rows and clusters in (named) columns
#' @return a named vector of mean confidences, with values of 1 corresponding to
#'  clusters with only prob == 1, and NaN for clusters that no cell is assigned to
#' @examples
#' data("mini_nsclc")
#' probs <- sapply(rownames(mini_nsclc$counts), function(x) {a = runif(10); a/sum(a)})
#' dimnames(probs)[[1]] <- letters[1:10]
#' probs <- t(probs)
#' getMeanClusterConfidence(probs)
getMeanClusterConfidence <- function(probs) {

  maxprobs <- apply(probs, 1, max, na.rm = TRUE)
  meanconfidence <- vapply(colnames(probs), function(name) {
    # which() drops cells whose probability for this cluster is NA; a plain
    # logical index would carry the NA into the mean
    thisclust <- which(probs[, name] == maxprobs)
    mean(probs[thisclust, name])
  }, numeric(1))

  return(meanconfidence)
}

# --------------------------------------------------------------------------------
# /R/gen_profiles_protein.R:
# --------------------------------------------------------------------------------
#' Generate the mean reference profile and its SD reference profile based on the data itself
#' This function is based on signature matrix included in CELESTA package
#' First, we rebuild a nested cell typing lists based on the 2-D signature matrix
#' Second, we identify anchor cells ranked by their expression level for each cell type's protein marker
#' Third, we estimate averaged expression level and SDs for proteins and cell types using the anchors
#'
#' @param exp.mat a matrix of raw protein expression data. cells are in rows and proteins are in columns
#' @param sig_mat a signature matrix of cell types. cell types x protein markers
#' @param cutoff a cutoff of quantile. e.g) cutoff=0.9 means that top 90 percentiles of cells are called anchors for the protein expression
#' @param min.num.cells a minimum number of cells each cell type to estimate its mean or SDs. default value is 30.
#' @param keep_marker_proteins whether just marker proteins from the signature matrix is kept. default value is FALSE, which returns all proteins included in the data
#'
#' @importFrom magrittr %>%
#' @importFrom tibble rownames_to_column column_to_rownames
#' @importFrom dplyr summarise_all group_by filter
#' @return A list, with the following elements:
#' \enumerate{
#' \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins x cell types
#' \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins x cell types
#' \item anchors: a vector giving "anchor" cell types.
#' Vector elements will be mainly NA's (for non-anchored cells)
#' }
#' @name gen_profiles_protein_expression
#' @examples
#' data("tonsil_protein")
#' data("human_signature")
#' data("mouse_signature")
#' references <- gen_profiles_protein_expression(
#'   exp.mat=tonsil_protein$counts,
#'   sig_mat=NULL)
gen_profiles_protein_expression <- function(exp.mat, sig_mat=NULL, cutoff=0.9, min.num.cells=30, keep_marker_proteins=FALSE){
  # Overview:
  #  1. rebuild the nested lineage (level1 / level2 / level3) from `Lineage_level`
  #  2. pick anchor cells per cell type: cells whose marker protein exceeds the
  #     `cutoff` quantile (or `cutoff` itself when the data are scaled to [0, 1])
  #  3. average the anchors to obtain mean / SD reference profiles
  #
  # NOTE(review): `min.num.cells` is accepted but not consulted; the retention
  # rule below is still the historical "more than 20 unique anchors". Wiring the
  # argument in would change default results, so it is left for a maintainer.

  if (is.null(sig_mat)) {
    ## default to the human signature matrix
    sig_mat <- InSituType::human_signature

    ## If none of the human markers are present in the data, assume a mouse panel
    if (length(intersect(colnames(sig_mat), colnames(exp.mat))) == 0) {
      sig_mat <- InSituType::mouse_signature
    }
  }

  markerProteins <- intersect(colnames(sig_mat), colnames(exp.mat))

  ## Split Lineage levels into columns
  sig_mat[is.na(sig_mat)] <- 0
  lineage_parts <- strsplit(sig_mat$Lineage_level, "_")
  sig_mat$level1 <- unlist(lapply(lineage_parts, function(x) x[1]))
  sig_mat$level2 <- unlist(lapply(lineage_parts, function(x) x[2]))
  sig_mat$level3 <- unlist(lapply(lineage_parts, function(x) x[3]))

  # level1 holds character digits; compare numerically so that "10" > "9"
  n_levels <- max(as.integer(sig_mat$level1))

  ## For every lineage level: cell type, its first marker protein, and its parent
  markerProtein_celltype_level <- vector("list", length = n_levels)
  for (i in seq_len(n_levels)) {
    level_rows <- sig_mat[sig_mat$level1 == i, ]
    # first column flagged with 1 in the signature row is taken as the marker
    first_marker <- apply(level_rows, 1, function(x) {
      colnames(level_rows)[which(x == 1)[1]]
    })

    if (i == 1) {
      markerProtein_celltype_level[[i]] <- data.frame(celltype = level_rows$celltype,
                                                      marker_protein = first_marker,
                                                      upper_celltype = "Parent")
    } else {
      markerProtein_celltype_level[[i]] <- data.frame(celltype = level_rows$celltype,
                                                      marker_protein = first_marker,
                                                      upper_celltype = sig_mat$celltype[which(sig_mat$level3 == unique(level_rows$level2))])
    }
  }

  # Data scaled to [0, 1] use `cutoff` as an absolute threshold; otherwise the
  # threshold is the `cutoff` quantile of the marker among the candidate cells.
  is_unit_scaled <- max(exp.mat) <= 1
  anchor_threshold <- function(values) {
    if (is_unit_scaled) {
      cutoff
    } else {
      stats::quantile(values, probs = cutoff)
    }
  }

  ## Anchor cell IDs per level and cell type
  dat_mat_level <- vector("list", length = n_levels)
  for (i in seq_along(markerProtein_celltype_level)) {
    level_info <- markerProtein_celltype_level[[i]]

    if (i == 1) {
      dat_mat_level[[i]] <- lapply(level_info$marker_protein, function(x) {
        rownames(exp.mat)[which(exp.mat[, x] > anchor_threshold(exp.mat[, x]))]
      })
      names(dat_mat_level[[i]]) <- level_info$celltype
    } else {
      dat_mat_level[[i]] <- vector("list", nrow(level_info))
      names(dat_mat_level[[i]]) <- level_info$celltype

      for (j in seq_along(level_info$celltype)) {
        marker <- level_info[j, ]$marker_protein

        if (!is.na(marker)) {
          ## Identify the upper level's cell type and where it is located in
          ## the signature matrix' lineage level
          for (k in 1:(i - 1)) {
            upper_info <- markerProtein_celltype_level[[k]]
            tempDD <- upper_info[which(upper_info$celltype == level_info$upper_celltype[1]), , drop = FALSE]
            if (nrow(tempDD) == 1) {
              tempMar <- tempDD
              idx_k <- k
            }
          }

          # candidate cells are the current anchors of the parent cell type
          parent_cells <- dat_mat_level[[idx_k]][[tempMar$celltype]]
          tempD <- exp.mat[rownames(exp.mat) %in% parent_cells, , drop = FALSE]
          tempID <- rownames(tempD)[which(tempD[, marker] > anchor_threshold(tempD[, marker]))]

          # cells promoted to the child type stop anchoring the parent type
          dat_mat_level[[idx_k]][[tempMar$celltype]] <- setdiff(parent_cells, tempID)
          dat_mat_level[[i]][[level_info$celltype[j]]] <- tempID
        } else {
          break
        }
      }
    }
  }

  marker_id_cell_type <- do.call(c, dat_mat_level)
  marker_id_cell_type_insitu <- marker_id_cell_type[lengths(marker_id_cell_type) != 0]
  marker_id_cell_type_insitu_df <- lapply(seq_along(marker_id_cell_type_insitu), function(x) {
    data.frame(cell_ID = marker_id_cell_type_insitu[[x]],
               celltype = rep(names(marker_id_cell_type_insitu[x]), length(marker_id_cell_type_insitu[[x]])))
  })
  names(marker_id_cell_type_insitu_df) <- names(marker_id_cell_type_insitu)
  anchors <- as.data.frame(do.call("rbind", marker_id_cell_type_insitu_df))
  # cells claimed by more than one cell type are not used for profile estimation
  anchors_duplicate <- anchors$cell_ID[duplicated(anchors$cell_ID)]

  marker_id_cell_type_unique <- lapply(marker_id_cell_type_insitu_df, function(x) {
    tempV <- setdiff(x$cell_ID, anchors_duplicate)
    if (length(tempV) > 20) {
      names(tempV) <- x[x$cell_ID %in% tempV, ]$celltype
    } else {
      tempV <- NULL
    }
    return(tempV)
  })

  # marker_id_cell_type_unique <- Filter(Negate(is.null), marker_id_cell_type_unique)

  anchors <- anchors[!duplicated(anchors$cell_ID), , drop = FALSE]
  anchors <- anchors[anchors$celltype %in% names(marker_id_cell_type_unique), , drop = FALSE]

  # non-anchor cells get NA; rep() keeps this valid when every cell is an anchor
  non_anchor_ids <- setdiff(rownames(exp.mat), anchors$cell_ID)
  anchors <- rbind(anchors,
                   data.frame(cell_ID = non_anchor_ids,
                              celltype = rep(NA, length(non_anchor_ids))))
  rownames(anchors) <- anchors$cell_ID
  anchors$cell_ID <- NULL
  anchors <- t(anchors)[1, ]

  ############################ Estimate averaged protein expression each cell type with its anchor cells ######################################
  protein_exp_means_list <- lapply(marker_id_cell_type_unique, function(x) {
    colMeans(exp.mat[rownames(exp.mat) %in% x, ])
  })
  mean.ref.profile <- as.data.frame(t(do.call("rbind", protein_exp_means_list)))

  protein_exp_SDs_list <- lapply(marker_id_cell_type_unique, function(x) {
    apply(exp.mat[rownames(exp.mat) %in% x, ], 2, stats::sd)
  })
  names(protein_exp_SDs_list) <- names(marker_id_cell_type_unique)
  SDs.ref.profile <- as.data.frame(t(do.call("rbind", protein_exp_SDs_list)))

  if (keep_marker_proteins) {
    mean.ref.profile <- mean.ref.profile[markerProteins, ]
    SDs.ref.profile <- SDs.ref.profile[markerProteins, ]
  }
  out <- list(mean.ref.profile = mean.ref.profile,
              SDs.ref.profile = SDs.ref.profile,
              anchors = anchors[rownames(exp.mat)])
  return(out)
}


#' Generate the mean reference profile and its SD reference profile from an annotation file
#' This function is only for protein data set with known anchor cells and their cell types
#'
#' @param exp.mat a matrix of raw protein expression data. cells are in rows and proteins are in columns
#' @param anno a data frame or matrix of cell types for anchor cells or manually annotated cell typing information for some cells. Should include cell_ID and cellType at least.
#'
#' @return A list, with the following elements:
#' \enumerate{
#' \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins * cell types
#' \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins * cell types
#' \item anchors: a vector giving "anchor" cell types.
#' Vector elements will be mainly NA's (for non-anchored cells)
#' }
gen_profiles_protein_annotation <- function(exp.mat, anno) {

  # `anno` may be a data frame or a matrix; the dplyr verbs below need a data frame
  anno <- as.data.frame(anno, stringsAsFactors = FALSE)
  anno_labels <- anno %>% dplyr::select(c(cell_ID, cellType))

  # expression of the annotated cells only, with the cell type appended as a column
  expr_df <- exp.mat %>% as.data.frame() %>% rownames_to_column(var = "cell_ID")
  anno_ref_mat <- merge(expr_df, anno_labels, by = "cell_ID") %>%
    column_to_rownames(var = "cell_ID")

  # per-cell-type mean and SD, transposed to proteins x cell types
  mean.ref.profile <- anno_ref_mat %>%
    group_by(cellType) %>%
    summarise_all(mean) %>%
    column_to_rownames(var = "cellType") %>%
    t()
  SDs.ref.profile <- anno_ref_mat %>%
    group_by(cellType) %>%
    summarise_all(sd) %>%
    column_to_rownames(var = "cellType") %>%
    t()

  ## Set NAs for non-anchor cells' cell types
  # rep() keeps the data frame valid (zero rows) when every cell is annotated
  unlabelled_ids <- setdiff(rownames(exp.mat), rownames(anno_ref_mat))
  anchors <- rbind(anno_labels,
                   data.frame(cell_ID = unlabelled_ids,
                              cellType = rep(NA, length(unlabelled_ids))))
  rownames(anchors) <- anchors$cell_ID
  anchors$cell_ID <- NULL
  anchors <- anchors %>% t()
  anchors <- anchors[1, ]

  out <- list(mean.ref.profile = mean.ref.profile,
              SDs.ref.profile = SDs.ref.profile,
              anchors = anchors[rownames(exp.mat)])
  return(out)
}

# --------------------------------------------------------------------------------
# /R/getProfiles.R:
# --------------------------------------------------------------------------------

#' Extract mean background-subtracted profiles of RNA data
#'
#' Given cell assignments and count data, estimate the mean
#' profile of each cluster.
#'
#' @param x Counts matrix, cells * genes.
#' @param clust Vector of cluster assignments, or a matrix of probabilities
#' of cells (rows) belonging to clusters (columns).
#' @param neg Vector of mean background counts (or a single value applied to all cells)
#' @return A matrix of gene x cell type expression profiles.
#' @export
getRNAprofiles <- function(x, neg, clust) {
  # a scalar background is recycled to one value per cell (row of x)
  neg_per_cell <- if (length(neg) == 1) rep(neg, nrow(x)) else neg
  estep_out <- Estep(counts = x, clust = clust, neg = neg_per_cell, assay_type = "RNA")
  estep_out$profiles
}

#' Extract cluster profiles and SDs of protein data
#'
#' Given cell assignments and expression data, estimate the mean
#' profile (and its standard deviation) of each cluster.
#' @param x Expression matrix, cells * proteins.
#' @param clust Vector of cluster assignments, or a matrix of probabilities
#' of cells (rows) belonging to clusters (columns).
#' @return List with two elements: "profiles", a matrix of protein x cell type expression profiles, and "sds", a matrix of SD's.
#' @export
getProteinParameters <- function(x, clust) {
  # no background term is passed for protein data
  Estep(counts = x, clust = clust, assay_type = "protein")
}

# --------------------------------------------------------------------------------
# /R/getSpatialContext.R:
# --------------------------------------------------------------------------------
#' Get the neighborhood expression profile around all cells
#' @param counts Counts matrix
#' @param xy 2-column matrix of cells' xy positions
#' @param tissue vector of tissue IDs. Used to ensure cells for different tissues are never called neighbors
#' @param N number of neighbors to use. Specify this or \code{rad}.
#' @param rad radius to use to define neighbors. Specify this or \code{N}.
#' @param dim_reduce_to If entered, the neighborhood matrix will be reduced to this many PCs
#' @return A matrix of neighborhood expression, potentially by gene, or else by PCs if \code{dim_reduce_to} was set.
#' @export
#' @importFrom irlba prcomp_irlba
getSpatialContext <- function(counts, xy, tissue = NULL, N = 50, rad = NULL, dim_reduce_to = NULL) {

  # one of the two neighbor definitions is required
  if (is.null(N) && is.null(rad)) {
    stop("Must specify either N or rad to define neighbors")
  }

  # define neighbors:
  if (is.null(tissue)) {
    # a single shared ID places every cell in the same tissue
    tissue <- 1
  }
  if (!is.null(N)) {
    # N takes precedence: rad is ignored whenever N is supplied
    neighbors <- nearestNeighborGraph(x = xy[, 1], y = xy[, 2], N = N, subset = tissue)
  } else {
    neighbors <- radiusBasedGraph(x = xy[, 1], y = xy[, 2], R = rad, subset = tissue)
  }

  # get neighborhood expression:
  neighborexpression <- get_neighborhood_expression(counts = counts, neighbors = neighbors)

  # dimension reduce
  if (!is.null(dim_reduce_to)) {
    neighborexpression <- irlba::prcomp_irlba(neighborexpression, n = dim_reduce_to)$x
  }
  return(neighborexpression)
}




#' Create spatial network from N nearest neighbors
#'
#' For each cell identify \code{N} nearest neighbors in Euclidean space and
#' create an edge between them in graph structure, optionally subset cells (see
#' Details).
#'
#' Edges will only be created for cells that have the same \code{subset} value,
#' usually the slide column id but could also be a slide plus FOV id to only
#' create edges within an FOV.
#'
#' @param x spatial coordinate
#' @param y spatial coordinate
#' @param N number of nearest neighbors
#' @param subset same length as x,y (see Details)
#'
#' @return sparse adjacency matrix with distances
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom spatstat.geom nnwhich
#' @importFrom spatstat.geom nndist
#' @importFrom Matrix sparseMatrix
nearestNeighborGraph <- function(x, y, N, subset=1) {
  DT <- data.table::data.table(x = x, y = y, subset = subset)
  nearestNeighbor <- function(i) {
    subset_dt <- DT[subset == i]
    # positions of this subset's cells within the full table
    idx <- which(DT[["subset"]] == i)
    ndist <- spatstat.geom::nndist(subset_dt[, .(x, y)],
                                   k = seq_len(N))
    nwhich <- spatstat.geom::nnwhich(subset_dt[, .(x, y)],
                                     k = seq_len(N))
    # results are cells x k; flattening is column-major, so the source cell
    # index repeats once per neighbor rank
    neighbor_local <- as.vector(nwhich)
    source_idx <- rep_len(idx, length(neighbor_local))
    # a subset with no more than N cells has undefined neighbors (NA);
    # drop them, since the sparse matrix constructor rejects NA indices
    keep <- !is.na(neighbor_local)
    ij <- data.table::data.table(i = source_idx[keep],
                                 j = idx[neighbor_local[keep]],
                                 x = as.vector(ndist)[keep])
    return(ij)
  }
  ij <- data.table::rbindlist(lapply(unique(subset), nearestNeighbor))
  adj.m <- Matrix::sparseMatrix(i = ij$i, j = ij$j, x = ij$x, dims = c(nrow(DT), nrow(DT)))
  return(adj.m)
}

#' Create spatial network from neighbors within radius R
#'
#' For each cell identify neighbors within distance \code{R} in Euclidean space
#' and create an edge between them in graph structure, optionally subset cells
#' (see Details).
#'
#' Edges will only be created for cells that have the same \code{subset} value,
#' usually the slide column id but could also be a slide plus FOV id to only
#' create edges within an FOV.
#'
#' @param x spatial coordinate
#' @param y spatial coordinate
#' @param R radius
#' @param subset same length as x,y (see Details)
#'
#' @return sparse adjacency matrix with distances
#' @importFrom data.table data.table
#' @importFrom data.table rbindlist
#' @importFrom Matrix sparseMatrix
#' @importFrom spatstat.geom ppp
#' @importFrom spatstat.geom closepairs
radiusBasedGraph <- function(x, y, R, subset=1) {
  DT <- data.table::data.table(x = x, y = y, subset = subset)
  n_cells <- nrow(DT)

  # edges among the cells of one subset, expressed in full-table row indices
  edges_within <- function(grp) {
    members <- which(DT[["subset"]] == grp)
    xs <- DT[["x"]][members]
    ys <- DT[["y"]][members]
    # point pattern whose window is the bounding box of the subset
    pts <- spatstat.geom::ppp(xs, ys, range(xs), range(ys))
    pairs <- spatstat.geom::closepairs(pts, R)
    data.table::data.table(i = members[pairs$i],
                           j = members[pairs$j],
                           x = pairs$d)
  }

  edges <- data.table::rbindlist(lapply(unique(subset), edges_within))
  Matrix::sparseMatrix(i = edges$i, j = edges$j, x = edges$x,
                       dims = c(n_cells, n_cells))
}



#' Calculate neighborhood expression
#'
#' Calculates the expression profile of each cell's neighborhood
#' @param counts Single cell expression matrix
#' @param neighbors A neighbors adjacency matrix
#' @return A matrix in the same dimensions as \code{counts}, giving the expression profile of each cell's neighborhood.
get_neighborhood_expression <- function(counts, neighbors) {
  # rows of counts must line up with the columns of the adjacency matrix
  if (ncol(neighbors) != nrow(counts)) {
    stop("misalignment between nrow(counts) and ncol(neighbors)")
  }
  # mean expression over each cell's neighbors
  neighborhood_means <- neighbor_colMeans(counts, neighbors)
  rownames(neighborhood_means) <- rownames(neighbors)
  as.matrix(neighborhood_means)
}

#' For each cell, get the colMeans of x over its neighbors
#' @param x A matrix
#' @param neighbors A (probably sparse) adjacency matrix
neighbor_colMeans <- function(x, neighbors) {
  # turn stored edge weights (distances) into plain 0/1 adjacency
  neighbors@x <- rep(1, length(neighbors@x))
  # scale each row by 1 / (number of neighbors) so that rows average
  inverse_degree <- 1 / Matrix::rowSums(neighbors)
  neighbors <- Matrix::Diagonal(x = inverse_degree) %*% neighbors
  neighbors@x[neighbors@x == 0] <- 1
  neighbors %*% x
}



#' For each cell, get the colSums of x over its neighbors
#' @param x A matrix
#' @param neighbors A (probably sparse) adjacency matrix
neighbor_colSums <- function(x, neighbors) {
  # turn stored edge weights (distances) into plain 0/1 adjacency
  neighbors@x <- rep(1, length(neighbors@x))
  unit_weights <- rep(1, nrow(neighbors))
  neighbors <- Matrix::Diagonal(x = unit_weights) %*% neighbors
  neighbors@x[neighbors@x == 0] <- 1
  neighbors %*% x
}

# --------------------------------------------------------------------------------
# /R/insitutypeML.R:
# --------------------------------------------------------------------------------
#' Classify cells based on reference profiles
#'
#' Supervised classification of cells. Each cell is assigned to the cell type
#' under which its observed expression profile is most likely.
#' @param x Counts matrix (or dgCMatrix), cells * genes.
#'
#' Alternatively, a \linkS4class{SingleCellExperiment} object containing such
#' a matrix.
#' @param neg Vector of mean negprobe counts per cell.
Can be provided 10 | #' @param bg Expected background 11 | #' @param cohort Vector of cells' cohort memberships 12 | #' @param reference_profiles Matrix of expression profiles of pre-defined clusters, 13 | #' e.g. from previous scRNA-seq. These profiles will not be updated by the EM algorithm. 14 | #' Colnames must all be included in the init_clust variable. 15 | #' @param reference_sds Matrix of standard deviation profiles of pre-defined 16 | #' clusters. These SD profiles also will not be updated by the EM algorithm. 17 | #' Columns must all be included in the init_clust variable. This parameter should 18 | #' be defined if assay_type is protein. Default is NULL. 19 | #' @param nb_size The size parameter to assume for the NB distribution. 20 | #' @param align_genes Logical, for whether to align the counts matrix and the reference_profiles by gene ID. 21 | #' @param assay_type Assay type of RNA, protein (default = "rna") 22 | #' @param ... For the \linkS4class{SingleCellExperiment} method, additional 23 | #' arguments to pass to the ANY method. 24 | #' @param assay.type A string specifying which assay values to use. 25 | #' @return A list, with the following elements: 26 | #' \enumerate{ 27 | #' \item clust: a vector given cells' cluster assignments 28 | #' \item prob: a vector giving the confidence in each cell's cluster 29 | #' \item profiles: Matrix of clusters' mean background-subtracted profiles 30 | #' \item logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns. 
#' }
#'
#' @name insitutypeML
#' @examples
#' data("mini_nsclc")
#' data("ioprofiles")
#' sup <- insitutypeML(
#'   x = mini_nsclc$counts,
#'   neg = Matrix::rowMeans(mini_nsclc$neg),
#'   reference_profiles = ioprofiles,
#'   assay_type = "RNA")
#' table(sup$clust)
NULL

.insitutypeML <- function(x, neg = NULL, bg = NULL, cohort = NULL,
                          reference_profiles,
                          reference_sds=NULL,
                          nb_size = 10,
                          assay_type = c("rna", "protein"),
                          align_genes = TRUE) {
  assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))

  # get vector of expected background:
  bg <- estimateBackground(counts = x, neg = neg, bg = bg)

  # align genes (drop = FALSE keeps matrices intact when one gene or one
  # cell type is shared):
  if (align_genes) {
    x <- alignGenes(counts = x, profiles = reference_profiles)
    reference_profiles <- reference_profiles[colnames(x), , drop = FALSE]
    if (!is.null(reference_sds)) {
      reference_sds <- reference_sds[colnames(x), , drop = FALSE]
    }
  }

  # prep cohort vector:
  if (is.null(cohort)) {
    cohort <- rep("all", length(bg))
  }

  logliks <- lldist(x = reference_profiles,
                    xsd = reference_sds,
                    mat = x,
                    bg = bg,
                    size = nb_size,
                    assay_type = assay_type)

  # update logliks based on frequencies within cohorts:
  logliks <- update_logliks_with_cohort_freqs(logliks = logliks,
                                              cohort = cohort,
                                              minfreq = 1e-4,
                                              nbaselinecells = 100)

  # replace any incoming "undefined" column with one that only wins for cells
  # with zero total counts over the shared features:
  if ("undefined" %in% colnames(logliks)) {
    logliks <- logliks[, colnames(logliks) != "undefined", drop = FALSE]
  }
  features <- intersect(rownames(reference_profiles), colnames(x))
  logliks <- cbind(logliks,
                   ifelse(Matrix::rowSums(x[, features, drop = FALSE]) == 0, 0, -Inf))
  colnames(logliks)[ncol(logliks)] <- "undefined"

  # get remaining outputs
  clust <- colnames(logliks)[apply(logliks, 1, which.max)]
  names(clust) <- rownames(logliks)

  probs <- logliks2probs(logliks)
  prob <- apply(probs, 1, max)
  names(prob) <- names(clust)
  profiles_info <- Estep(counts = x,
                         clust = clust,
                         neg = neg,
                         assay_type = assay_type)

  profiles <- profiles_info$profiles
  sds <- profiles_info$sds

  # aligns profiles and logliks, removing lost clusters:
  logliks_from_lost_celltypes <- logliks[, !is.element(colnames(logliks), unique(clust)), drop = FALSE]
  logliks <- logliks[, is.element(colnames(logliks), clust), drop = FALSE]
  profiles <- profiles[, colnames(logliks), drop = FALSE]

  if (identical(assay_type, "rna")) {
    # SDs are only reported for protein data
    sds <- NULL
  } else if (!is.null(sds) && all(colnames(logliks) %in% colnames(sds))) {
    # keep SD columns in step with the profiles, as refineClusters() does
    sds <- sds[, colnames(logliks), drop = FALSE]
  }

  out <- list(clust = clust,
              prob = prob,
              profiles = profiles,
              sds = sds,
              logliks = round(logliks, 4),
              logliks_from_lost_celltypes = round(logliks_from_lost_celltypes, 4))
  return(out)
}

############################
# S4 method definitions
############################

#' @export
#' @rdname insitutypeML
setGeneric("insitutypeML", function(x, ...) standardGeneric("insitutypeML"))

#' @export
#' @rdname insitutypeML
setMethod("insitutypeML", "ANY", .insitutypeML)

#' @export
#' @rdname insitutypeML
#' @importFrom SummarizedExperiment assay
#' @importFrom SingleCellExperiment SingleCellExperiment
setMethod("insitutypeML", "SingleCellExperiment", function(x, ..., assay.type="counts") {
  # the assay is transposed before being handed to the matrix implementation
  .insitutypeML(t(assay(x, i=assay.type)), ...)
})

# --------------------------------------------------------------------------------
# /R/refineClusters.R:
# --------------------------------------------------------------------------------
#' Merge cell types in a clustering result
#'
#' Take a user-defined list of cells types to rename/combine, then re-compute
#' cluster assignments and probabilities under the merged cell types.
5 | #' @param assay_type Assay type of RNA, protein (default = "rna") 6 | #' @param merges A named vector in which the elements give new cluster names and 7 | #' the names give old cluster names. OK to omit cell types that aren't being 8 | #' merged. 9 | #' @param to_delete A vector of cluster names to delete. All cells assigned to 10 | #' these clusters will be reassigned to the next best cluster. 11 | #' @param subcluster A list, where each element's name is a cell type to 12 | #' subcluster, and the element itself is the cluster number(s) to use. E.g. 13 | #' list("macrophages" = 2, "cancer" = 2:3) 14 | #' @param logliks Matrix of log-likelihoods output by insitutype, cells in rows, 15 | #' clusters in columns 16 | #' @param counts Counts matrix, cells * genes. Only needed if subclustering is 17 | #' run. 18 | #' @param neg Vector of mean negprobe counts per cell. Only needed if 19 | #' subclustering is run. 20 | #' @param bg Expected background. Optional, and only used if subclustering is 21 | #' run. 22 | #' @param cohort Vector of cells' cohort memberships. Optional, and only needed 23 | #' if subclustering is run. 
#' @return A list with the following elements: \enumerate{ \item clust: a vector of
#'   cluster assignments \item prob: Vector of posterior probabilities for each
#'   cell type \item logliks: a matrix of log-likelihoods of all cells (rows)
#'   under all retained clusters (columns) \item profiles: a matrix of the average
#'   background-subtracted profile of each cell type after
#'   merging/deleting/subclustering (NULL unless counts and neg are given)
#'   \item sds: matrix of SDs (protein assays only) \item
#'   logliks_from_lost_celltypes: log-likelihoods of clusters that no cell was
#'   assigned to }
#' @export
#' @examples
#' #example merges argument:
#' merges = c("macrophages" = "myeloid", # merge 3 clusters
#'            "monocytes" = "myeloid",
#'            "mDC" = "myeloid",
#'            "B-cells" = "lymphoid") # just rename 1 cluster
#' # example to_delete argument:
#' to_delete = c("neutrophils")
#' # example subcluster argument:
#' subcluster = list("Myofibroblast" = 2:3)
refineClusters <- function(assay_type = c("rna", "protein"),
                           merges = NULL, to_delete = NULL, subcluster = NULL,
                           logliks,
                           counts = NULL,
                           neg = NULL, bg = NULL,
                           cohort = NULL) {
  assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))

  # check that provided cell names are all in logliks:
  if (any(!is.element(names(merges), colnames(logliks)))) {
    mismatch <- setdiff(names(merges), colnames(logliks))
    stop(paste0("The following user-provided cluster name(s) in the merges argument are missing from colnames(logliks): ",
                paste0(mismatch, collapse = ", ")))
  }
  if (any(!is.element(to_delete, colnames(logliks)))) {
    mismatch <- setdiff(to_delete, colnames(logliks))
    stop(paste0("The following user-provided cluster name(s) in the to_delete argument are missing from colnames(logliks): ",
                paste0(mismatch, collapse = ", ")))
  }
  if (any(!is.element(names(subcluster), colnames(logliks)))) {
    mismatch <- setdiff(names(subcluster), colnames(logliks))
    stop(paste0("The following user-provided cluster name(s) in the subcluster argument are missing from colnames(logliks): ",
                paste0(mismatch, collapse = ", ")))
  }
  if (length(setdiff(colnames(logliks), to_delete)) == 0) {
    stop("The to_delete argument is asking for all clusters to be deleted.")
  }
  # check that subcluster data is available:
  if (!is.null(subcluster)) {
    if (is.null(counts)) {
      stop("Must provide counts data to subcluster")
    }
    if (is.null(neg)) {
      stop("Must provide neg vector to subcluster")
    }
  }

  # delete those called for (drop = FALSE: a single surviving cluster must
  # remain a one-column matrix):
  logliks <- logliks[, !is.element(colnames(logliks), to_delete), drop = FALSE]

  # prevent merging into existing names:
  if (length(merges) > 0) {
    duplicatednames <- intersect(merges, colnames(logliks))
    clashes <- is.element(merges, duplicatednames)
    merges[clashes] <- paste0(merges[clashes], ".new")
  }

  # get logliks under merged categories: each cell's "new" loglik in a merged cell type is
  # its best loglik under the "old" celltype.
  new_names <- unique(merges)
  if (length(new_names) > 0) {
    merged_logliks <- matrix(NA_real_, nrow(logliks), length(new_names),
                             dimnames = list(rownames(logliks), new_names))
    for (newname in new_names) {
      oldnames <- names(merges)[merges == newname]
      merged_logliks[, newname] <- apply(logliks[, oldnames, drop = FALSE], 1, max, na.rm = TRUE)
    }
    # clusters not mentioned in merges are carried over unchanged
    newlogliks <- cbind(merged_logliks,
                        logliks[, setdiff(colnames(logliks), names(merges)), drop = FALSE])
  } else {
    newlogliks <- logliks
  }

  # get new cluster assignments:
  clust <- colnames(newlogliks)[apply(newlogliks, 1, which.max)]
  names(clust) <- rownames(newlogliks)

  ## perform subclustering:
  # subclustering logic:
  # - run unsupervised clustering of the selected cell type
  # - record the subcluster logliks for the selected cells
  # - for unselected cells, propagate the original supercluster loglik to the subclusters (to prevent unselected cells joining the subclusters)
  # - unselected cells keep their cell type. selected cells go to whichever subcluster gives them the greatest loglik
  for (name in names(subcluster)) {
    message(paste0("Subclustering ", name))
    use <- which(colnames(newlogliks)[apply(newlogliks, 1, which.max)] == name)
    # run insitutype on just the named cell type:
    temp <- insitutype(x = counts[use, , drop = FALSE],
                       assay_type = assay_type,
                       neg = neg[use],
                       bg = bg[use],
                       cohort = cohort[use],
                       n_clusts = subcluster[[name]],
                       n_starts = 3, n_benchmark_cells = 5000,
                       n_phase1 = 2000, n_phase2 = 10000, n_phase3 = 20000,
                       n_chooseclusternumber = 2000)

    # make logliks matrix for all cells vs. the new clusters, with cells outside
    # the selected cell type retaining their original loglik for the cluster
    subclustlogliks <- matrix(rep(newlogliks[, name], ncol(temp$logliks)), nrow(counts))
    rownames(subclustlogliks) <- rownames(counts)
    colnames(subclustlogliks) <- colnames(temp$logliks)
    # for cells with subclustering results, overwrite the old logliks:
    subclustlogliks[rownames(temp$logliks), colnames(temp$logliks)] <- temp$logliks
    # better names:
    colnames(subclustlogliks) <- paste0(name, "_", seq_len(ncol(subclustlogliks)))

    # safeguard in case we've created a cell type name that already exists:
    if (any(is.element(colnames(subclustlogliks), colnames(newlogliks)))) {
      colnames(subclustlogliks) <- paste0(colnames(subclustlogliks), "subcluster")
    }

    # update logliks matrix (drop = FALSE keeps column names when only one
    # other cluster remains):
    newlogliks <- newlogliks[, setdiff(colnames(newlogliks), name), drop = FALSE]
    newlogliks <- cbind(newlogliks, subclustlogliks)

    # update clust for the subclustered cells (drop = FALSE covers a single
    # selected cell or a single subcluster):
    clust[use] <- colnames(subclustlogliks)[apply(subclustlogliks[use, , drop = FALSE], 1, which.max)]
  }

  # get new posterior probs:
  probs <- logliks2probs(newlogliks)
  prob <- apply(probs, 1, max)
  names(prob) <- names(clust)

  # re-calculate profiles if available:
  profiles <- NULL
  sds <- NULL
  if (!is.null(counts) && !is.null(neg)) {
    profiles_info <- Estep(counts = counts,
                           clust = clust,
                           neg = neg,
                           assay_type = assay_type)
    profiles <- profiles_info$profiles
    sds <- profiles_info$sds
  }

  # aligns profiles and logliks, removing lost clusters:
  logliks_from_lost_celltypes <- newlogliks[, !is.element(colnames(newlogliks), unique(clust)), drop = FALSE]
  newlogliks <- newlogliks[, is.element(colnames(newlogliks), clust), drop = FALSE]
  if (!is.null(profiles)) {
    profiles <- profiles[, colnames(newlogliks), drop = FALSE]
  }

  if (identical(assay_type, "protein") && !is.null(sds)) {
    sds <- sds[, colnames(newlogliks), drop = FALSE]
  }
  if (identical(assay_type, "rna")) {
    # SDs are only reported for protein data
    sds <- NULL
  }

  out <- list(clust = clust, prob = prob, logliks = round(newlogliks, 4), # (rounding logliks to save memory)
              profiles = profiles, sds = sds,
              logliks_from_lost_celltypes = round(logliks_from_lost_celltypes, 4))
  return(out)
}


#' Get a logliks matrix from a probabilities matrix
#'
#' @param probs probability matrix
#'
#' @return log-transformed matrix
#' @examples
#' a <- runif(10)
#' probs2logliks(a/sum(a))
probs2logliks <- function(probs) {
  return(log(probs))
}


#' convert logliks to probabilities
#'
#' From cell x cluster log-likelihoods, calculate cell x cluster probabilities
#' @param logliks Matrix of loglikelihoods, as output by insitutype. Cells in rows, clusters in columns.
#' @return A matrix of probabilities, in the same dimensions as logliks.
#' @examples
#' data("mini_nsclc")
#' unsup <- insitutype(
#'   x = mini_nsclc$counts,
#'   neg = Matrix::rowMeans(mini_nsclc$neg),
#'   n_clusts = 8,
#'   n_phase1 = 200,
#'   n_phase2 = 500,
#'   n_phase3 = 2000,
#'   n_starts = 1,
#'   max_iters = 5,
#'   assay_type="RNA"
#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
#' logliks2probs(unsup$logliks)
#'
logliks2probs <- function(logliks) {
  # shift each row by its own maximum so the largest exponent is exp(0)
  row_best <- apply(logliks, 1, max, na.rm = TRUE)
  relative_liks <- exp(sweep(logliks, 1, row_best, "-"))
  # normalise each row to sum to one
  row_totals <- rowSums(relative_liks, na.rm = TRUE)
  sweep(relative_liks, 1, row_totals, "/")
}

# --------------------------------------------------------------------------------
# /R/spatialUpdate.R:
# --------------------------------------------------------------------------------
#' @title Update cell typing results with spatial context or other alternative data
#'
#' @description
#' Takes cell typing results, then updates it based on alternative data types,
#' e.g. spatial context, morphology, or protein expression. Existing cell typing results are
#' put into Insitutype's likelihood framework, which then can use alternative data
#' as a prior to be updated by the expression data to get a new posterior probability
#' of cell type.
#' Performs this operation by
#' \enumerate{
#' \item deriving cell type profiles using InSituType:::Estep(),
#' \item assigning cells to "cohorts" (clusters) derived from their alternative data
#' \item Inputting the output of steps (1) and (2) into InSituType::insitutype() to
#' re-calculate cell type.
15 | #' } 16 | #' Paths for using alternative data in priority order (choose one; if multiple are input, only the most downstream option will be used): 17 | #' \enumerate{ 18 | #' \item Input \code{xy} positions (and possibly \code{tissue}). Then cells will be clustered 19 | #' into cohorts based on the expression pattern of their 50 nearest neighboring cells. 20 | #' \item Input a matrix of alternative data (\code{altdata}) to be automatically clustered into cohorts. This supersedes 21 | #' the altdata matrix derived from the \code{xy} argument. 22 | #' \item Input your own \code{cohort} vector. This supersedes the above inputs. 23 | #' } 24 | #' @param celltype Vector of cell type assignments to be updated 25 | #' @param counts Counts matrix (or dgCMatrix), cells * genes. 26 | #' @param neg Vector of mean negprobe counts per cell 27 | #' @param cohort Vector of cells' cohort memberships. Output of a spatial clustering algorithm makes for good cohorts. 28 | #' @param altdata Matrix of cells' alternative data values 29 | #' @param xy 2-column matrix of cells' xy positions. 30 | #' @param tissue Vector giving cells' tissue IDs. Used to separate tissue with overlapping xy coordinates. 31 | #' @param nb_size The size parameter to assume for the NB distribution. 32 | #' @param assay_type A string specifying which assay values to use. 
#' @importFrom irlba irlba
#' @export
spatialUpdate <- function(celltype, counts, neg,
                          cohort = NULL, altdata = NULL, xy = NULL, tissue = NULL,
                          nb_size = 10, assay_type = c("rna", "protein")) {

  # accept any capitalisation ("RNA", "Protein", ...); the untouched default
  # vector resolves to its first element, "rna"
  assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))

  # NOTE(review): nb_size is accepted but not used anywhere in this function --
  # confirm whether it should be forwarded to insitutype().

  ## check alternative data args: at least one source of cohort information
  ## is required (tested directly, without concatenating the inputs)
  if (is.null(cohort) && is.null(altdata) && is.null(xy)) {
    stop("Must supply cohort, altdata or xy")
  }

  ## process alternative data, obtaining cohort vector.
  ## Priority: a supplied cohort wins over altdata, which wins over xy.
  if (is.null(cohort)) {
    if (is.null(altdata)) {
      # make altdata from cells' neighborhoods:
      altdata <- getSpatialContext(counts = counts, xy = xy, tissue = tissue,
                                   N = 50, rad = NULL, dim_reduce_to = 20)
    }
    # cluster altdata to get cohort:
    cohort <- fastCohorting(mat = altdata,
                            gaussian_transform = TRUE)
  }

  ## derive reference profiles from initial cell type vector:
  profiles <- Estep(counts = counts,
                    clust = celltype,
                    neg = neg,
                    assay_type = assay_type)

  ## Run supervised cell typing with InSituType: no new clusters are fit
  ## (n_clusts = 0) and the derived profiles are held fixed
  res <- insitutype(x = counts,
                    cohort = cohort,
                    neg = neg,
                    reference_profiles = profiles$profiles,
                    reference_sds = profiles$sds,
                    n_clusts = 0,
                    update_reference_profiles = FALSE,
                    assay_type = assay_type)
  # report the cohort actually used alongside the cell typing result
  res$cohort <- cohort
  return(res)
}




# --------------------------------------------------------------------------------
# /R/utilities.R:
# --------------------------------------------------------------------------------
#' Prepare bg data for other functions
#'
#' Process neg data or bg to get background for each cell
#' @param counts Counts matrix, cells * genes.
#' @param neg Vector of mean negprobe counts per cell. Only used when
#'   \code{bg} is NULL; must then have one entry per row of \code{counts}.
#' @param bg Expected background: NULL (estimate it from \code{neg}), a single
#'   value (recycled to every cell), or one value per cell.
#' @return A named vector for the estimated background of each cell
estimateBackground <- function(counts, neg, bg = NULL) {
  if (is.null(bg) && is.null(neg)) {
    stop("Must provide either bg or neg")
  }

  # infer bg if not provided: assume background is proportional to the scaling factor s
  if (is.null(bg)) {
    # validate the length before touching names(): assigning more names than
    # there are elements fails with an unrelated error message
    if (length(neg) != nrow(counts)) {
      stop("length of neg should equal nrows of counts.")
    }
    if (is.null(names(neg))) {
      names(neg) <- rownames(counts)
    }

    s <- Matrix::rowMeans(counts)
    # regression through the origin of per-cell negprobe means on per-cell
    # mean counts; the fitted values are the background estimates
    bgmod <- stats::lm(neg ~ s - 1)
    bg <- bgmod$fitted.values
  }
  if (length(bg) == 1) {
    bg <- rep(bg, nrow(counts))
    names(bg) <- rownames(counts)
  }

  # overwrite non-positive bg with a small positive value: the smaller of
  # 1e-5 and the smallest positive background present
  bg[bg <= 0] <- min(1e-5, bg[bg > 0])

  return(bg)

}


#' align genes in counts to profiles for other functions
#'
#' Process counts to have genes shared with profiles
#' @param counts Counts matrix, cells * genes.
#' @param profiles Matrix of reference profiles holding mean expression of genes x cell types.
#' Input linear-scale expression, with genes in rows and cell types in columns.
#' @return a cells * genes count matrix for shared genes only
alignGenes <- function(counts, profiles) {
  sharedgenes <- intersect(rownames(profiles), colnames(counts))
  if (length(sharedgenes) < 10) {
    stop("Profiles have fewer than 10 genes in common with panel, use different profiles or re-run InSituType in unsupervised mode.")
  }
  lostgenes <- setdiff(colnames(counts), rownames(profiles))

  # subset (columns follow the gene order of profiles; drop = FALSE keeps a
  # matrix even when counts has a single row):
  counts <- counts[, sharedgenes, drop = FALSE]

  # tell the user about genes being lost: name them when the list is short,
  # otherwise only report how many (exactly one message is emitted)
  n_lost <- length(lostgenes)
  if (n_lost > 0 && n_lost <= 50) {
    message(
      paste0(
        "The following genes in the count data are missing from fixed_profiles and will be omitted from downstream: ",
        paste0(lostgenes, collapse = ",")
      )
    )
  } else if (n_lost > 50) {
    message(
      paste0(
        n_lost,
        " genes in the count data are missing from fixed_profiles and will be omitted from downstream"
      )
    )
  }

  return(counts)
}


#' Get number of cores for parallelized operations
#'
#' On non-unix platforms this always returns 1. On unix, the \code{mc.cores}
#' option is returned when it is set; otherwise the number is derived from
#' \code{parallel::detectCores()}.
#'
#' @param percentCores fraction of cores to use for parallelization, greater
#'   than 0 and at most 1
#' @param minNotUsedCores minimum number of cores to leave for background processes
#'
#' @return number of cores to use for mclapply (at least 1 when derived from
#'   the detected core count)
#' @export
numCores <- function(percentCores = 0.9, minNotUsedCores = 2) {
  # either bound being violated makes the value invalid, hence "or"
  if (percentCores > 1 || percentCores <= 0) {
    stop("percentCores is not a valid number, must be between 0-1")
  }

  num_cores <- 1
  if (.Platform$OS.type == "unix") {
    if (is.null(getOption("mc.cores"))) {
      num_cores <- parallel::detectCores()
      if (is.na(num_cores)) {
        # detectCores() can return NA when the count is unknown
        return(1)
      }
      if (num_cores <= minNotUsedCores) {
        stop("minNotUsedCores must be fewer than available cores")
      }
      # never report fewer than one core, e.g. for a small percentCores
      num_cores <- max(1, min(floor(num_cores * percentCores), num_cores - minNotUsedCores))
    } else {
      num_cores <- getOption("mc.cores")
    }
  }
  return(num_cores)
}

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InSituType 2 | An R package for performing cell typing in SMI and other single cell data 3 | 4 | **Manuscript**: https://www.biorxiv.org/content/10.1101/2022.10.19.512902v1.abstract 5 | 6 | **Citing Insitutype**: Danaher P, Zhao E, Yang Z, Ross D, Gregory M, Reitz Z, Kim TK, Baxter S, Jackson S, He S, Henderson DA. Insitutype: likelihood-based cell typing for single cell spatial transcriptomics. bioRxiv. 2022 Jan 1. 7 | 8 | ### System requirements 9 | - R (>= 3.5.0) 10 | - UNIX, Mac or Windows 11 | - Rcpp library (>= 1.0.9) 12 | - see DESCRIPTION for full dependencies 13 | 14 | ### Demo 15 | See the "vignettes" folder. Vignettes should run in <5 minutes. 16 | 17 | ### Instructions for use 18 | Run "insitutype" for unsupervised or semi-supervised clustering. Run "insitutypeML" for supervised cell typing. See the vignettes for example workflows. 19 | 20 | ### FAQs and tips: 21 | [https://github.com/Nanostring-Biostats/InSituType/FAQs.md](https://github.com/Nanostring-Biostats/InSituType/blob/main/FAQs.md) 22 | 23 | ### Reproduction instructions 24 | The full results of the Insitutype manuscript can be reproduced with the code in this repo: https://github.com/Nanostring-Biostats/InSituType-manuscript-analyses 25 | 26 | ### Installation 27 | ``` 28 | # Make sure Matrix and irlba are both up to date (otherwise versioning issues cause prcomp_irlba to error out): 29 | # (This is required as of Feb 2024; with any luck these packages will fix their versioning issues soon and this will not be necessary.) 30 | install.packages("Matrix", type = "source") 31 | install.packages("irlba", type = "source") 32 | 33 | # Install Insitutype: 34 | devtools::install_github("https://github.com/Nanostring-Biostats/InSituType") 35 | ``` 36 | Installation should take < 2 mins on a normal desktop computer. 
37 | 38 | 39 | ### Function dependencies: 40 | ![image](https://user-images.githubusercontent.com/4357938/200046292-ba3e3453-b201-4776-b5f5-6bf3dfce6ec6.png) 41 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: none 2 | 3 | parameters: 4 | - name: imageTag 5 | displayName: 'Enter nanopipeline-build-environment image tag from last build https://dev.azure.com/Nanostring/Gemini/_build?definitionId=525&_a=summary' 6 | default: '1.3.2-94008' 7 | type: string 8 | 9 | resources: 10 | - repo: self 11 | 12 | variables: 13 | group: smida-build-pipeline-devnext-variables 14 | 15 | stages: 16 | - stage: Build 17 | displayName: Build Docker image 18 | jobs: 19 | - job: Build_image 20 | displayName: Build Docker image 21 | pool: 22 | vmImage: ubuntu-latest 23 | variables: 24 | - group: smida-build-pipeline-devnext-variables 25 | steps: 26 | - task: ECRPullImage@1 27 | displayName: Pull NanoPipeline Build environment image from DEVNEXT HUB ECR 28 | inputs: 29 | awsCredentials: 'atomx-aws-devnext-hub' 30 | regionName: '$(AWS_REGION)' 31 | repository: 'ecr-cac1-devnext-cosmx_da_repo-devnext-hub-smida-nanopipeline-build-environment' 32 | imageSource: 'imagetag' 33 | imageTag: ${{ parameters.imageTag }} 34 | - task: Bash@3 35 | displayName: Run smida-nanopipeline-build-environment Docker Image 36 | inputs: 37 | targetType: 'inline' 38 | script: | 39 | docker run --rm -v "$(Build.SourcesDirectory)":"/source" $(DEVNEXT_HUB_ACCOUNT_ID).dkr.ecr.$(AWS_REGION).amazonaws.com/ecr-cac1-devnext-cosmx_da_repo-devnext-hub-smida-nanopipeline-build-environment:${{ parameters.imageTag }} 40 | - task: Bash@3 41 | displayName: Copy .tar.gz file as latest 42 | inputs: 43 | targetType: 'inline' 44 | script: | 45 | cp *.tar.gz smiCellTyping_latest.tar.gz 46 | - task: S3Upload@1 47 | displayName: Upload CellTyping .tar.gz files to S3 on DEVNEXT HUB 48 | 
inputs: 49 | awsCredentials: 'atomx-aws-devnext-hub' 50 | regionName: '$(AWS_REGION)' 51 | bucketName: 's3-cac1-devnext-smida-assets-global-devnext' 52 | sourceFolder: '$(Build.SourcesDirectory)' 53 | globExpressions: '+(smiCellTyping|InSituType)_*.tar.gz' 54 | targetFolder: 'smida/assets/nanopipeline' -------------------------------------------------------------------------------- /data/human_signature.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/human_signature.RData -------------------------------------------------------------------------------- /data/iocolors.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/iocolors.RData -------------------------------------------------------------------------------- /data/ioprofiles.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/ioprofiles.RData -------------------------------------------------------------------------------- /data/mini_nsclc.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/mini_nsclc.RData -------------------------------------------------------------------------------- /data/mouse_signature.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/mouse_signature.RData -------------------------------------------------------------------------------- 
/data/tonsil_annotation.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/tonsil_annotation.RData -------------------------------------------------------------------------------- /data/tonsil_protein.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/tonsil_protein.RData -------------------------------------------------------------------------------- /data/tonsil_reference_profile.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/tonsil_reference_profile.RData -------------------------------------------------------------------------------- /man/Estep.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nbclust.R 3 | \name{Estep} 4 | \alias{Estep} 5 | \title{E step: estimate each cluster's mean profile} 6 | \usage{ 7 | Estep(counts, clust, neg, assay_type = c("rna", "protein")) 8 | } 9 | \arguments{ 10 | \item{counts}{Counts matrix, cells * genes.} 11 | 12 | \item{clust}{Vector of cluster assignments, or a matrix of probabilities 13 | of cells (rows) belonging to clusters (columns).} 14 | 15 | \item{neg}{Vector of mean background counts} 16 | 17 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 18 | } 19 | \value{ 20 | A list with two elements: 1. A matrix of cluster profiles, genes * clusters. 21 | 2. In protein mode, a matrix holding SDs, also genes * clusters. NULL in RNA mode. 
22 | } 23 | \description{ 24 | Given cell assignments (or posterior probabilities), estimate the mean 25 | profile of each cluster. 26 | } 27 | \examples{ 28 | data("ioprofiles") 29 | unsup <- insitutype( 30 | x = mini_nsclc$counts, 31 | neg = Matrix::rowMeans(mini_nsclc$neg), 32 | n_clusts = 8, 33 | n_phase1 = 200, 34 | n_phase2 = 500, 35 | n_phase3 = 2000, 36 | n_starts = 1, 37 | max_iters = 5, 38 | assay_type="RNA", 39 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended. 40 | Estep(counts = mini_nsclc$counts, clust = unsup$clust, neg = Matrix::rowMeans(mini_nsclc$neg), assay_type="RNA") 41 | } 42 | -------------------------------------------------------------------------------- /man/Mstep.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nbclust.R 3 | \name{Mstep} 4 | \alias{Mstep} 5 | \title{M step} 6 | \usage{ 7 | Mstep( 8 | counts, 9 | means, 10 | sds = NULL, 11 | cohort, 12 | bg = 0.01, 13 | size = 10, 14 | digits = 2, 15 | return_loglik = FALSE, 16 | assay_type = c("rna", "protein") 17 | ) 18 | } 19 | \arguments{ 20 | \item{counts}{Counts matrix, cells * genes.} 21 | 22 | \item{means}{Matrix of mean cluster profiles, 23 | with genes in rows and clusters in columns.} 24 | 25 | \item{sds}{Matrix of standard deviation cluster profiles, 26 | with genes in rows and clusters in columns.} 27 | 28 | \item{cohort}{a vector of cells' "cohort" assignment, used to update logliks 29 | based on cluster frequencies within a cohort.} 30 | 31 | \item{bg}{Expected background} 32 | 33 | \item{size}{NB size parameter} 34 | 35 | \item{digits}{Round the output to this many digits (saves memory)} 36 | 37 | \item{return_loglik}{If TRUE, logliks will be returned. 
If FALSE, probabilities will be returned.} 38 | 39 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 40 | } 41 | \value{ 42 | Matrix of probabilities of each cell belonging to each cluster 43 | } 44 | \description{ 45 | Compute probability that each cell belongs to a given cluster 46 | } 47 | \examples{ 48 | data("mini_nsclc") 49 | data("ioprofiles") 50 | sharedgenes <- intersect(rownames(ioprofiles), colnames(mini_nsclc$counts)) 51 | Mstep(mini_nsclc$counts, ioprofiles[sharedgenes, ], bg = Matrix::rowMeans(mini_nsclc$neg), cohort = NULL, assay_type="RNA") 52 | } 53 | -------------------------------------------------------------------------------- /man/alignGenes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{alignGenes} 4 | \alias{alignGenes} 5 | \title{align genes in counts to profiles for other functions} 6 | \usage{ 7 | alignGenes(counts, profiles) 8 | } 9 | \arguments{ 10 | \item{counts}{Counts matrix, cells * genes.} 11 | 12 | \item{profiles}{Matrix of reference profiles holding mean expression of genes x cell types. 
13 | Input linear-scale expression, with genes in rows and cell types in columns.} 14 | } 15 | \value{ 16 | a cells * genes count matrix for shared genes only 17 | } 18 | \description{ 19 | Process counts to have genes shared with profiles 20 | } 21 | -------------------------------------------------------------------------------- /man/chooseClusterNumber.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chooseClusterNumber.R 3 | \name{chooseClusterNumber} 4 | \alias{chooseClusterNumber} 5 | \title{Estimate the correct number of clusters using a subset of the data} 6 | \usage{ 7 | chooseClusterNumber( 8 | counts, 9 | neg, 10 | assay_type = c("rna", "protein"), 11 | bg = NULL, 12 | fixed_profiles = NULL, 13 | fixed_sds = NULL, 14 | cohort = NULL, 15 | init_clust = NULL, 16 | n_clusts = 2:12, 17 | max_iters = 10, 18 | subset_size = 1000, 19 | align_genes = TRUE, 20 | plotresults = FALSE, 21 | nb_size = 10, 22 | pct_drop = 0.005, 23 | min_prob_increase = 0.05, 24 | ... 25 | ) 26 | } 27 | \arguments{ 28 | \item{counts}{Counts matrix, cells * genes.} 29 | 30 | \item{neg}{Vector of mean negprobe counts per cell} 31 | 32 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 33 | 34 | \item{bg}{Expected background} 35 | 36 | \item{fixed_profiles}{Matrix of cluster profiles to hold unchanged throughout iterations.} 37 | 38 | \item{fixed_sds}{Matrix of SDs expression of genes x cell types,to hold unchanged throughout iterations. Only for assay_type of protein} 39 | 40 | \item{cohort}{Vector of cells' cohort assignments.} 41 | 42 | \item{init_clust}{Vector of initial cluster assignments.} 43 | 44 | \item{n_clusts}{Vector giving a range of cluster numbers to consider.} 45 | 46 | \item{max_iters}{Number of iterations in each clustering attempt. 
Recommended to choose 47 | a smaller number for a quicker, approximate clustering.} 48 | 49 | \item{subset_size}{Number of cells to include in clustering.} 50 | 51 | \item{align_genes}{Logical, for whether to align the genes in fixed_profiles with the colnames in counts} 52 | 53 | \item{plotresults}{Logical, for whether to plot the results.} 54 | 55 | \item{nb_size}{The size parameter to assume for the NB distribution.} 56 | 57 | \item{pct_drop}{the decrease in percentage of cell types with a valid switchover to 58 | another cell type compared to the last iteration. Default value: 0.005. A valid 59 | switchover is only applicable when a cell has changed the assigned cell type with its 60 | highest cell type probability increased by min_prob_increase.} 61 | 62 | \item{min_prob_increase}{the threshold of probability used to determine a valid cell 63 | type switchover} 64 | 65 | \item{...}{Arguments passed to nbclust.} 66 | } 67 | \value{ 68 | A list, with the following elements: 69 | \itemize{ 70 | \item 71 | } 72 | } 73 | \description{ 74 | For a subset of the data, perform clustering under a range of cluster numbers. 75 | Report on loglikelihood vs. number of clusters, and suggest a best choice. 
76 | } 77 | \examples{ 78 | data("mini_nsclc") 79 | chooseClusterNumber(mini_nsclc$counts, Matrix::rowMeans(mini_nsclc$neg), assay_type="RNA", 80 | n_clust = 2:5) 81 | } 82 | -------------------------------------------------------------------------------- /man/choose_anchors_from_stats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_anchor_cells.R 3 | \name{choose_anchors_from_stats} 4 | \alias{choose_anchors_from_stats} 5 | \title{Choose anchor cells given anchor stats} 6 | \usage{ 7 | choose_anchors_from_stats( 8 | counts, 9 | neg = NULL, 10 | bg, 11 | anchorstats = NULL, 12 | cos = NULL, 13 | llr = NULL, 14 | n_cells = 500, 15 | min_cosine = 0.3, 16 | min_scaled_llr = 0.01, 17 | insufficient_anchors_thresh = 20, 18 | assay_type = c("rna", "protein") 19 | ) 20 | } 21 | \arguments{ 22 | \item{counts}{Counts matrix, cells * genes.} 23 | 24 | \item{neg}{Vector of mean negprobe counts per cell} 25 | 26 | \item{bg}{Expected background} 27 | 28 | \item{anchorstats}{Output from get_anchor_stats. Must provide either this or 29 | both cos and llr matrices.} 30 | 31 | \item{cos}{Matrix of cosine distances from reference profiles. Cells in rows, 32 | cell types in columns.} 33 | 34 | \item{llr}{Matrix of log likelihood ratios from reference profiles. 
Cells in 35 | rows, cell types in columns.} 36 | 37 | \item{n_cells}{Up to this many cells will be taken as anchor points} 38 | 39 | \item{min_cosine}{Cells must have at least this much cosine similarity to a 40 | fixed profile to be used as an anchor} 41 | 42 | \item{min_scaled_llr}{Cells must have (log-likelihood ratio / totalcounts) 43 | above this threshold to be used as an anchor} 44 | 45 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than 46 | this many anchors will be discarded.} 47 | 48 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 49 | } 50 | \value{ 51 | A vector holding anchor cell assignments (or NA) for each cell in the 52 | counts matrix 53 | } 54 | \description{ 55 | Starting with cosine distances and log likelihood ratios, choose anchor 56 | cells. 57 | } 58 | \examples{ 59 | data("ioprofiles") 60 | data("mini_nsclc") 61 | counts <- mini_nsclc$counts 62 | astats <- get_anchor_stats(counts = counts, 63 | neg = Matrix::rowMeans(mini_nsclc$neg), 64 | sds=NULL, assay_type = "RNA", 65 | profiles = ioprofiles) 66 | 67 | ## estimate per-cell bg as a fraction of total counts: 68 | negmean.per.totcount <- mean(rowMeans(mini_nsclc$neg)) / mean(rowSums(counts)) 69 | per.cell.bg <- rowSums(counts) * negmean.per.totcount 70 | 71 | # now choose anchors: 72 | choose_anchors_from_stats(counts = counts, 73 | neg = Matrix::rowMeans(mini_nsclc$neg), 74 | bg = per.cell.bg, 75 | anchorstats = astats, 76 | # a very low value chosen for the mini 77 | # dataset. Typically hundreds of cells 78 | # would be better. 
79 | n_cells = 50, 80 | min_cosine = 0.4, 81 | min_scaled_llr = 0.03, 82 | insufficient_anchors_thresh = 5, 83 | assay_type="RNA") 84 | } 85 | -------------------------------------------------------------------------------- /man/colorCellTypes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/colorCellTypes.R 3 | \name{colorCellTypes} 4 | \alias{colorCellTypes} 5 | \title{Function to choose colors for cell types} 6 | \usage{ 7 | colorCellTypes( 8 | names = NULL, 9 | freqs = NULL, 10 | init_colors = NULL, 11 | max_sum_rgb = 600, 12 | palette = "earthplus" 13 | ) 14 | } 15 | \arguments{ 16 | \item{names}{Vector of cell type names} 17 | 18 | \item{freqs}{Optional, named vector of cell type abundance (e.g. c(T = 1000, 19 | tumor = 15000...))} 20 | 21 | \item{init_colors}{Optional, a named vector of cell colors. This will be used 22 | for all cell types in the "names" vector that match names(init_colors). 23 | Intended for use with the iocolors vector (found in the Ptolemy package 24 | data).} 25 | 26 | \item{max_sum_rgb}{Don't return any colors with total rgb values above this 27 | level. (Removes excessively light colors.)} 28 | 29 | \item{palette}{One of "tableau20", "brewers" or "earthplus".} 30 | } 31 | \value{ 32 | A named color vector 33 | } 34 | \description{ 35 | Uses Giotto::getDistinctColors to begin with. Orders colors so the most 36 | common cell types get the lightest colors. Removes colors that are too light 37 | (sum of rgb values > 600) 38 | } 39 | \examples{ 40 | data("mini_nsclc") 41 | unsup <- insitutype( 42 | x = mini_nsclc$counts, 43 | neg = Matrix::rowMeans(mini_nsclc$neg), 44 | n_clusts = 8, 45 | n_phase1 = 200, 46 | n_phase2 = 500, 47 | n_phase3 = 2000, 48 | n_starts = 1, 49 | max_iters = 5, 50 | assay_type="RNA" 51 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended. 
52 | colorCellTypes(freqs = table(unsup$clust), palette = "brewers") 53 | } 54 | -------------------------------------------------------------------------------- /man/estimateBackground.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{estimateBackground} 4 | \alias{estimateBackground} 5 | \title{Prepare bg data for other functions} 6 | \usage{ 7 | estimateBackground(counts, neg, bg = NULL) 8 | } 9 | \arguments{ 10 | \item{counts}{Counts matrix, cells * genes.} 11 | 12 | \item{neg}{Vector of mean negprobe counts per cell} 13 | 14 | \item{bg}{Expected background} 15 | } 16 | \value{ 17 | A named vector for the estimated background of each cell 18 | } 19 | \description{ 20 | Process neg data or bg to get background for each cell 21 | } 22 | -------------------------------------------------------------------------------- /man/estimatePlatformEffects.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rescaleProfiles.R 3 | \name{estimatePlatformEffects} 4 | \alias{estimatePlatformEffects} 5 | \title{Platform effect adjustment on reference profiles based on the expression profiles of anchors} 6 | \usage{ 7 | estimatePlatformEffects( 8 | counts, 9 | neg, 10 | assay_type = c("rna", "protein"), 11 | bg = NULL, 12 | anchors, 13 | profiles, 14 | sds = NULL, 15 | blacklist = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{counts}{Counts matrix, cells * genes.} 20 | 21 | \item{neg}{Vector of mean negprobe counts per cell} 22 | 23 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 24 | 25 | \item{bg}{Expected background} 26 | 27 | \item{anchors}{Vector giving "anchor" cell types, for use in semi-supervised 28 | clustering. 
Vector elements will be mainly NA's (for non-anchored cells) 29 | and cell type names for cells to be held constant throughout iterations.} 30 | 31 | \item{profiles}{Matrix of reference profiles holding mean expression of genes x cell types. 32 | Input linear-scale expression, with genes in rows and cell types in columns.} 33 | 34 | \item{sds}{Matrix of reference profiles holding SDs expression of genes x cell types. 35 | Input linear-scale expression, with genes in rows and cell types in columns. Only for assay_type of protein} 36 | 37 | \item{blacklist}{vector of user-defined genes to be excluded for cell typing (default = NULL)} 38 | } 39 | \value{ 40 | A list with five elements: 41 | \describe{ 42 | \item{rescaled_profiles}{genes * cell types Matrix of rescaled reference profiles with platform effect corrected } 43 | \item{platformEff_statsDF}{a data.frame for statistics on platform effect estimation with genes in rows and columns for `Gene`, `Beta`, `beta_SE`.} 44 | \item{anchors}{a named vector of anchors used for platform effect estimation} 45 | \item{blacklist}{a vector of genes excluded from cell typing, including both outliers identified in platform effect estimation and the user-defined genes} 46 | \item{lostgenes}{a vector of genes excluded from platform effect estimation due to insufficient evidence} 47 | } 48 | } 49 | \description{ 50 | The general workflow would be: (1) extract the anchor cells from input; 51 | (2) Run Poisson regression with anchor cells; (3) Filter user defined genes (if any) 52 | and genes with extreme betas, outside [0.01, 100]; (4) Re-scale Profile with Beta estimates. 53 | } 54 | \details{ 55 | Calculates gene-wise scaling factor between reference profiles and the observed profiles of the provided anchors. 
56 | } 57 | -------------------------------------------------------------------------------- /man/fastCohorting.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fastCohorting.R 3 | \name{fastCohorting} 4 | \alias{fastCohorting} 5 | \title{Quickly split cells into cohorts} 6 | \usage{ 7 | fastCohorting(mat, n_cohorts = NULL, gaussian_transform = TRUE) 8 | } 9 | \arguments{ 10 | \item{mat}{Matrix of variables to be used in cohorting, cells in rows, and variables in columns. 11 | Recommended to use < 20 variables.} 12 | 13 | \item{n_cohorts}{Number of clusters to divide cells into} 14 | 15 | \item{gaussian_transform}{Whether to map each variable onto the quantiles of a normal distribution.} 16 | } 17 | \value{ 18 | A vector of cohort assignments. 19 | } 20 | \description{ 21 | Quickly split cells into cohorts using non-RNA data like spatial context and immunofluorescence values. 22 | Rule of thumb: include any variables that might be informative for cell typing, 23 | *except* variables you'll want to analyze later. For example, if you'll later 24 | perform differential expression as a function of spatial context, then it's 25 | safer to exclude spatial context from the cell typing exercise (and therefore 26 | from this function). 
27 | } 28 | \examples{ 29 | data("mini_nsclc") 30 | ## simulate immunofluorescence data: 31 | immunofluordata <- matrix(rpois(n = nrow(mini_nsclc$counts) * 4, lambda = 100), 32 | nrow(mini_nsclc$counts)) 33 | cohort <- fastCohorting(immunofluordata, gaussian_transform = TRUE) 34 | table(cohort) 35 | } 36 | -------------------------------------------------------------------------------- /man/find_anchor_cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_anchor_cells.R 3 | \name{find_anchor_cells} 4 | \alias{find_anchor_cells} 5 | \title{Choose anchor cells} 6 | \usage{ 7 | find_anchor_cells( 8 | counts, 9 | neg = NULL, 10 | bg = NULL, 11 | align_genes = TRUE, 12 | profiles, 13 | sds = NULL, 14 | size = 10, 15 | assay_type = c("rna", "protein"), 16 | n_cells = 500, 17 | min_cosine = 0.3, 18 | min_scaled_llr = 0.01, 19 | insufficient_anchors_thresh = 20, 20 | refinement = FALSE 21 | ) 22 | } 23 | \arguments{ 24 | \item{counts}{Counts matrix, cells * genes.} 25 | 26 | \item{neg}{Vector of mean negprobe counts per cell} 27 | 28 | \item{bg}{Expected background} 29 | 30 | \item{align_genes}{Logical, for whether to align the columns of the counts 31 | matrix and the rows of the profiles matrix based on their names.} 32 | 33 | \item{profiles}{Matrix of reference profiles holding mean expression of genes 34 | x cell types. Input linear-scale expression, with genes in rows and cell 35 | types in columns.} 36 | 37 | \item{sds}{Matrix of reference profiles holding SDs expression of genes x cell types. 38 | Input linear-scale expression, with genes in rows and cell types in columns. Only for assay_type of protein} 39 | 40 | \item{size}{Negative binomial size parameter to be used in likelihood calculation. 
Only for assay_type of RNA} 41 | 42 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 43 | 44 | \item{n_cells}{Up to this many cells will be taken as anchor points} 45 | 46 | \item{min_cosine}{Cells must have at least this much cosine similarity to a 47 | fixed profile to be used as an anchor} 48 | 49 | \item{min_scaled_llr}{Cells must have (log-likelihood ratio / totalcounts) 50 | above this threshold to be used as an anchor} 51 | 52 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than 53 | this many anchors will be discarded.} 54 | 55 | \item{refinement}{flag to further refine the anchors via UMAP projection (default = FALSE)} 56 | } 57 | \value{ 58 | A vector holding anchor cell assignments (or NA) for each cell in the 59 | counts matrix 60 | } 61 | \description{ 62 | Finds cells with very good fits to the reference profiles, and saves these 63 | cells for use as "anchors" in the semi-supervised learning version of 64 | nbclust. The function would first pick anchor cell candidates through stats 65 | and then refine anchors based on umap projection. 
66 | } 67 | \examples{ 68 | data("ioprofiles") 69 | data("mini_nsclc") 70 | sharedgenes <- intersect(colnames(mini_nsclc$counts), rownames(ioprofiles)) 71 | find_anchor_cells(counts = mini_nsclc$counts[, sharedgenes], 72 | assay_type="RNA", 73 | sds=NULL, 74 | neg = Matrix::rowMeans(mini_nsclc$neg), 75 | profiles = ioprofiles) 76 | } 77 | -------------------------------------------------------------------------------- /man/flightpath_layout.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/flightpath_layout.R 3 | \name{flightpath_layout} 4 | \alias{flightpath_layout} 5 | \title{"Flightpath" (umap-like) plot of clustering results} 6 | \usage{ 7 | flightpath_layout( 8 | logliks = NULL, 9 | probs = NULL, 10 | profiles = NULL, 11 | cluster_xpos = NULL, 12 | cluster_ypos = NULL 13 | ) 14 | } 15 | \arguments{ 16 | \item{logliks}{Matrix of cells' log-likelihoods under each cluster. Must 17 | provide this or probs argument.} 18 | 19 | \item{probs}{Matrix of cells' probabilities of belonging to each cluster. 20 | Must provide this or logliks argument.} 21 | 22 | \item{profiles}{Matrix of cell type mean expression profiles. If provided, 23 | profiles rather than probs will be used to lay out the centroids.} 24 | 25 | \item{cluster_xpos}{Vector of cluster centroids' x positions (i.e. where you 26 | want each cell type to appear in the plot)} 27 | 28 | \item{cluster_ypos}{Vector of cluster centroids' y positions} 29 | } 30 | \value{ 31 | A list with two elements: \enumerate{ \item clustpos: a matrix of 32 | cluster centroids * x,y positions in the flightpath plot \item cellpos: A 33 | matrix of cells * x,y positions in the flightpath plot } 34 | } 35 | \description{ 36 | Arrays cells in 2d space based on their probability of belonging to a given 37 | cluster. 
38 | } 39 | \examples{ 40 | data("mini_nsclc") 41 | unsup <- insitutype( 42 | x = mini_nsclc$counts, 43 | neg = Matrix::rowMeans(mini_nsclc$neg), 44 | assay_type = "RNA", 45 | n_clusts = 8, 46 | n_phase1 = 200, 47 | n_phase2 = 500, 48 | n_phase3 = 2000, 49 | n_starts = 1, 50 | max_iters = 5 51 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended. 52 | flightpath_layout(logliks = unsup$logliks, profiles = unsup$profiles) 53 | } 54 | -------------------------------------------------------------------------------- /man/flightpath_plot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/flightpath_layout.R 3 | \name{flightpath_plot} 4 | \alias{flightpath_plot} 5 | \title{Plot flightpath results} 6 | \usage{ 7 | flightpath_plot( 8 | flightpath_result = NULL, 9 | insitutype_result = NULL, 10 | col = NULL, 11 | showclusterconfidence = TRUE 12 | ) 13 | } 14 | \arguments{ 15 | \item{flightpath_result}{The list output by the flightpath_layout function. 16 | Two elements: clustpos, cellpos. Must provide either this or 17 | insitutype_result.} 18 | 19 | \item{insitutype_result}{The list output by insitutype or insitutypeML. Must 20 | provide either this or flightpath_result.} 21 | 22 | \item{col}{Optional, a vector of cell colors, with length equal to the number 23 | of individual cells.} 24 | 25 | \item{showclusterconfidence}{Logical, for whether to label clusters with the 26 | average posterior probability of the cells within them.
Gives a readout of 27 | how distinct a cluster is from the others.} 28 | } 29 | \value{ 30 | a ggplot object 31 | } 32 | \description{ 33 | Plot flightpath results 34 | } 35 | \examples{ 36 | data("ioprofiles") 37 | unsup <- insitutype( 38 | x = mini_nsclc$counts, 39 | neg = Matrix::rowMeans(mini_nsclc$neg), 40 | n_clusts = 8, 41 | n_phase1 = 200, 42 | n_phase2 = 500, 43 | n_phase3 = 2000, 44 | n_starts = 1, 45 | max_iters = 5, 46 | assay_type="RNA" 47 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended. 48 | flightpath_plot(insitutype_result = unsup) 49 | } 50 | -------------------------------------------------------------------------------- /man/gen_profiles_protein_annotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_profiles_protein.R 3 | \name{gen_profiles_protein_annotation} 4 | \alias{gen_profiles_protein_annotation} 5 | \title{Generate the mean reference profile and its SD reference profile from an annotation file 6 | This function is only for protein data set with known anchor cells and their cell types} 7 | \usage{ 8 | gen_profiles_protein_annotation(exp.mat, anno) 9 | } 10 | \arguments{ 11 | \item{exp.mat}{a matrix of raw protein expression data. cells are in rows and proteins are in columns} 12 | 13 | \item{anno}{a data frame or matrix of cell types for anchor cells or manually annotated cell typing information for some cells. Should include cell_ID and celltype at least.} 14 | } 15 | \value{ 16 | A list, with the following elements: 17 | \enumerate{ 18 | \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins * cell types 19 | \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins * cell types 20 | \item anchors: a vector giving "anchor" cell types.
Vector elements will be mainly NA's (for non-anchored cells) 21 | } 22 | } 23 | \description{ 24 | Generate the mean reference profile and its SD reference profile from an annotation file 25 | This function is only for protein data set with known anchor cells and their cell types 26 | } 27 | -------------------------------------------------------------------------------- /man/gen_profiles_protein_expression.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gen_profiles_protein.R 3 | \name{gen_profiles_protein_expression} 4 | \alias{gen_profiles_protein_expression} 5 | \title{Generate the mean reference profile and its SD reference profile based on the data itself 6 | This function is based on signature matrix included in CELESTA package 7 | First, we rebuild a nested cell typing lists based on the 2-D signature matrix 8 | Second, we identify anchor cells ranked by their expression level for each cell type's protein marker 9 | Third, we estimate averaged expression level and SDs for proteins and cell types using the anchors} 10 | \usage{ 11 | gen_profiles_protein_expression( 12 | exp.mat, 13 | sig_mat = NULL, 14 | cutoff = 0.9, 15 | min.num.cells = 30, 16 | keep_marker_proteins = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{exp.mat}{a matrix of raw protein expression data. cells are in rows and proteins are in columns} 21 | 22 | \item{sig_mat}{a signature matrix of cell types. cell types x protein markers} 23 | 24 | \item{cutoff}{a cutoff of quantile. e.g) cutoff=0.9 means that top 90 percentiles of cells are called anchors for the protein expression} 25 | 26 | \item{min.num.cells}{a minimum number of cells each cell type to estimate its mean or SDs. default value is 30.} 27 | 28 | \item{keep_marker_proteins}{whether just marker proteins from the signature matrix is kept. 
default value is FALSE, which returns all proteins included in the data} 29 | } 30 | \value{ 31 | A list, with the following elements: 32 | \enumerate{ 33 | \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins x cell types 34 | \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins x cell types 35 | \item anchors: a vector giving "anchor" cell types. Vector elements will be mainly NA's (for non-anchored cells) 36 | } 37 | } 38 | \description{ 39 | Generate the mean reference profile and its SD reference profile based on the data itself 40 | This function is based on signature matrix included in CELESTA package 41 | First, we rebuild a nested cell typing lists based on the 2-D signature matrix 42 | Second, we identify anchor cells ranked by their expression level for each cell type's protein marker 43 | Third, we estimate averaged expression level and SDs for proteins and cell types using the anchors 44 | } 45 | \examples{ 46 | data("tonsil_protein") 47 | data("human_signature") 48 | data("mouse_signature") 49 | references <- gen_profiles_protein_expression( 50 | exp.mat=tonsil_protein$counts, 51 | sig_mat=NULL) 52 | } 53 | -------------------------------------------------------------------------------- /man/geoSketch.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geoSketch.R 3 | \name{geoSketch} 4 | \alias{geoSketch} 5 | \title{Function for creating a biased sample of a given dataset with the aim of retaining cells with unique expression vectors} 6 | \usage{ 7 | geoSketch( 8 | X, 9 | N, 10 | alpha = 0.1, 11 | max_iter = 200, 12 | returnBins = FALSE, 13 | minCellsPerBin = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{feature matrix with cellIDs as rows and featureIDs as columns (can be counts, normalized expression, PCA, UMAP, etc.)} 18 | 19 | \item{N}{desired sample size} 
20 | 21 | \item{alpha}{defines the acceptable minimum number of bins to sample from as `(1-alpha)*N`} 22 | 23 | \item{max_iter}{maximum number of iterations used to achieve an acceptable minimum number of bins} 24 | 25 | \item{returnBins}{determines whether or not to pass back bin labels for each cell} 26 | 27 | \item{minCellsPerBin}{the minimum number of cells required for a bin to be considered for sampling} 28 | } 29 | \value{ 30 | sampledCells, a vector of cellIDs sampled using the geometric sketching method 31 | 32 | Plaid, a named vector of binIDs where names correspond to cellIDs 33 | } 34 | \description{ 35 | Function for creating a biased sample of a given dataset with the aim of retaining cells with unique expression vectors 36 | } 37 | \examples{ 38 | data("mini_nsclc") 39 | geoSketch(mini_nsclc$counts, 200) 40 | } 41 | -------------------------------------------------------------------------------- /man/geoSketch_get_plaid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geoSketch.R 3 | \name{geoSketch_get_plaid} 4 | \alias{geoSketch_get_plaid} 5 | \title{Bin cells into "plaids"} 6 | \usage{ 7 | geoSketch_get_plaid( 8 | X, 9 | N, 10 | alpha = 0.1, 11 | max_iter = 200, 12 | returnBins = FALSE, 13 | minCellsPerBin = 1 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{feature matrix with cellIDs as rows and featureIDs as columns (can 18 | be counts, normalized expression, PCA, UMAP, etc.)} 19 | 20 | \item{N}{desired sample size} 21 | 22 | \item{alpha}{defines the acceptable minimum number of bins to sample from as 23 | `(1-alpha)*N`} 24 | 25 | \item{max_iter}{maximum number of iterations used to achieve an acceptable 26 | minimum number of bins} 27 | 28 | \item{returnBins}{determines whether or not to pass back bin labels for each 29 | cell} 30 | 31 | \item{minCellsPerBin}{the minimum number of cells required for a bin to be 32 | considered 
for sampling} 33 | } 34 | \value{ 35 | Plaid, a named vector of binIDs where names correspond to cellIDs 36 | } 37 | \description{ 38 | Assign cells to "plaids", very rough clusters. 39 | } 40 | \examples{ 41 | data("mini_nsclc") 42 | geoSketch_get_plaid(mini_nsclc$counts, 100) 43 | } 44 | -------------------------------------------------------------------------------- /man/geoSketch_sample_from_plaids.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geoSketch.R 3 | \name{geoSketch_sample_from_plaids} 4 | \alias{geoSketch_sample_from_plaids} 5 | \title{Subsample from plaids} 6 | \usage{ 7 | geoSketch_sample_from_plaids(Plaid, N) 8 | } 9 | \arguments{ 10 | \item{Plaid}{Vector of cells' plaid IDs} 11 | 12 | \item{N}{desired sample size} 13 | } 14 | \value{ 15 | Plaid, a named vector of binIDs where names correspond to cellIDs 16 | 17 | sampledCells, a vector of cellIDs sampled using the geometric sketching method 18 | } 19 | \description{ 20 | Sample cells, trying to give each plaid equal representation 21 | } 22 | \examples{ 23 | data("mini_nsclc") 24 | plaids <- geoSketch_get_plaid(mini_nsclc$counts, 100) 25 | geoSketch_sample_from_plaids(plaids, 5) 26 | } 27 | -------------------------------------------------------------------------------- /man/getMeanClusterConfidence.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/flightpath_layout.R 3 | \name{getMeanClusterConfidence} 4 | \alias{getMeanClusterConfidence} 5 | \title{Summarize clusters' mean confidence} 6 | \usage{ 7 | getMeanClusterConfidence(probs) 8 | } 9 | \arguments{ 10 | \item{probs}{Matrix of probabilities} 11 | } 12 | \value{ 13 | a vector of mean confidences, with values of 1 corresponding to clusters with only prob == 1 14 | } 15 | \description{ 16 | Calculate 
the mean confidence of the cell calls from each cluster 17 | } 18 | \examples{ 19 | data("mini_nsclc") 20 | probs <- sapply(rownames(mini_nsclc$counts), function(x) {a = runif(10); a/sum(a)}) 21 | dimnames(probs)[[1]] <- letters[1:10] 22 | probs <- t(probs) 23 | getMeanClusterConfidence(probs) 24 | } 25 | -------------------------------------------------------------------------------- /man/getProteinParameters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getProfiles.R 3 | \name{getProteinParameters} 4 | \alias{getProteinParameters} 5 | \title{Extract mean profiles and SDs of protein data} 6 | \usage{ 7 | getProteinParameters(x, clust) 8 | } 9 | \arguments{ 10 | \item{x}{Expression matrix, cells * proteins.} 11 | 12 | \item{clust}{Vector of cluster assignments, or a matrix of probabilities 13 | of cells (rows) belonging to clusters (columns).} 14 | 15 | \item{neg}{Vector of mean background counts} 16 | } 17 | \value{ 18 | List with two elements: "profiles", a matrix of protein x cell type expression profiles, and "sds", a matrix of SD's. 19 | } 20 | \description{ 21 | Given cell assignments and count data, estimate the mean 22 | profile of each cluster.
23 | } 24 | -------------------------------------------------------------------------------- /man/getRNAprofiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getProfiles.R 3 | \name{getRNAprofiles} 4 | \alias{getRNAprofiles} 5 | \title{Extract mean background-subtracted profiles of RNA data} 6 | \usage{ 7 | getRNAprofiles(x, neg, clust) 8 | } 9 | \arguments{ 10 | \item{x}{Counts matrix, cells * genes.} 11 | 12 | \item{neg}{Vector of mean background counts (or a single value applied to all cells)} 13 | 14 | \item{clust}{Vector of cluster assignments, or a matrix of probabilities 15 | of cells (rows) belonging to clusters (columns).} 16 | } 17 | \value{ 18 | A matrix of gene x cell type expression profiles. 19 | } 20 | \description{ 21 | Given cell assignments and count data, estimate the mean 22 | profile of each cluster. 23 | } 24 | -------------------------------------------------------------------------------- /man/getSpatialContext.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSpatialContext.R 3 | \name{getSpatialContext} 4 | \alias{getSpatialContext} 5 | \title{Get the neighborhood expression profile around all cells} 6 | \usage{ 7 | getSpatialContext( 8 | counts, 9 | xy, 10 | tissue = NULL, 11 | N = 50, 12 | rad = NULL, 13 | dim_reduce_to = NULL 14 | ) 15 | } 16 | \arguments{ 17 | \item{counts}{Counts matrix} 18 | 19 | \item{xy}{2-column matrix of cells' xy positions} 20 | 21 | \item{tissue}{vector of tissue IDs. Used to ensure cells for different tissues are never called neighbors} 22 | 23 | \item{N}{number of neighbors to use. Specify this or \code{rad}.} 24 | 25 | \item{rad}{radius to use to define neighbors. 
Specify this or \code{N}.} 26 | 27 | \item{dim_reduce_to}{If entered, the neighborhood matrix will be reduced to this many PCs} 28 | } 29 | \value{ 30 | A matrix of neighborhood expression, potentially by gene, or else by PCs if \code{dim_reduce_to} was set. 31 | } 32 | \description{ 33 | Get the neighborhood expression profile around all cells 34 | } 35 | -------------------------------------------------------------------------------- /man/get_anchor_stats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_anchor_cells.R 3 | \name{get_anchor_stats} 4 | \alias{get_anchor_stats} 5 | \title{Get anchor stats} 6 | \usage{ 7 | get_anchor_stats( 8 | counts, 9 | neg = NULL, 10 | bg = NULL, 11 | align_genes = TRUE, 12 | profiles, 13 | sds = NULL, 14 | size = 10, 15 | assay_type = c("rna", "protein"), 16 | min_cosine = 0.3 17 | ) 18 | } 19 | \arguments{ 20 | \item{counts}{Counts matrix, cells * genes.} 21 | 22 | \item{neg}{Vector of mean negprobe counts per cell} 23 | 24 | \item{bg}{Expected background} 25 | 26 | \item{align_genes}{Logical, for whether to align the columns of the counts matrix and the rows of 27 | the profiles matrix based on their names.} 28 | 29 | \item{profiles}{Matrix of reference profiles holding mean expression of genes x cell types. 30 | Input linear-scale expression, with genes in rows and cell types in columns.} 31 | 32 | \item{sds}{Matrix of reference profiles holding SDs expression of genes x cell types. 33 | Input linear-scale expression, with genes in rows and cell types in columns. 
Only for assay_type of protein} 34 | 35 | \item{size}{Negative binomial size parameter to be used in likelihood calculation.} 36 | 37 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 38 | 39 | \item{min_cosine}{Cells must have at least this much cosine similarity to a fixed profile to be used as an anchor.} 40 | } 41 | \value{ 42 | A list with two elements: cos, the matrix of cosine distances; 43 | and llr, the matrix of log likelihood ratios of each cell under each cell type vs. the 2nd best cell type. 44 | } 45 | \description{ 46 | Compute the statistics used in finding anchor cells. 47 | Often the anchor cell selection process will involve some trial-and-error. 48 | This function performs the computationally-expensive steps that only need to 49 | happen once. 50 | } 51 | \examples{ 52 | data("ioprofiles") 53 | data("mini_nsclc") 54 | get_anchor_stats(counts = mini_nsclc$counts, 55 | neg = Matrix::rowMeans(mini_nsclc$neg), 56 | profiles = ioprofiles, 57 | sds=NULL, 58 | assay_type = "RNA") 59 | } 60 | -------------------------------------------------------------------------------- /man/get_neighborhood_expression.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSpatialContext.R 3 | \name{get_neighborhood_expression} 4 | \alias{get_neighborhood_expression} 5 | \title{Calculate neighborhood expression} 6 | \usage{ 7 | get_neighborhood_expression(counts, neighbors) 8 | } 9 | \arguments{ 10 | \item{counts}{Single cell expression matrix} 11 | 12 | \item{neighbors}{A neighbors adjacency matrix} 13 | } 14 | \value{ 15 | A matrix in the same dimensions as \code{counts}, giving the expression profile of each cell's neighborhood. 
16 | } 17 | \description{ 18 | Calculates the expression profile of each cell's neighborhood 19 | } 20 | -------------------------------------------------------------------------------- /man/human_signature.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{human_signature} 5 | \alias{human_signature} 6 | \title{Example human marker proteins 7 | For inputting into \code{gen_profiles_protein_expression()}} 8 | \format{ 9 | A matrix of 11844 cells and 2 columns 10 | } 11 | \usage{ 12 | human_signature 13 | } 14 | \description{ 15 | data frame 16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /man/insitutype.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/insitutype.R 3 | \name{insitutype} 4 | \alias{insitutype} 5 | \alias{insitutype,ANY-method} 6 | \alias{insitutype,SingleCellExperiment-method} 7 | \title{Run insitutype.} 8 | \usage{ 9 | insitutype(x, ...)
10 | 11 | \S4method{insitutype}{ANY}( 12 | x, 13 | neg, 14 | assay_type = c("rna", "protein"), 15 | bg = NULL, 16 | anchors = NULL, 17 | cohort = NULL, 18 | n_clusts, 19 | reference_profiles = NULL, 20 | reference_sds = NULL, 21 | update_reference_profiles = TRUE, 22 | sketchingdata = NULL, 23 | align_genes = TRUE, 24 | nb_size = 10, 25 | init_clust = NULL, 26 | n_starts = 10, 27 | n_benchmark_cells = 10000, 28 | n_phase1 = 10000, 29 | n_phase2 = 20000, 30 | n_phase3 = 1e+05, 31 | n_chooseclusternumber = 2000, 32 | pct_drop = 1/10000, 33 | min_prob_increase = 0.05, 34 | max_iters = 40, 35 | n_anchor_cells = 2000, 36 | min_anchor_cosine = 0.3, 37 | min_anchor_llr = 0.03, 38 | insufficient_anchors_thresh = 20, 39 | refinement = FALSE, 40 | rescale = TRUE, 41 | refit = TRUE 42 | ) 43 | 44 | \S4method{insitutype}{SingleCellExperiment}(x, ..., assay.type = "counts") 45 | } 46 | \arguments{ 47 | \item{x}{Counts matrix (or dgCMatrix), cells * genes. 48 | 49 | Alternatively, a \linkS4class{SingleCellExperiment} object containing such 50 | a matrix.} 51 | 52 | \item{...}{For the \linkS4class{SingleCellExperiment} method, additional 53 | arguments to pass to the ANY method.} 54 | 55 | \item{neg}{Vector of mean negprobe counts per cell} 56 | 57 | \item{assay_type}{Assay type of rna, protein (default = "rna")} 58 | 59 | \item{bg}{Expected background} 60 | 61 | \item{anchors}{Vector giving "anchor" cell types, for use in semi-supervised 62 | clustering. Vector elements will be mainly NA's (for non-anchored cells) 63 | and cell type names for cells to be held constant throughout iterations.} 64 | 65 | \item{cohort}{Vector of cells' cohort memberships} 66 | 67 | \item{n_clusts}{Number of clusters, in addition to any pre-specified cell 68 | types. Enter 0 to run purely supervised cell typing from fixed profiles. 
69 | Enter a range of integers to automatically select the optimal number of 70 | clusters.} 71 | 72 | \item{reference_profiles}{Matrix of mean expression profiles of pre-defined 73 | clusters, e.g. from previous scRNA-seq. These profiles will not be updated 74 | by the EM algorithm. Columns must all be included in the init_clust 75 | variable.} 76 | 77 | \item{reference_sds}{Matrix of standard deviation profiles of pre-defined 78 | clusters. These SD profiles also will not be updated by the EM algorithm. 79 | Columns must all be included in the init_clust variable. This parameter should 80 | be defined if assay_type is protein. Default is NULL.} 81 | 82 | \item{update_reference_profiles}{Logical, for whether to use the data to 83 | update the reference profiles. Default and strong recommendation is TRUE. 84 | (However, if the reference profiles are from the same platform as the 85 | study, then FALSE could be better.)} 86 | 87 | \item{sketchingdata}{Optional matrix of data for use in non-random sampling 88 | via "sketching". If not provided, then the data's first 20 PCs will be 89 | used.} 90 | 91 | \item{align_genes}{Logical, for whether to align the counts matrix and the 92 | fixed_profiles by gene ID.} 93 | 94 | \item{nb_size}{The size parameter to assume for the NB distribution. This 95 | parameter is only for RNA.} 96 | 97 | \item{init_clust}{Vector of initial cluster assignments. 
If NULL, initial 98 | assignments will be automatically inferred.} 99 | 100 | \item{n_starts}{the number of iterations} 101 | 102 | \item{n_benchmark_cells}{the number of cells for benchmarking} 103 | 104 | \item{n_phase1}{Subsample size for phase 1 (random starts)} 105 | 106 | \item{n_phase2}{Subsample size for phase 2 (refining in a larger subset)} 107 | 108 | \item{n_phase3}{Subsample size for phase 3 (getting final solution in a very 109 | large subset)} 110 | 111 | \item{n_chooseclusternumber}{Subsample size for choosing an optimal number of 112 | clusters} 113 | 114 | \item{pct_drop}{the decrease in percentage of cell types with a valid 115 | switchover to another cell type compared to the last iteration. Default 116 | value: 1/10000. A valid switchover is only applicable when a cell has 117 | changed the assigned cell type with its highest cell type probability 118 | increased by min_prob_increase.} 119 | 120 | \item{min_prob_increase}{the threshold of probability used to determine a 121 | valid cell type switchover} 122 | 123 | \item{max_iters}{Maximum number of iterations.} 124 | 125 | \item{n_anchor_cells}{For semi-supervised learning. Maximum number of anchor 126 | cells to use for each cell type.} 127 | 128 | \item{min_anchor_cosine}{For semi-supervised learning. Cells must have at 129 | least this much cosine similarity to a fixed profile to be used as an 130 | anchor.} 131 | 132 | \item{min_anchor_llr}{For semi-supervised learning. 
Cells must have 133 | (log-likelihood ratio / totalcounts) above this threshold to be used as an 134 | anchor} 135 | 136 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than 137 | this many anchors after anchor selection will be discarded.} 138 | 139 | \item{refinement}{Logical, flag for further anchor refinement, used when update_reference_profiles = TRUE (default = FALSE)} 140 | 141 | \item{rescale}{Logical, flag for platform effect correction, used when update_reference_profiles = TRUE (default = TRUE)} 142 | 143 | \item{refit}{Logical, flag for fitting reference profiles to anchors, used when update_reference_profiles = TRUE (default = TRUE)} 144 | 145 | \item{assay.type}{A string specifying which assay values to use.} 146 | } 147 | \value{ 148 | A list, with the following elements: \enumerate{ \item clust: a 149 | vector giving cells' cluster assignments \item prob: a vector giving the 150 | confidence in each cell's cluster \item logliks: Matrix of cells' 151 | log-likelihoods under each cluster. Cells in rows, clusters in columns. 152 | \item profiles: a matrix of cluster-specific expression profiles \item 153 | anchors: from semi-supervised clustering: a vector giving the identities 154 | and cell types of anchor cells } 155 | } 156 | \description{ 157 | A wrapper for nbclust, to manage subsampling and multiple random starts. 158 | } 159 | \examples{ 160 | data("mini_nsclc") 161 | unsup <- insitutype( 162 | x = mini_nsclc$counts, 163 | neg = Matrix::rowMeans(mini_nsclc$neg), 164 | assay_type = "rna", 165 | n_clusts = 8, 166 | n_phase1 = 200, 167 | n_phase2 = 500, 168 | n_phase3 = 2000, 169 | n_starts = 1, 170 | max_iters = 5 171 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
172 | table(unsup$clust) 173 | } 174 | -------------------------------------------------------------------------------- /man/insitutypeML.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/insitutypeML.R 3 | \name{insitutypeML} 4 | \alias{insitutypeML} 5 | \alias{insitutypeML,ANY-method} 6 | \alias{insitutypeML,SingleCellExperiment-method} 7 | \title{Classify cells based on reference profiles} 8 | \usage{ 9 | insitutypeML(x, ...) 10 | 11 | \S4method{insitutypeML}{ANY}( 12 | x, 13 | neg = NULL, 14 | bg = NULL, 15 | cohort = NULL, 16 | reference_profiles, 17 | reference_sds = NULL, 18 | nb_size = 10, 19 | assay_type = c("rna", "protein"), 20 | align_genes = TRUE 21 | ) 22 | 23 | \S4method{insitutypeML}{SingleCellExperiment}(x, ..., assay.type = "counts") 24 | } 25 | \arguments{ 26 | \item{x}{Counts matrix (or dgCMatrix), cells * genes. 27 | 28 | Alternatively, a \linkS4class{SingleCellExperiment} object containing such 29 | a matrix.} 30 | 31 | \item{...}{For the \linkS4class{SingleCellExperiment} method, additional 32 | arguments to pass to the ANY method.} 33 | 34 | \item{neg}{Vector of mean negprobe counts per cell. Can be provided} 35 | 36 | \item{bg}{Expected background} 37 | 38 | \item{cohort}{Vector of cells' cohort memberships} 39 | 40 | \item{reference_profiles}{Matrix of expression profiles of pre-defined clusters, 41 | e.g. from previous scRNA-seq. These profiles will not be updated by the EM algorithm. 42 | Colnames must all be included in the init_clust variable.} 43 | 44 | \item{reference_sds}{Matrix of standard deviation profiles of pre-defined 45 | clusters. These SD profiles also will not be updated by the EM algorithm. 46 | Columns must all be included in the init_clust variable. This parameter should 47 | be defined if assay_type is protein. 
Default is NULL.} 48 | 49 | \item{nb_size}{The size parameter to assume for the NB distribution.} 50 | 51 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 52 | 53 | \item{align_genes}{Logical, for whether to align the counts matrix and the reference_profiles by gene ID.} 54 | 55 | \item{assay.type}{A string specifying which assay values to use.} 56 | } 57 | \value{ 58 | A list, with the following elements: 59 | \enumerate{ 60 | \item clust: a vector giving cells' cluster assignments 61 | \item prob: a vector giving the confidence in each cell's cluster 62 | \item profiles: Matrix of clusters' mean background-subtracted profiles 63 | \item logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns. 64 | } 65 | } 66 | \description{ 67 | Supervised classification of cells. Each cell is assigned to the cell type 68 | under which its observed expression profile is most likely. 69 | } 70 | \examples{ 71 | data("mini_nsclc") 72 | data("ioprofiles") 73 | sup <- insitutypeML( 74 | x = mini_nsclc$counts, 75 | neg = Matrix::rowMeans(mini_nsclc$neg), 76 | reference_profiles = ioprofiles, 77 | assay_type = "RNA") 78 | table(sup$clust) 79 | } 80 | -------------------------------------------------------------------------------- /man/iocolors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{iocolors} 5 | \alias{iocolors} 6 | \title{Default colors for the cell types in the ioprofiles matrix} 7 | \format{ 8 | A named vector 9 | } 10 | \usage{ 11 | iocolors 12 | } 13 | \description{ 14 | A named vector of colors, giving colors for the cell types of the ioprofiles 15 | matrix.
16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /man/ioprofiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ioprofiles} 5 | \alias{ioprofiles} 6 | \title{Matrix of immune cell profiles} 7 | \format{ 8 | A matrix of 27161 genes x 16 cell types. 9 | } 10 | \usage{ 11 | ioprofiles 12 | } 13 | \description{ 14 | A matrix of gene * cell type expected expression values 15 | } 16 | \keyword{datasets} 17 | -------------------------------------------------------------------------------- /man/ismax.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nbclust.R 3 | \name{ismax} 4 | \alias{ismax} 5 | \title{For a numeric object, return a logical object of whether each element is the max or not.} 6 | \usage{ 7 | ismax(x) 8 | } 9 | \arguments{ 10 | \item{x}{a vector of values} 11 | } 12 | \value{ 13 | a vector of logical values 14 | } 15 | \description{ 16 | For a numeric object, return a logical object of whether each element is the max or not.
17 | } 18 | \examples{ 19 | ismax(c(3, 5, 5, 2)) 20 | } 21 | -------------------------------------------------------------------------------- /man/lldist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nbclust.R 3 | \name{lldist} 4 | \alias{lldist} 5 | \title{Calculate the likelihood of the expression mat 6 | using the reference profiles of x} 7 | \usage{ 8 | lldist( 9 | x, 10 | xsd = NULL, 11 | mat, 12 | bg = 0.01, 13 | size = 10, 14 | digits = 2, 15 | assay_type = c("rna", "protein") 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{a vector of a reference mean profile for the cell type} 20 | 21 | \item{xsd}{a vector of a reference standard deviation profile for the cell type} 22 | 23 | \item{mat}{a matrix of expression levels in all cells: for Protein data, we use raw data for calculating the scaling factor} 24 | 25 | \item{bg}{background level (default: 0.01)} 26 | 27 | \item{size}{the parameters for dnbinom function (default: 10)} 28 | 29 | \item{digits}{the number of digits for rounding} 30 | 31 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 32 | } 33 | \value{ 34 | likelihood for profile 35 | 36 | cells x profiles matrix of log likelihoods 37 | } 38 | \description{ 39 | Calculate the likelihood of the expression mat 40 | using the reference profiles of x 41 | } 42 | \examples{ 43 | data("mini_nsclc") 44 | data("ioprofiles") 45 | bg <- Matrix::rowMeans(mini_nsclc$neg) 46 | genes <- intersect(dimnames(mini_nsclc$counts)[[2]], dimnames(ioprofiles)[[1]]) 47 | mat <- mini_nsclc$counts[, genes] 48 | x <- ioprofiles[genes, ] 49 | lldist(x = x, mat = mini_nsclc$counts, bg = bg, assay_type="RNA") 50 | 51 | } 52 | -------------------------------------------------------------------------------- /man/lls_protein.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit 
by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{lls_protein} 4 | \alias{lls_protein} 5 | \title{sum from Gaussian density function} 6 | \usage{ 7 | lls_protein(mat, bgsub, x, xsd) 8 | } 9 | \arguments{ 10 | \item{mat}{dgCMatrix expression matrix} 11 | 12 | \item{bgsub}{vector of background expression per cell} 13 | 14 | \item{x}{numeric expression for reference profiles} 15 | 16 | \item{xsd}{numeric expression for reference SD profiles} 17 | } 18 | \value{ 19 | rowSums for matrix of densities 20 | } 21 | \description{ 22 | Probability density function of the Gaussian distribution (written in C++) 23 | } 24 | -------------------------------------------------------------------------------- /man/lls_rna.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{lls_rna} 4 | \alias{lls_rna} 5 | \title{sum from negative binomial density function} 6 | \usage{ 7 | lls_rna(mat, bgsub, x, bg, size_dnb) 8 | } 9 | \arguments{ 10 | \item{mat}{dgCMatrix expression counts} 11 | 12 | \item{bgsub}{vector of background expression per cell} 13 | 14 | \item{x}{numeric expression for reference profiles} 15 | 16 | \item{bg}{numeric background level} 17 | 18 | \item{size_dnb}{int Dispersion parameter} 19 | } 20 | \value{ 21 | rowSums for matrix of densities 22 | } 23 | \description{ 24 | Probability density function of the negative binomial distribution (written in C++) 25 | } 26 | -------------------------------------------------------------------------------- /man/logliks2probs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/refineClusters.R 3 | \name{logliks2probs} 4 | \alias{logliks2probs} 5 | \title{convert logliks to probabilities} 6 | \usage{ 7 | logliks2probs(logliks) 8 | } 9 | \arguments{ 10 | 
\item{logliks}{Matrix of loglikelihoods, as output by insitutype. Cells in rows, clusters in columns.} 11 | } 12 | \value{ 13 | A matrix of probabilities, in the same dimensions as logliks. 14 | } 15 | \description{ 16 | From cell x cluster log-likelihoods, calculate cell x cluster probabilities 17 | } 18 | \examples{ 19 | data("mini_nsclc") 20 | unsup <- insitutype( 21 | x = mini_nsclc$counts, 22 | neg = Matrix::rowMeans(mini_nsclc$neg), 23 | n_clusts = 8, 24 | n_phase1 = 200, 25 | n_phase2 = 500, 26 | n_phase3 = 2000, 27 | n_starts = 1, 28 | max_iters = 5, 29 | assay_type="RNA" 30 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended. 31 | logliks2probs(unsup$logliks) 32 | 33 | } 34 | -------------------------------------------------------------------------------- /man/mini_nsclc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{mini_nsclc} 5 | \alias{mini_nsclc} 6 | \title{Small example SMI data from a NSCLC tumor} 7 | \format{ 8 | A list with the following elements: 9 | \itemize{ 10 | \item counts A matrix of raw counts, with cells in rows and genes in columns 11 | \item neg A matrix of negprobe counts, with cells in rows and negprobes in columns 12 | \item x x positions 13 | \item y y positions 14 | \item umap umap projection 15 | } 16 | } 17 | \usage{ 18 | mini_nsclc 19 | } 20 | \description{ 21 | A 2000-cell excerpt from a 1000-plex SMI study of a NSCLC tumor.
22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/mouse_signature.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{mouse_signature} 5 | \alias{mouse_signature} 6 | \title{Example mouse marker proteins 7 | For inputting \code{into gen_profiles_protein_expression()}} 8 | \format{ 9 | A matrix of 11844 cells and 2 columns 10 | } 11 | \usage{ 12 | mouse_signature 13 | } 14 | \description{ 15 | data frame 16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /man/nbclust.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nbclust.R 3 | \name{nbclust} 4 | \alias{nbclust} 5 | \title{Cluster via EM algorithm based on cell logliks} 6 | \usage{ 7 | nbclust( 8 | counts, 9 | neg, 10 | assay_type = c("rna", "protein"), 11 | bg = NULL, 12 | fixed_profiles = NULL, 13 | fixed_sds = NULL, 14 | init_profiles = NULL, 15 | init_sds = NULL, 16 | init_clust = NULL, 17 | nb_size = 10, 18 | cohort = NULL, 19 | pct_drop = 1/10000, 20 | min_prob_increase = 0.05, 21 | max_iters = 40, 22 | logresults = FALSE 23 | ) 24 | } 25 | \arguments{ 26 | \item{counts}{Counts matrix, cells * genes.} 27 | 28 | \item{neg}{Vector of mean negative probe counts per cell.} 29 | 30 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 31 | 32 | \item{bg}{Expected background} 33 | 34 | \item{fixed_profiles}{Matrix of mean expression profiles to hold unchanged throughout iterations. genes * cell types} 35 | 36 | \item{fixed_sds}{Matrix of standard deviation profiles of pre-defined 37 | clusters to hold unchanged throughout iterations. 38 | Columns must all be included in the init_clust variable. 
This parameter is 39 | only for assay_type of protein.} 40 | 41 | \item{init_profiles}{Matrix of cluster mean profiles under which to begin iterations. 42 | If NULL, initial assignments will be automatically inferred, using init_clust 43 | if available, and using random clusters if not.} 44 | 45 | \item{init_sds}{Matrix of cluster SDs profiles under which to begin iterations. 46 | If NULL, initial assignments will be automatically inferred, using init_clust 47 | if available, and using random clusters if not. Only for assay_type of protein} 48 | 49 | \item{init_clust}{Vector of initial cluster assignments. 50 | If NULL, initial assignments will be automatically inferred.} 51 | 52 | \item{nb_size}{The size parameter to assume for the NB distribution. Only for assay_type of RNA.} 53 | 54 | \item{cohort}{Vector of cells' "cohort" assignments, uses to assess frequencies in each cluster.} 55 | 56 | \item{pct_drop}{the decrease in percentage of cell types with a valid switchover to 57 | another cell type compared to the last iteration. Default value: 1/10000. A valid 58 | switchover is only applicable when a cell has changed the assigned cell type with its 59 | highest cell type probability increased by min_prob_increase.} 60 | 61 | \item{min_prob_increase}{the threshold of probability used to determine a valid cell 62 | type switchover} 63 | 64 | \item{max_iters}{Maximum number of iterations} 65 | 66 | \item{logresults}{Populate clusterlog in returned list 67 | 68 | @importFrom stats lm} 69 | } 70 | \value{ 71 | A list, with the following elements: 72 | \enumerate{ 73 | \item probs: a matrix of probabilities of all cells (rows) belonging to all clusters (columns) 74 | \item profiles: a matrix of cluster-specific expression profiles 75 | } 76 | } 77 | \description{ 78 | Cluster single cell gene expression data using an EM algorithm. 
79 | } 80 | \examples{ 81 | data("ioprofiles") 82 | data("mini_nsclc") 83 | sharedgenes <- intersect(colnames(mini_nsclc$counts), rownames(ioprofiles)) 84 | nbclust(counts = mini_nsclc$counts[, sharedgenes], 85 | neg = Matrix::rowMeans(mini_nsclc$neg), 86 | assay_type = "RNA", 87 | bg = NULL, 88 | fixed_profiles = ioprofiles[sharedgenes, 1:3], 89 | init_profiles = NULL, 90 | init_clust = rep(c("a", "b"), 91 | nrow(mini_nsclc$counts) / 2), 92 | nb_size = 10, 93 | cohort = rep("a", nrow(mini_nsclc$counts)), 94 | pct_drop = 1/10000, 95 | min_prob_increase = 0.05, 96 | max_iters = 3, 97 | logresults = FALSE) 98 | } 99 | -------------------------------------------------------------------------------- /man/nearestNeighborGraph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSpatialContext.R 3 | \name{nearestNeighborGraph} 4 | \alias{nearestNeighborGraph} 5 | \title{Create spatial network from N nearest neighbors} 6 | \usage{ 7 | nearestNeighborGraph(x, y, N, subset = 1) 8 | } 9 | \arguments{ 10 | \item{x}{spatial coordinate} 11 | 12 | \item{y}{spatial coordinate} 13 | 14 | \item{N}{number of nearest neighbors} 15 | 16 | \item{subset}{same length as x,y (see Details)} 17 | } 18 | \value{ 19 | sparse adjacency matrix with distances 20 | } 21 | \description{ 22 | For each cell identify \code{N} nearest neighbors in Euclidean space and 23 | create an edge between them in graph structure, optionally subset cells (see 24 | Details). 25 | } 26 | \details{ 27 | Edges will only be created for cells that have the same \code{subset} value, 28 | usually the slide column id but could also be a slide plus FOV id to only 29 | create edges within an FOV. 
30 | } 31 | -------------------------------------------------------------------------------- /man/neighbor_colMeans.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSpatialContext.R 3 | \name{neighbor_colMeans} 4 | \alias{neighbor_colMeans} 5 | \title{for each cell, get the colMeans of x over its neighbors:} 6 | \usage{ 7 | neighbor_colMeans(x, neighbors) 8 | } 9 | \arguments{ 10 | \item{x}{A matrix} 11 | 12 | \item{neighbors}{A (probably sparse) adjacency matrix} 13 | } 14 | \description{ 15 | for each cell, get the colMeans of x over its neighbors: 16 | } 17 | -------------------------------------------------------------------------------- /man/neighbor_colSums.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSpatialContext.R 3 | \name{neighbor_colSums} 4 | \alias{neighbor_colSums} 5 | \title{for each cell, get the colSums of x over its neighbors:} 6 | \usage{ 7 | neighbor_colSums(x, neighbors) 8 | } 9 | \arguments{ 10 | \item{x}{A matrix} 11 | 12 | \item{neighbors}{A (probably sparse) adjacency matrix} 13 | } 14 | \description{ 15 | for each cell, get the colSums of x over its neighbors: 16 | } 17 | -------------------------------------------------------------------------------- /man/numCores.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utilities.R 3 | \name{numCores} 4 | \alias{numCores} 5 | \title{Get number of cores for parallelized operations} 6 | \usage{ 7 | numCores(percentCores = 0.9, minNotUsedCores = 2) 8 | } 9 | \arguments{ 10 | \item{percentCores}{percent of cores to use for parallelization [0-1]} 11 | 12 | \item{minNotUsedCores}{minimum number of cores to leave for background 
processes} 13 | } 14 | \value{ 15 | number of cores to use for mclapply 16 | } 17 | \description{ 18 | Get number of cores for parallelized operations 19 | } 20 | -------------------------------------------------------------------------------- /man/prepDataForSketching.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/geoSketch.R 3 | \name{prepDataForSketching} 4 | \alias{prepDataForSketching} 5 | \title{Prepare data for geoSketch} 6 | \usage{ 7 | prepDataForSketching(counts, assay_type = c("rna", "protein")) 8 | } 9 | \arguments{ 10 | \item{counts}{Counts matrix: cells x genes} 11 | 12 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 13 | } 14 | \value{ 15 | A matrix of data for geoSketch, with cells in rows and features in columns 16 | } 17 | \description{ 18 | Process raw counts data for input into geoSketching. 19 | } 20 | \examples{ 21 | data("mini_nsclc") 22 | prepDataForSketching(counts=mini_nsclc$counts, assay_type="RNA") 23 | } 24 | -------------------------------------------------------------------------------- /man/probs2logliks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/refineClusters.R 3 | \name{probs2logliks} 4 | \alias{probs2logliks} 5 | \title{Get a logliks matrix from a probabilities matrix} 6 | \usage{ 7 | probs2logliks(probs) 8 | } 9 | \arguments{ 10 | \item{probs}{probability matrix} 11 | } 12 | \value{ 13 | log-transformed matrix 14 | } 15 | \description{ 16 | Get a logliks matrix from a probabilities matrix 17 | } 18 | \examples{ 19 | a <- runif(10) 20 | probs2logliks(a/sum(a)) 21 | } 22 | -------------------------------------------------------------------------------- /man/radiusBasedGraph.Rd: -------------------------------------------------------------------------------- 1 | % 
Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/getSpatialContext.R 3 | \name{radiusBasedGraph} 4 | \alias{radiusBasedGraph} 5 | \title{Create spatial network from neighbors within radius R} 6 | \usage{ 7 | radiusBasedGraph(x, y, R, subset = 1) 8 | } 9 | \arguments{ 10 | \item{x}{spatial coordinate} 11 | 12 | \item{y}{spatial coordinate} 13 | 14 | \item{R}{radius} 15 | 16 | \item{subset}{same length as x,y (see Details)} 17 | } 18 | \value{ 19 | sparse adjacency matrix with distances 20 | } 21 | \description{ 22 | For each cell identify neighbors within distance \code{R} in Euclidean space 23 | and create an edge between them in graph structure, optionally subset cells 24 | (see Details). 25 | } 26 | \details{ 27 | Edges will only be created for cells that have the same \code{subset} value, 28 | usually the slide column id but could also be a slide plus FOV id to only 29 | create edges within an FOV. 30 | } 31 | -------------------------------------------------------------------------------- /man/refineAnchors.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/find_anchor_cells.R 3 | \name{refineAnchors} 4 | \alias{refineAnchors} 5 | \title{Filter anchor candidates via projection of reference profiles to anchor-derived UMAP} 6 | \usage{ 7 | refineAnchors( 8 | counts, 9 | neg = NULL, 10 | bg = NULL, 11 | align_genes = TRUE, 12 | profiles, 13 | anchor_candidates, 14 | nn_cells = 500, 15 | insufficient_anchors_thresh = 20 16 | ) 17 | } 18 | \arguments{ 19 | \item{counts}{Counts matrix, cells * genes.} 20 | 21 | \item{neg}{Vector of mean negprobe counts per cell} 22 | 23 | \item{bg}{Expected background} 24 | 25 | \item{align_genes}{Logical, for whether to align the columns of the counts matrix and the rows of 26 | the profiles matrix based on their names.} 27 | 28 | \item{profiles}{Matrix of reference profiles holding 
mean expression of genes x cell types. 29 | Input linear-scale expression, with genes in rows and cell types in columns.} 30 | 31 | \item{anchor_candidates}{Named vector of anchor candidates with cell_ID in name and corresponding cell type in values.} 32 | 33 | \item{nn_cells}{Number of top nearest neighbors to the projected reference profiles to be selected as final anchor cells.} 34 | 35 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than this many anchors will be discarded.} 36 | } 37 | \value{ 38 | anchors, a named vector for the final anchor cells 39 | } 40 | \description{ 41 | Calculates expression UMAP model for anchor candidates, then projects reference 42 | profiles to the anchor-derived UMAP and select anchor candidates within top 43 | nearest neighbors of the projected reference profiles of same cell type in the 44 | UMAP as the final anchor cells. 45 | } 46 | -------------------------------------------------------------------------------- /man/refineClusters.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/refineClusters.R 3 | \name{refineClusters} 4 | \alias{refineClusters} 5 | \title{Merge cell types in a clustering result} 6 | \usage{ 7 | refineClusters( 8 | assay_type = c("rna", "protein"), 9 | merges = NULL, 10 | to_delete = NULL, 11 | subcluster = NULL, 12 | logliks, 13 | counts = NULL, 14 | neg = NULL, 15 | bg = NULL, 16 | cohort = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 21 | 22 | \item{merges}{A named vector in which the elements give new cluster names and 23 | the names give old cluster names. OK to omit cell types that aren't being 24 | merged.} 25 | 26 | \item{to_delete}{A vector of cluster names to delete. 
All cells assigned to 27 | these clusters will be reassigned to the next best cluster.} 28 | 29 | \item{subcluster}{A list, where each element's name is a cell type to 30 | subcluster, and the element itself is the cluster number(s) to use. E.g. 31 | list("macrophages" = 2, "cancer" = 2:3)} 32 | 33 | \item{logliks}{Matrix of log-likelihoods output by insitutype, cells in rows, 34 | clusters in columns} 35 | 36 | \item{counts}{Counts matrix, cells * genes. Only needed if subclustering is 37 | run.} 38 | 39 | \item{neg}{Vector of mean negprobe counts per cell. Only needed if 40 | subclustering is run.} 41 | 42 | \item{bg}{Expected background. Optional, and only used if subclustering is 43 | run.} 44 | 45 | \item{cohort}{Vector of cells' cohort memberships. Optional, and only needed 46 | if subclustering is run.} 47 | } 48 | \value{ 49 | A list with four elements: \enumerate{ \item clust: a vector of 50 | cluster assignments \item prob: Vector of posterior probabilities for each 51 | cell type \item logliks: a matrix of probabilities of all cells (rows) 52 | belonging to all clusters (columns) \item profiles: a matrix of the average 53 | background-subtracted profile of each cell type after 54 | merging/deleting/subclustering } 55 | } 56 | \description{ 57 | Take a user-defined list of cell types to rename/combine, then re-compute 58 | cluster assignments and probabilities under the merged cell types.
59 | } 60 | \examples{ 61 | #example merges argument: 62 | merges = c("macrophages" = "myeloid", # merge 3 clusters 63 | "monocytes" = "myeloid", 64 | "mDC" = "myeloid", 65 | "B-cells" = "lymphoid") # just rename 1 cluster 66 | # example to_delete argument: 67 | to_delete = c("neutrophils") 68 | # example subcluster argument: 69 | subcluster = list("Myofibroblast" = 2:3) 70 | } 71 | -------------------------------------------------------------------------------- /man/spatialUpdate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/spatialUpdate.R 3 | \name{spatialUpdate} 4 | \alias{spatialUpdate} 5 | \title{Update cell typing results with spatial context or other alternative data} 6 | \usage{ 7 | spatialUpdate( 8 | celltype, 9 | counts, 10 | neg, 11 | cohort = NULL, 12 | altdata = NULL, 13 | xy = NULL, 14 | tissue = NULL, 15 | nb_size = 10, 16 | assay_type = c("rna", "protein") 17 | ) 18 | } 19 | \arguments{ 20 | \item{celltype}{Vector of cell type assignments to be updated} 21 | 22 | \item{counts}{Counts matrix (or dgCMatrix), cells * genes.} 23 | 24 | \item{neg}{Vector of mean negprobe counts per cell} 25 | 26 | \item{cohort}{Vector of cells' cohort memberships. Output of a spatial clustering algorithm makes for good cohorts.} 27 | 28 | \item{altdata}{Matrix of cells' alternative data values} 29 | 30 | \item{xy}{2-column matrix of cells' xy positions.} 31 | 32 | \item{tissue}{Vector giving cells' tissue IDs. Used to separate tissue with overlapping xy coordinates.} 33 | 34 | \item{nb_size}{The size parameter to assume for the NB distribution.} 35 | 36 | \item{assay_type}{A string specifying which assay values to use.} 37 | } 38 | \description{ 39 | Takes cell typing results, then updates it based on alternative data types, 40 | e.g. spatial context, morphology, or protein expression. 
Existing cell typing results are 41 | put into Insitutype's likelihood framework, which then can use alternative data 42 | as a prior to be updated by the expression data to get a new posterior probability 43 | of cell type. 44 | Performs this operation by 45 | \enumerate{ 46 | \item deriving cell type profiles using InSituType:::Estep(), 47 | \item assigning cells to "cohorts" (clusters) derived from their alternative data 48 | \item Inputting the output of steps (1) and (2) into InSituType::insitutype() to 49 | re-calculate cell type. 50 | } 51 | Paths for using alternative data in priority order (choose one; if multiple are input, only the most downstream option will be used): 52 | \enumerate{ 53 | \item Input \code{xy} positions (and possibly \code{tissue}). Then cells will be clustered 54 | into cohorts based on the expression pattern of their 50 nearest neighboring cells. 55 | \item Input a matrix of alternative data (\code{altdata}) to be automatically clustered into cohorts. This supersedes 56 | the altdata matrix derived from the \code{xy} argument. 57 | \item Input your own \code{cohort} vector. This supersedes the above inputs. 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /man/tonsil_annotation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{tonsil_annotation} 5 | \alias{tonsil_annotation} 6 | \title{Matrix of anchor cells' annotation file 7 | A matrix including cell_ID and cellType for anchors cells} 8 | \format{ 9 | A matrix of 11844 cells and 2 columns 10 | } 11 | \usage{ 12 | tonsil_annotation 13 | } 14 | \description{ 15 | matrix.
16 | } 17 | \keyword{datasets} 18 | -------------------------------------------------------------------------------- /man/tonsil_protein.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{tonsil_protein} 5 | \alias{tonsil_protein} 6 | \title{Small example SMI protein data from a tonsil tissue} 7 | \format{ 8 | A list with the following elements: 9 | \itemize{ 10 | \item counts A matrix of raw counts, with cells in rows and proteins in columns 11 | \item negs A matrix of IgG counts, with cells in rows and IgGs in columns 12 | \item xy_coord x and y positions 13 | \item UMAP umap projection 14 | } 15 | } 16 | \usage{ 17 | tonsil_protein 18 | } 19 | \description{ 20 | A 21844-cells excerpt from a 68-plex SMI study of a tonsil tissue. 21 | } 22 | \keyword{datasets} 23 | -------------------------------------------------------------------------------- /man/tonsil_reference_profile.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{tonsil_reference_profile} 5 | \alias{tonsil_reference_profile} 6 | \title{Reference profile examples from a tonsil tissue 7 | #'} 8 | \format{ 9 | A list with the following elements: 10 | \itemize{ 11 | \item tonsil_reference_profile A matrix of raw counts, with cells in rows and proteins in columns 12 | \item counts A matrix of IgG counts, with cells in rows and IgGs in columns 13 | \item xy_coord x and y positions 14 | \item UMAP umap projection 15 | } 16 | } 17 | \usage{ 18 | tonsil_reference_profile 19 | } 20 | \description{ 21 | Reference profile examples from a tonsil tissue 22 | #' 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/updateProfilesFromAnchors.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rescaleProfiles.R 3 | \name{updateProfilesFromAnchors} 4 | \alias{updateProfilesFromAnchors} 5 | \title{Use anchor cells to update reference profiles, simply by taking the mean 6 | profile of the anchors.} 7 | \usage{ 8 | updateProfilesFromAnchors( 9 | counts, 10 | neg, 11 | bg = NULL, 12 | assay_type = c("rna", "protein"), 13 | anchors 14 | ) 15 | } 16 | \arguments{ 17 | \item{counts}{Counts matrix, cells * genes.} 18 | 19 | \item{neg}{Vector of mean negprobe counts per cell. Can be provided} 20 | 21 | \item{bg}{Expected background} 22 | 23 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 24 | 25 | \item{anchors}{Vector of anchor assignments} 26 | } 27 | \value{ 28 | \enumerate{ 29 | \item updated_profiles: A mean profiles matrix with the rows rescaled 30 | according to platform effects and individual elements updated further 31 | \item updated_sds: A mean profiles matrix with the rows rescaled 32 | according to platform effects and individual elements updated further} 33 | } 34 | \description{ 35 | Uses anchor cells to estimate platform effects / scaling factors to be 36 | applied to the genes/rows of the reference profile matrix. Then uses Bayesian 37 | math to update the individual elements on X. 
38 | } 39 | -------------------------------------------------------------------------------- /man/updateReferenceProfiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rescaleProfiles.R 3 | \name{updateReferenceProfiles} 4 | \alias{updateReferenceProfiles} 5 | \title{Update reference profiles} 6 | \usage{ 7 | updateReferenceProfiles( 8 | reference_profiles, 9 | reference_sds, 10 | counts, 11 | neg, 12 | assay_type = c("rna", "protein"), 13 | bg = NULL, 14 | nb_size = 10, 15 | anchors = NULL, 16 | n_anchor_cells = 2000, 17 | min_anchor_cosine = 0.3, 18 | min_anchor_llr = 0.01, 19 | insufficient_anchors_thresh = 20, 20 | refinement = FALSE, 21 | blacklist = NULL, 22 | rescale = FALSE, 23 | refit = TRUE 24 | ) 25 | } 26 | \arguments{ 27 | \item{reference_profiles}{Matrix of reference mean profiles, genes * cell types 28 | are specified, by first choosing anchor cells.} 29 | 30 | \item{reference_sds}{Matrix of standard deviation profiles, genes * cell types. Only for assay_type of protein.} 31 | 32 | \item{counts}{Counts matrix, cells * genes.} 33 | 34 | \item{neg}{Vector of mean negprobe counts per cell} 35 | 36 | \item{assay_type}{Assay type of RNA, protein (default = "rna")} 37 | 38 | \item{bg}{Expected background} 39 | 40 | \item{nb_size}{The size parameter to assume for the NB distribution. Only for assay_type of RNA} 41 | 42 | \item{anchors}{named vector giving "anchor" cell types with cell_id in names, 43 | for use in semi-supervised clustering. Vector elements will be mainly NA's 44 | (for non-anchored cells) and cell type names for cells to be held constant 45 | throughout iterations.} 46 | 47 | \item{n_anchor_cells}{For semi-supervised learning. Maximum number of anchor 48 | cells to use for each cell type.} 49 | 50 | \item{min_anchor_cosine}{For semi-supervised learning. 
Cells must have at 51 | least this much cosine similarity to a fixed profile to be used as an 52 | anchor.} 53 | 54 | \item{min_anchor_llr}{For semi-supervised learning. Cells must have 55 | (log-likelihood ratio / totalcounts) above this threshold to be used as an 56 | anchor} 57 | 58 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than 59 | this many anchors will be discarded.} 60 | 61 | \item{refinement}{Logical, flag for further anchor refinement via UMAP projection (default = FALSE)} 62 | 63 | \item{blacklist}{vector of genes to be excluded for cell typing (default = NULL)} 64 | 65 | \item{rescale}{Logical, flag for platform effect correction (default = FALSE).} 66 | 67 | \item{refit}{Logical, flag for fitting reference profiles to anchors, run after rescale if rescale = TRUE (default = TRUE)} 68 | } 69 | \value{ 70 | a list 71 | \describe{ 72 | \item{updated_profiles}{a genes * cell types matrix for final updated reference profiles} 73 | \item{blacklist}{a vector of genes excluded from the final updated reference profiles} 74 | \item{anchors}{a named vector for final anchors used for reference profile update} 75 | \item{rescale_res}{a list of 5 elements, `rescaled_profiles`, `platformEff_statsDF`, `anchors`, `blacklist` and `lostgenes`, for platform effect correction outputs, return when rescale = TRUE} 76 | \item{refit_res}{a list of 2 elements, `refitted_profiles` and `anchors`, for anchor-based profile refitting outputs, return when refit = TRUE} 77 | } 78 | } 79 | \description{ 80 | Update reference profiles using pre-specified anchor cells, or if no anchors 81 | are specified, by first choosing anchor cells. Option to return reference 82 | profiles rescaled for platform effect and/or to return further refitted profiles 83 | based on the observed profiles of anchor cells. 
84 | } 85 | \examples{ 86 | data("mini_nsclc") 87 | data("ioprofiles") 88 | counts <- mini_nsclc$counts 89 | ## estimate per-cell bg as a fraction of total counts: 90 | negmean.per.totcount <- mean(rowMeans(mini_nsclc$neg)) / mean(rowSums(counts)) 91 | per.cell.bg <- rowSums(counts) * negmean.per.totcount 92 | astats <- get_anchor_stats(counts = mini_nsclc$counts, 93 | assay_type="RNA", 94 | neg = Matrix::rowMeans(mini_nsclc$neg), 95 | profiles = ioprofiles, 96 | sds=NULL) 97 | 98 | # now choose anchors: 99 | anchors <- choose_anchors_from_stats(counts = counts, 100 | neg = mini_nsclc$negmean, 101 | bg = per.cell.bg, 102 | anchorstats = astats, 103 | # a very low value chosen for the mini 104 | # dataset. Typically hundreds of cells 105 | # would be better. 106 | n_cells = 50, 107 | min_cosine = 0.4, 108 | min_scaled_llr = 0.03, 109 | insufficient_anchors_thresh = 5, 110 | assay_type="RNA") 111 | 112 | # The next step is to use the anchors to update the reference profiles: 113 | 114 | updateReferenceProfiles(reference_profiles = ioprofiles, 115 | reference_sds = NULL, 116 | counts = mini_nsclc$counts, 117 | neg = mini_nsclc$neg, 118 | assay_type = "rna", 119 | bg = per.cell.bg, 120 | anchors = anchors) 121 | } 122 | -------------------------------------------------------------------------------- /man/update_logliks_with_cohort_freqs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nbclust.R 3 | \name{update_logliks_with_cohort_freqs} 4 | \alias{update_logliks_with_cohort_freqs} 5 | \title{Update logliks based on frequencies} 6 | \usage{ 7 | update_logliks_with_cohort_freqs( 8 | logliks, 9 | cohort, 10 | minfreq = 1e-06, 11 | nbaselinecells = 50 12 | ) 13 | } 14 | \arguments{ 15 | \item{logliks}{Matrix of cells' (rows) loglikelihoods under clusters (columns)} 16 | 17 | \item{cohort}{Vector of cells' cohort memberships} 18 | 19 | 
\item{minfreq}{Minimum frequency to give any cell type in any cohort} 20 | 21 | \item{nbaselinecells}{Number of cells from baseline distribution to add to the 22 | cohort-specific frequencies, thereby shrinking each cohort's data towards the population} 23 | } 24 | \value{ 25 | An adjusted logliks matrix 26 | } 27 | \description{ 28 | Update logliks based on frequencies 29 | } 30 | -------------------------------------------------------------------------------- /reqs.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | #### Reqs for insitutype: 6 | Insitutype performs unsupervised clustering, or semi-supervised clustering if 7 | provided with reference profiles. It uses an Expectation-maximization (EM) algorithm based on a negbinom 8 | distribution. Insitutype coordinates calls to nbclust(), which runs the EM algorithm. 9 | 10 | ##### Inputs: 11 | - an expression matrix (cells * genes) 12 | - a vector of mean negprobe values 13 | - for semi-supervised learning, a matrix of reference profiles 14 | - additional arguments for finer control 15 | 16 | ##### Outputs: 17 | A list, with the following elements: 18 | \enumerate{ 19 | \item clust: a vector giving cells' cluster assignments 20 | \item prob: a vector giving the confidence in each cell's cluster 21 | \item logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns. 22 | \item profiles: a matrix of cluster-specific expression profiles 23 | \item anchors: from semi-supervised clustering: a vector giving the identities and cell types of anchor cells 24 | } 25 | 26 | 27 | #### Reqs for insitutypeML: 28 | InsitutypeML performs supervised cell typing using a Bayes classifier based on a negbinom distribution.
29 | 30 | ##### Inputs: 31 | - an expression matrix (cells * genes) 32 | - a vector of mean negprobe values 33 | - a matrix of reference profiles 34 | - additional arguments for finer control 35 | 36 | ##### Outputs: 37 | A list, with the following elements: 38 | \enumerate{ 39 | \item clust: a vector giving cells' cluster assignments 40 | \item prob: a vector giving the confidence in each cell's cluster 41 | \item logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns. 42 | \item profiles: a matrix of cluster-specific expression profiles 43 | } 44 | 45 | 46 | #### Reqs for updateReferenceProfiles 47 | Update reference profiles from alternative platforms to better fit the spatial platform. 48 | Uses pre-specified anchor cells, or if no anchors are specified, by first choosing anchor cells. 49 | 50 | ##### Inputs: 51 | - reference profiles 52 | - spatial data: counts matrix, negmean values 53 | - additional arguments for finer control 54 | 55 | ##### Outputs: 56 | - An updated reference matrix 57 | - A vector storing the anchor cells used 58 | 59 | #### Reqs for refineClusters 60 | A function for refining the output of insitutype and insitutypeML. 61 | Can delete clusters, merge/rename clusters, or sub-cluster clusters. 62 | 63 | ##### Inputs: 64 | - Results from an insitutype/insitutypeML run 65 | - If subclustering further, counts data 66 | 67 | ##### Outputs: 68 | A list in the format of insitutype results with updated cluster assignments. 69 | 70 | 71 | 72 | #### Reqs for chooseClusterNumber 73 | A function to run insitutype across a range of cluster numbers and identify the best fit 74 | 75 | ##### Inputs: 76 | - The standard insitutype inputs 77 | - A range of cluster numbers 78 | 79 | ##### Outputs: 80 | - A suggested cluster number, plus metrics for comparing cluster numbers.
81 | 82 | 83 | 84 | 85 | #### Reqs for get_anchor_stats 86 | Function to calculate the summary stats used by anchor cell selection. 87 | Results are meant to be fed to choose_anchors_from_stats(). 88 | 89 | ##### Inputs: 90 | - The same expression data used by insitutype. 91 | - Reference profiles 92 | 93 | ##### Outputs: 94 | - A matrix of cosine distances of cells * cell types 95 | - A matrix of log likelihood ratio scores for cells * cell types 96 | 97 | 98 | 99 | #### Reqs for choose_anchors_from_stats 100 | Chooses anchor cells given cosine distances and log likelihood ratio scores 101 | output by get_anchor_stats. 102 | 103 | ##### Inputs: 104 | - A matrix of cosine distances of cells * cell types 105 | - A matrix of log likelihood ratio scores for cells * cell types 106 | 107 | ##### Outputs: 108 | A vector of anchor assignments. 109 | 110 | 111 | 112 | #### Reqs for find_anchor_cells 113 | Complete anchor cell selection workflow. Calls get_anchor_stats and choose_anchors_from_stats. 114 | 115 | ##### Inputs: 116 | - The same expression data used by insitutype. 117 | - Reference profiles 118 | 119 | ##### Outputs: 120 | A vector of anchor assignments. 121 | 122 | 123 | 124 | #### Reqs for flightpath_layout 125 | A function to define the layout for a flightpath plot. Uses UMAP to place cluster centroids, 126 | then places cells based on their posterior probabilities of belonging to each centroid. 127 | 128 | ##### Inputs: 129 | - A matrix of cell * cluster log-likelihoods (output by insitutype) 130 | - A matrix of cluster profiles 131 | 132 | ##### Outputs: 133 | - xy placements for cluster centroids 134 | - xy placements for individual cells 135 | 136 | 137 | 138 | #### Reqs for flightpath_plot 139 | Makes a ggplot object holding a flightpath plot. Uses UMAP to place cluster centroids, 140 | then places cells based on their posterior probabilities of belonging to each centroid. 
141 | 142 | ##### Inputs: 143 | - Path 1: input an insitutype/insitutypeML result, and it will call flightpath_layout() 144 | - Path 2: input a flightpath_layout result. 145 | 146 | ##### Outputs: 147 | A ggplot object 148 | 149 | 150 | #### Reqs for fastCohorting 151 | Quickly clusters data from alternative sources like immunofluorescence and spatial context. 152 | 153 | ##### Inputs: 154 | - A matrix holding alternative data (cells * variables) 155 | - Arguments for finer control 156 | 157 | ##### Output: 158 | A vector giving each cell's cohort assignment. 159 | -------------------------------------------------------------------------------- /specs.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | #### Specs for insitutypeML: 4 | - Returns a vector of cell type assignments -- test: test_insitutype.R#L54 5 | - Returns a vector of posterior probabilities / confidence scores -- test: test_insitutype.R#L55 6 | - Returns a matrix of cell * cell type log-likelihoods -- test: test_insitutype.R#L56 7 | - Returns a matrix of cell type profiles -- test: test_insitutype.R#L57 8 | 9 | 10 | #### Specs for insitutype: 11 | - If run with fixed_profiles and 0 new clusters, produces valid outputs: 12 | - Returns a vector of cell type assignments -- test: test_insitutype.R#L82 13 | - Returns a vector of posterior probabilities / confidence scores -- test: test_insitutype.R#L83 14 | - Returns a matrix of cell * cell type log-likelihoods -- test: test_insitutype.R#L84 15 | - Returns a matrix of cell type profiles -- test: test_insitutype.R#L85 16 | - If run with no fixed_profiles (fully unsupervised), produces valid outputs: 17 | - Returns a vector of cell type assignments -- test: test_insitutype.R#L112 18 | - Returns a vector of posterior probabilities / confidence scores -- test: test_insitutype.R#L113 19 | - Returns a matrix of cell * cell type log-likelihoods -- test: test_insitutype.R#L114 20 | - Returns a matrix of cell type
profiles -- test: test_insitutype.R#L115 21 | - If unsupervised clustering is run with initial clusters specified, produces valid outputs: 22 | - Returns a vector of cell type assignments -- test: test_insitutype.R#L144 23 | - Returns a vector of posterior probabilities / confidence scores -- test: test_insitutype.R#L145 24 | - Returns a matrix of cell * cell type log-likelihoods -- test: test_insitutype.R#L146 25 | - Returns a matrix of cell type profiles -- test: test_insitutype.R#L147 26 | - The clusters returned have the same names as the initial clusters -- test: test_insitutype.R#L148 27 | - If semi-supervised clustering is run with initial clusters specified, produces valid outputs: 28 | - Returns a vector of cell type assignments -- test: test_insitutype.R#L177 29 | - Returns a vector of posterior probabilities / confidence scores -- test: test_insitutype.R#L178 30 | - Returns a matrix of cell * cell type log-likelihoods -- test: test_insitutype.R#L179 31 | - Returns a matrix of cell type profiles -- test: test_insitutype.R#L180 32 | 33 | 34 | #### Specs for updateReferenceProfiles: 35 | - Returns a matrix of new profiles -- test: test_insitutype.R#L316 36 | - Returns a vector of anchor assignments -- test: test_insitutype.R#L317 37 | 38 | 39 | #### Specs for refineClusters: 40 | - Merging operations happen correctly -- test: test_insitutype.R#L196 41 | - Cell names are preserved -- test: test_insitutype.R#L196 42 | - Makes no changes if none are requested -- test: test_insitutype.R#L302 43 | - Merging operations happen correctly if merges and deletions are asked for -- test: test_insitutype.R#L307 44 | - Merging operations happen correctly if merges are asked for -- test: test_refinecells_cell_merging_logic.R#L14,18,22 45 | 46 | 47 | #### Specs for chooseClusterNumber: 48 | - Returns a single value for "best cluster number" -- test: test_insitutype.R#L219 49 | - Reports the cluster numbers considered -- test: test_insitutype.R#L220 50 | - Reports the log 
likelihood from each cluster number -- test: test_insitutype.R#L221 51 | - Reports the AIC from each cluster number -- test: test_insitutype.R#L222 52 | - Reports the BIC from each cluster number -- test: test_insitutype.R#L223 53 | 54 | 55 | #### Specs for get_anchor_stats 56 | - Returns a matrix of cosine distances -- test: test_insitutype.R#L236 57 | - Returns a matrix of log likelihood ratios -- test: test_insitutype.R#L237 58 | 59 | #### Specs for choose_anchors_from_stats 60 | - Assigns values consistent with the cell type names of the inputs -- test: test_insitutype.R#L253 61 | - Assigns no more than the specified number of anchors per cell type -- test: test_insitutype.R#L253 62 | - The anchors vector aligns to the rows of the counts matrix (cells) -- test: test_insitutype.R#L254 63 | 64 | #### Specs for find_anchor_cells 65 | - Assigns values consistent with the cell type names of the inputs -- test: test_insitutype.R#L271 66 | - Assigns no more than the specified number of anchors per cell type -- test: test_insitutype.R#L272 67 | - The anchors vector aligns to the rows of the counts matrix (cells) -- test: test_insitutype.R#L273 68 | - Returns NULL if no cells meet anchor criteria -- test: test_insitutype.R#L288 69 | 70 | 71 | #### Specs for flightpath_layout 72 | - Returns correctly formatted results: 73 | - Cluster positions are in a 2-column matrix -- test: test_flightpath.R#L34 74 | - Cell positions are in a 2-column matrix -- test: test_flightpath.R#L35 75 | - There are no missing cluster positions -- test: test_flightpath.R#L36 76 | - There are no missing cell positions -- test: test_flightpath.R#L37 77 | 78 | #### Specs for flightpath_plot 79 | - when passed a result from flightpath_layout, flightpath_plot returns a ggplot object -- test: test_flightpath.R#L43 80 | - when passed an insitutype result, flightpath_plot returns a ggplot object -- test: test_flightpath.R#L51 81 | - when asked to show meanConfidence, flightpath_plot returns a ggplot object -- test: test_flightpath.R#L57 82 | 83 | 84 | #### Specs for fastCohorting
85 | - Returns a vector of cohort assignments -- test: test_insitutype.R#L325 86 | - Returns the specified number of unique cohorts -- test: test_insitutype.R#L325 87 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD=1 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 3 | ## enable compilation with C++11 (where available) 4 | ## 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider 6 | ## availability of the package we do not yet enforce this here. It is however 7 | ## recommended for client packages to set it. 
8 | ## 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP 10 | ## support within Armadillo prefers / requires it 11 | CXX_STD = CXX11 12 | 13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD=1 14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 15 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | #include 6 | 7 | using namespace Rcpp; 8 | 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 10 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 11 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 12 | #endif 13 | 14 | // lls_rna 15 | Rcpp::NumericMatrix lls_rna(arma::sp_mat& mat, arma::vec& bgsub, arma::mat& x, arma::vec& bg, int& size_dnb); 16 | RcppExport SEXP _InSituType_lls_rna(SEXP matSEXP, SEXP bgsubSEXP, SEXP xSEXP, SEXP bgSEXP, SEXP size_dnbSEXP) { 17 | BEGIN_RCPP 18 | Rcpp::RObject rcpp_result_gen; 19 | Rcpp::RNGScope rcpp_rngScope_gen; 20 | Rcpp::traits::input_parameter< arma::sp_mat& >::type mat(matSEXP); 21 | Rcpp::traits::input_parameter< arma::vec& >::type bgsub(bgsubSEXP); 22 | Rcpp::traits::input_parameter< arma::mat& >::type x(xSEXP); 23 | Rcpp::traits::input_parameter< arma::vec& >::type bg(bgSEXP); 24 | Rcpp::traits::input_parameter< int& >::type size_dnb(size_dnbSEXP); 25 | rcpp_result_gen = Rcpp::wrap(lls_rna(mat, bgsub, x, bg, size_dnb)); 26 | return rcpp_result_gen; 27 | END_RCPP 28 | } 29 | // lls_protein 30 | Rcpp::NumericMatrix lls_protein(arma::mat& mat, arma::vec& bgsub, arma::mat& x, arma::mat& xsd); 31 | RcppExport SEXP _InSituType_lls_protein(SEXP matSEXP, SEXP bgsubSEXP, SEXP xSEXP, SEXP xsdSEXP) { 32 | BEGIN_RCPP 33 | Rcpp::RObject rcpp_result_gen; 34 | Rcpp::RNGScope rcpp_rngScope_gen; 
35 | Rcpp::traits::input_parameter< arma::mat& >::type mat(matSEXP); 36 | Rcpp::traits::input_parameter< arma::vec& >::type bgsub(bgsubSEXP); 37 | Rcpp::traits::input_parameter< arma::mat& >::type x(xSEXP); 38 | Rcpp::traits::input_parameter< arma::mat& >::type xsd(xsdSEXP); 39 | rcpp_result_gen = Rcpp::wrap(lls_protein(mat, bgsub, x, xsd)); 40 | return rcpp_result_gen; 41 | END_RCPP 42 | } 43 | 44 | static const R_CallMethodDef CallEntries[] = { 45 | {"_InSituType_lls_rna", (DL_FUNC) &_InSituType_lls_rna, 5}, 46 | {"_InSituType_lls_protein", (DL_FUNC) &_InSituType_lls_protein, 4}, 47 | {NULL, NULL, 0} 48 | }; 49 | 50 | RcppExport void R_init_InSituType(DllInfo *dll) { 51 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 52 | R_useDynamicSymbols(dll, FALSE); 53 | } 54 | -------------------------------------------------------------------------------- /src/rcpparma_dnbinom_sparse.cpp: -------------------------------------------------------------------------------- 1 | // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*- 2 | 3 | // we only include RcppArmadillo.h which pulls Rcpp.h in for us 4 | #include 5 | 6 | // via the depends attribute we tell Rcpp to create hooks for 7 | // RcppArmadillo so that the build process will know what to do 8 | // 9 | // [[Rcpp::depends(RcppArmadillo)]] 10 | // [[Rcpp::plugins(cpp11)]] 11 | using namespace Rcpp; 12 | using namespace arma; 13 | 14 | // Add a flag to enable OpenMP at compile time 15 | // [[Rcpp::plugins(openmp)]] 16 | 17 | // Protect against compilers without OpenMP 18 | #ifdef _OPENMP 19 | #include 20 | 21 | static int NBthreads = -1; 22 | 23 | int get_lldist_threads(const int n_profiles) { 24 | if (NBthreads == -1) { 25 | // Max allocation of threads equal to 80% of cores 26 | NBthreads = floor(0.8*omp_get_num_procs()); 27 | 28 | // Reduce max based on OpenMP settings 29 | NBthreads = std::min(NBthreads, omp_get_thread_limit()); 30 | NBthreads = std::min(NBthreads, 
omp_get_max_threads()); 31 | } 32 | const int ans = n_profiles + 2; // desired number of threads 33 | return std::min(ans, NBthreads); 34 | } 35 | #endif 36 | 37 | //' sum from negative binomial density function 38 | //' 39 | //' Probability density function of the negative binomial distribution (written in C++) 40 | //' 41 | //' @param mat dgCMatrix expression counts 42 | //' @param bgsub vector of background expression per cell 43 | //' @param x numeric expression for reference profiles 44 | //' @param bg numeric background level 45 | //' @param size_dnb int Dispersion parameter 46 | //' 47 | //' @return rowSums for matrix of densities 48 | //' @useDynLib InSituType, .registration = TRUE 49 | //' @importFrom Rcpp evalCpp 50 | //' @exportPattern "^[[:alpha:]]+" 51 | //' @export 52 | // [[Rcpp::export]] 53 | Rcpp::NumericMatrix 54 | lls_rna(arma::sp_mat& mat, arma::vec& bgsub, arma::mat& x, arma::vec& bg, int& size_dnb) { 55 | unsigned int K = x.n_cols; 56 | Rcpp::NumericMatrix res(mat.n_rows, K); 57 | #pragma omp parallel for num_threads(get_lldist_threads(K)) 58 | for (unsigned int k = 0; k < K; k++) { 59 | const arma::mat::const_col_iterator col_it_begin = x.begin_col(k); 60 | arma::mat::const_col_iterator col_it = x.begin_col(k); 61 | const arma::mat::const_col_iterator col_it_end = x.end_col(k); 62 | const arma::vec s = bgsub / sum(x.col(k)); 63 | for(; col_it != col_it_end; ++col_it) { 64 | arma::vec::const_iterator s_iter = s.begin(); 65 | arma::vec::const_iterator bg_iter = bg.begin(); 66 | for(; s_iter != s.end(); ++s_iter) { 67 | double yhat = (*s_iter) * (*col_it) + (*bg_iter); 68 | int i = s_iter - s.begin(); 69 | int j = col_it - col_it_begin; 70 | res(i, k) += R::dnbinom_mu(mat(i, j), size_dnb, yhat, 1); 71 | ++bg_iter; 72 | } 73 | } 74 | } 75 | return res; 76 | } 77 | 78 | //' sum from Gaussian density function 79 | //' 80 | //' Probability density function of the Gaussian distribution (written in C++) 81 | //' 82 | //' @param mat dgCMatrix 
expression matrix 83 | //' @param bgsub vector of background expression per cell 84 | //' @param x numeric expression for reference profiles 85 | //' @param xsd numeric expression for reference SD profiles 86 | //' 87 | //' @return rowSums for matrix of densities 88 | //' @useDynLib InSituType, .registration = TRUE 89 | //' @importFrom Rcpp evalCpp 90 | //' @exportPattern "^[[:alpha:]]+" 91 | //' @export 92 | // [[Rcpp::export]] 93 | Rcpp::NumericMatrix 94 | lls_protein(arma::mat& mat, arma::vec& bgsub, arma::mat& x, arma::mat& xsd) { 95 | unsigned int K = x.n_cols; 96 | Rcpp::NumericMatrix res(mat.n_rows, K); 97 | #pragma omp parallel for num_threads(get_lldist_threads(K)) 98 | for (unsigned int k = 0; k < K; k++) { 99 | const arma::mat::const_col_iterator col_it_begin = x.begin_col(k); 100 | arma::mat::const_col_iterator col_it = x.begin_col(k); 101 | arma::mat::const_col_iterator xsd_iter = xsd.begin_col(k); 102 | const arma::mat::const_col_iterator col_it_end = x.end_col(k); 103 | const arma::vec s = bgsub / sum(x.col(k)); 104 | for(; col_it != col_it_end; ++col_it) { 105 | arma::vec::const_iterator s_iter = s.begin(); 106 | //arma::vec::const_iterator bg_iter = bg.begin(); 107 | for(; s_iter != s.end(); ++s_iter) { 108 | double yhat = (*s_iter) * (*col_it); 109 | double sd = (*s_iter) * (*xsd_iter); 110 | int i = s_iter - s.begin(); 111 | int j = col_it - col_it_begin; 112 | res(i, k) += R::dnorm(mat(i, j), yhat, sd, 1); 113 | //++bg_iter; 114 | } 115 | ++xsd_iter; 116 | } 117 | } 118 | return res; 119 | } 120 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(InSituType) 3 | test_check("InSituType") 4 | -------------------------------------------------------------------------------- /tests/testthat/test-colorCellTypes.R: 
-------------------------------------------------------------------------------- 1 | # create mock cell type abundances: 2 | data("iocolors") 3 | 4 | set.seed(0) 5 | cells <- sample(c(letters[1:10], names(iocolors)[1:4]), 100, replace = TRUE) 6 | tab <- table(cells) 7 | 8 | # run using just names: 9 | cols_names <- colorCellTypes(names = names(tab), freqs = NULL, init_colors = NULL, max_sum_rgb = 600, palette = "brewers") 10 | 11 | # run using abundance info: 12 | cols_freqs <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL, max_sum_rgb = 600) 13 | 14 | 15 | # test that pre-specified colors are used 16 | cols_init <- colorCellTypes(names = NULL, freqs = tab, init_colors = iocolors, max_sum_rgb = 600) 17 | 18 | test_that("pre-specified colors are used", { 19 | sharedcells <- intersect(names(tab), names(iocolors)) 20 | expect_true(all.equal(cols_init[sharedcells], iocolors[sharedcells])) 21 | }) 22 | 23 | 24 | 25 | # test that legal colors are returned in all cases, with names matching the cell names: 26 | test_that("test that results returned by flagLowGenes have the right formats", { 27 | expect_error(plot(seq_along(tab), col = cols_names), NA) # "NA" means expecting no error 28 | expect_error(plot(seq_along(tab), col = cols_freqs), NA) # "NA" means expecting no error 29 | expect_error(plot(seq_along(tab), col = cols_init), NA) # "NA" means expecting no error 30 | expect_equal(length(intersect(names(cols_names), names(tab))), length(names(tab))) 31 | expect_equal(length(intersect(names(cols_freqs), names(tab))), length(names(tab))) 32 | expect_equal(length(intersect(names(cols_init), names(tab))), length(names(tab))) 33 | 34 | }) 35 | 36 | # test that it works if prespecified colors have no overlap: 37 | test_that("correct results even if prespecified colors have no overlap", { 38 | cols_bad_init <- colorCellTypes(names = NULL, freqs = tab, init_colors = c(no = "red", nope = "blue"), max_sum_rgb = 600) 39 | expect_error(plot(seq_along(tab), col = 
cols_bad_init), NA) # "NA" means expecting no error 40 | expect_equal(length(intersect(names(cols_bad_init), names(tab))), length(names(tab))) 41 | }) 42 | 43 | 44 | # test that all 3 paletted work: 45 | test_that("all 3 paletted work", { 46 | cols_tab20 <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL, max_sum_rgb = 600, palette = "tableau20") 47 | cols_brew <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL, max_sum_rgb = 600, palette = "brewers") 48 | cols_earth <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL, max_sum_rgb = 600, palette = "earthplus") 49 | expect_error(plot(seq_along(tab), col = cols_tab20), NA) # "NA" means expecting no error 50 | expect_error(plot(seq_along(tab), col = cols_brew), NA) # "NA" means expecting no error 51 | expect_error(plot(seq_along(tab), col = cols_earth), NA) # "NA" means expecting no error 52 | }) 53 | 54 | -------------------------------------------------------------------------------- /tests/testthat/test_RCppExports.R: -------------------------------------------------------------------------------- 1 | data("ioprofiles") 2 | data("mini_nsclc") 3 | bg <- Matrix::rowMeans(mini_nsclc$neg) 4 | genes <- intersect(dimnames(mini_nsclc$counts)[[2]], dimnames(ioprofiles)[[1]]) 5 | mat <- mini_nsclc$counts[, genes] 6 | x <- ioprofiles[genes, 1, drop = FALSE] 7 | 8 | test_that("Rcpp calculation is same as stats package for RNA data type", { 9 | bgsub <- pmax(sweep(mat, 1, bg, "-"), 0) 10 | s <- Matrix::rowSums(bgsub) / sum(x) 11 | s[s <= 0] <- Matrix::rowSums(mat[s <= 0, , drop = FALSE]) / sum(x) 12 | result <- lldist(mat = as(mat, "dgCMatrix"), 13 | assay_type = "rna", 14 | x = x, 15 | bg=bg, 16 | size = 10) 17 | names(result) <- rownames(mat) 18 | yhat <- sweep(s %*% t(x), 1, bg, "+") 19 | lls <- stats::dnbinom(x = as.matrix(mat), size = 10, mu = yhat, log = TRUE) 20 | result_ref <- round(rowSums(lls), digits=2) 21 | expect_true(all.equal(result[,1], result_ref)) 22 | }) 23 | 24 | 25 | 
data("tonsil_protein") 26 | data("tonsil_reference_profile") 27 | bg <- Matrix::rowMeans(tonsil_protein$neg) 28 | proteins <- intersect(dimnames(tonsil_protein$counts)[[2]], dimnames(tonsil_reference_profile$mean.ref.profile)[[1]]) 29 | mat <- tonsil_protein$counts[, proteins] 30 | x <- tonsil_reference_profile$mean.ref.profile[proteins, 1, drop = FALSE] 31 | xsd <- tonsil_reference_profile$SDs.ref.profile[proteins, 1, drop = FALSE] 32 | 33 | 34 | test_that("Rcpp calculation is same as stats package for protein data type", { 35 | bgsub <- pmax(sweep(mat, 1, bg, "-"), 0) 36 | s <- Matrix::rowSums(bgsub) / sum(x) 37 | s[s <= 0] <- Matrix::rowSums(mat[s <= 0, , drop = FALSE]) / sum(x) 38 | result <- lldist(mat = as.matrix(mat), 39 | assay_type = "Protein", 40 | x = x, 41 | xsd = xsd, 42 | bg=bg, 43 | size = 10) 44 | names(result) <- rownames(mat) 45 | 46 | yhat <- s %*% t(x) 47 | ysd <- s %*% t(xsd) 48 | 49 | lls <- stats::dnorm(x = as.matrix(mat), sd = ysd, mean = yhat, log = TRUE) 50 | 51 | result_ref <- round(rowSums(lls), digits=2) 52 | expect_true(all.equal(result[,1], result_ref)) 53 | }) 54 | -------------------------------------------------------------------------------- /tests/testthat/test_flightpath.R: -------------------------------------------------------------------------------- 1 | 2 | # load data ("raw" and "cellannot"): 3 | data("ioprofiles") 4 | data("iocolors") 5 | data("mini_nsclc") 6 | 7 | 8 | # run unsupervised clustering with several random starts: 9 | res <- insitutype(x = mini_nsclc$counts, 10 | neg = Matrix::rowMeans(mini_nsclc$neg), 11 | bg = NULL, 12 | init_clust = NULL, n_clusts = 6, 13 | anchors = NULL, 14 | nb_size = 10, 15 | n_starts = 2, 16 | align_genes = TRUE, 17 | sketchingdata = NULL, 18 | n_benchmark_cells = 100, 19 | n_phase1 = 50, 20 | n_phase2 = 100, 21 | n_phase3 = 200, 22 | n_chooseclusternumber = 100, 23 | pct_drop = 1/10000, 24 | min_prob_increase = 0.05, 25 | max_iters = 2, 26 | assay_type="RNA") 27 | 28 | 29 | # test 
flightpath_layout 30 | fp <- flightpath_layout(probs = NULL, logliks = res$logliks, profiles = res$profiles) 31 | 32 | test_that("flightpath_layout returns correct format", { 33 | expect_true(all(dim(fp$clustpos) == c(6, 2))) 34 | expect_true(all(dim(fp$cellpos) == c(nrow(res$logliks), 2))) 35 | expect_true(all(!is.na(fp$clustpos))) 36 | expect_true(all(!is.na(fp$cellpos))) 37 | }) 38 | 39 | 40 | # test flightpath_plot from flightpath results 41 | p <- flightpath_plot(flightpath_result = fp) 42 | test_that("flightpath_plot returns a ggplot object", { 43 | expect_true(any(grepl("gg", class(p)))) 44 | }) 45 | 46 | 47 | # test flightpath_plot from insitutype results 48 | p <- flightpath_plot(insitutype_result = res) 49 | test_that("flightpath_plot returns a ggplot object", { 50 | expect_true(any(grepl("gg", class(p)))) 51 | }) 52 | 53 | # test flightpath_plot showing meanconfidence 54 | p <- flightpath_plot(insitutype_result = res, showclusterconfidence = TRUE) 55 | test_that("flightpath_plot returns a ggplot object when showclusterconfidence = TRUE", { 56 | expect_true(any(grepl("gg", class(p)))) 57 | }) 58 | 59 | -------------------------------------------------------------------------------- /tests/testthat/test_getProfiles.R: -------------------------------------------------------------------------------- 1 | data("ioprofiles") 2 | data("iocolors") 3 | data("mini_nsclc") 4 | 5 | 6 | initclust <- sample(c("a","b","c"), nrow(mini_nsclc$counts), replace = TRUE) 7 | 8 | test_that("getRNAprofiles worked", { 9 | temp <- getRNAprofiles(x = mini_nsclc$counts, neg = 0, clust = initclust) 10 | expect_identical(rownames(temp), colnames(mini_nsclc$counts)) 11 | expect_identical(colnames(temp)[order(colnames(temp))], unique(initclust)[order(unique(initclust))]) 12 | }) 13 | 14 | 15 | test_that("getproteinparameters worked", { 16 | temp <- getProteinParameters(x = mini_nsclc$counts, clust = initclust) 17 | expect_identical(rownames(temp$profiles), colnames(mini_nsclc$counts)) 
18 | expect_identical(rownames(temp$sds), colnames(mini_nsclc$counts)) 19 | expect_identical(colnames(temp$profiles)[order(colnames(temp$profiles))], unique(initclust)[order(unique(initclust))]) 20 | expect_identical(colnames(temp$sds)[order(colnames(temp$sds))], unique(initclust)[order(unique(initclust))]) 21 | }) 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/testthat/test_getSpatialContext.R: -------------------------------------------------------------------------------- 1 | 2 | # load data ("raw" and "cellannot"): 3 | data("mini_nsclc") 4 | 5 | 6 | test_that("getNeighborhood expression works under diverse settings", { 7 | 8 | n1 <- getSpatialContext(counts = mini_nsclc$counts, xy = cbind(mini_nsclc$x, mini_nsclc$y), N = 50) 9 | expect_equal(dim(n1), dim(mini_nsclc$counts)) 10 | 11 | n2 <- getSpatialContext(counts = mini_nsclc$counts, xy = cbind(mini_nsclc$x, mini_nsclc$y), rad = 0.1, dim_reduce_to = 20) 12 | expect_equal(dim(n2), c(nrow(mini_nsclc$counts), 20)) 13 | }) 14 | 15 | -------------------------------------------------------------------------------- /tests/testthat/test_refinecells_cell_merging_logic.R: -------------------------------------------------------------------------------- 1 | # example logliks: 2 | logliks <- matrix(c(-3, -3, -2, -1, -1, -2), 3 | nrow = 2, 4 | dimnames = list(paste0("cell", 1:2), paste0("old_", letters[1:3]))) 5 | 6 | # define merges: 7 | merges <- c("old_a" = "new1", "old_b" = "new1", "old_c" = "old_c") 8 | 9 | # run: 10 | res <- refineClusters(merges = merges, logliks = logliks) 11 | 12 | # confirm it works: 13 | test_that("new cluster names are right", { 14 | expect_equal(colnames(res$logliks), c("new1", "old_c.new")) 15 | }) 16 | 17 | test_that("new cluster assignments are right", { 18 | expect_equal(res$clust, c("cell1" = "old_c.new", "cell2" = "new1")) 19 | }) 20 | 21 | test_that("probabilities are right", { 22 | expect_equal(res$logliks[, 1], c("cell1" = -2, 
"cell2" = -1), tolerance = 2) 23 | }) 24 | -------------------------------------------------------------------------------- /tests/testthat/test_spatialUpdate.R: -------------------------------------------------------------------------------- 1 | data("ioprofiles") 2 | data("iocolors") 3 | data("mini_nsclc") 4 | 5 | 6 | initclust <- sample(c("a","b","c"), nrow(mini_nsclc$counts), replace = TRUE) 7 | 8 | updatedclust <- spatialUpdate(celltype = initclust, 9 | counts = mini_nsclc$counts, 10 | neg = Matrix::rowMeans(mini_nsclc$neg), 11 | cohort = NULL, altdata = NULL, 12 | xy = cbind(mini_nsclc$x, mini_nsclc$y), 13 | tissue = NULL, 14 | nb_size = 10, assay_type = "rna") 15 | test_that("spatialUpdate worked", { 16 | expect_true(all(is.element(c( "clust","prob","profiles","sds","logliks","logliks_from_lost_celltypes"), names(updatedclust)))) 17 | }) 18 | 19 | 20 | --------------------------------------------------------------------------------