├── .Rbuildignore
├── .gitattributes
├── .github
    └── workflows
    │   └── main.yml
├── .gitignore
├── .lintr
├── DESCRIPTION
├── FAQs.md
├── InSituType.Rproj
├── LICENSE
├── NAMESPACE
├── NEWS.md
├── R
    ├── RcppExports.R
    ├── chooseClusterNumber.R
    ├── colorCellTypes.R
    ├── data.R
    ├── fastCohorting.R
    ├── find_anchor_cells.R
    ├── flightpath_layout.R
    ├── gen_profiles_protein.R
    ├── geoSketch.R
    ├── getProfiles.R
    ├── getSpatialContext.R
    ├── insitutype.R
    ├── insitutypeML.R
    ├── nbclust.R
    ├── refineClusters.R
    ├── rescaleProfiles.R
    ├── spatialUpdate.R
    └── utilities.R
├── README.md
├── azure-pipelines.yml
├── data
    ├── human_signature.RData
    ├── iocolors.RData
    ├── ioprofiles.RData
    ├── mini_nsclc.RData
    ├── mouse_signature.RData
    ├── tonsil_annotation.RData
    ├── tonsil_protein.RData
    └── tonsil_reference_profile.RData
├── man
    ├── Estep.Rd
    ├── Mstep.Rd
    ├── alignGenes.Rd
    ├── chooseClusterNumber.Rd
    ├── choose_anchors_from_stats.Rd
    ├── colorCellTypes.Rd
    ├── estimateBackground.Rd
    ├── estimatePlatformEffects.Rd
    ├── fastCohorting.Rd
    ├── find_anchor_cells.Rd
    ├── flightpath_layout.Rd
    ├── flightpath_plot.Rd
    ├── gen_profiles_protein_annotation.Rd
    ├── gen_profiles_protein_expression.Rd
    ├── geoSketch.Rd
    ├── geoSketch_get_plaid.Rd
    ├── geoSketch_sample_from_plaids.Rd
    ├── getMeanClusterConfidence.Rd
    ├── getProteinParameters.Rd
    ├── getRNAprofiles.Rd
    ├── getSpatialContext.Rd
    ├── get_anchor_stats.Rd
    ├── get_neighborhood_expression.Rd
    ├── human_signature.Rd
    ├── insitutype.Rd
    ├── insitutypeML.Rd
    ├── iocolors.Rd
    ├── ioprofiles.Rd
    ├── ismax.Rd
    ├── lldist.Rd
    ├── lls_protein.Rd
    ├── lls_rna.Rd
    ├── logliks2probs.Rd
    ├── mini_nsclc.Rd
    ├── mouse_signature.Rd
    ├── nbclust.Rd
    ├── nearestNeighborGraph.Rd
    ├── neighbor_colMeans.Rd
    ├── neighbor_colSums.Rd
    ├── numCores.Rd
    ├── prepDataForSketching.Rd
    ├── probs2logliks.Rd
    ├── radiusBasedGraph.Rd
    ├── refineAnchors.Rd
    ├── refineClusters.Rd
    ├── spatialUpdate.Rd
    ├── tonsil_annotation.Rd
    ├── tonsil_protein.Rd
    ├── tonsil_reference_profile.Rd
    ├── updateProfilesFromAnchors.Rd
    ├── updateReferenceProfiles.Rd
    └── update_logliks_with_cohort_freqs.Rd
├── reqs.md
├── specs.md
├── src
    ├── Makevars
    ├── Makevars.win
    ├── RcppExports.cpp
    └── rcpparma_dnbinom_sparse.cpp
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-colorCellTypes.R
    │   ├── test_RCppExports.R
    │   ├── test_flightpath.R
    │   ├── test_getProfiles.R
    │   ├── test_getSpatialContext.R
    │   ├── test_insitutype_Protein.R
    │   ├── test_insitutype_RNA.R
    │   ├── test_refinecells_cell_merging_logic.R
    │   └── test_spatialUpdate.R
└── vignettes
    ├── NSCLC-RNA_InsituType-vignette.Rmd
    ├── NSCLC-RNA_InsituType-vignette.html
    ├── NSCLC-clustering-SingleCellExperiment-vignette.Rmd
    ├── NSCLC-clustering-SingleCellExperiment-vignette.html
    ├── NSCLC-clustering-vignette.Rmd
    ├── NSCLC-clustering-vignette.html
    ├── NSCLC-semi-supervised-cell-typing-vignette.Rmd
    ├── NSCLC-semi-supervised-cell-typing-vignette.html
    ├── NSCLC-supervised-cell-typing-vignette.Rmd
    ├── NSCLC-supervised-cell-typing-vignette.html
    ├── TONSIL_Protein_Insitu_Cell_Typing-vignette.Rmd
    └── TONSIL_Protein_Insitu_Cell_Typing-vignette.html


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^azure-pipelines\.yml$
4 | ^\.vscode$
5 | ^reqs\.md$
6 | ^specs\.md$
7 | ^LICENSE\.md$
8 | ^\.lintr$
9 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   issues:
 3 |     types: [opened]
 4 | 
 5 | 
 6 | jobs:
 7 |   welcome:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - uses: EddieHubCommunity/gh-action-community/src/welcome@main
11 |         with:
12 |           github-token: ${{ secrets.GITHUB_TOKEN }}
13 |           issue-message: "Thank you for contacting us about our tools! To receive assistance, kindly email support.spatial@bruker.com with detailed information about your issue. If applicable, attach a screenshot of any encountered errors and include a copy of the modified script in Notepad. Our customer support team will help facilitate a review and resolution of the issue."
14 |           footer: "Thank you for choosing Bruker Spatial Biology,\nBruker Spatial Biology Dev Team"
15 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | *.Rproj
 3 | .Rhistory
 4 | .RData
 5 | .Ruserdata
 6 | .DS_Store
 7 | *.o
 8 | *.so
 9 | *.dll
10 | inst/doc
11 | 


--------------------------------------------------------------------------------
/.lintr:
--------------------------------------------------------------------------------
1 | linters: linters_with_defaults(
2 |   line_length_linter(120),
3 |   object_name_linter("camelCase")
4 |   )
5 | encoding: "UTF-8"
6 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: InSituType
 2 | Type: Package
 3 | Title: An R package for performing cell typing in SMI and other single cell data
 4 | Version: 2.0
 5 | Authors@R: c(person("Patrick", "Danaher", email = "pdanaher@nanostring.com", role = c("aut")),
 6 |              person("Sangsoon", "Woo", email = "sawoo@nanostring.com", role = c("aut")),
 7 |              person("Zhi", "Yang", email = "zyang@nanostring.com", role = c("aut")),
 8 |              person("David", "Ross", email = "dross@nanostring.com", role = c("aut", "cre")),
 9 |              person("Lidan", "Wu", email = "lwu@nanostring.com", role = c("aut")),
10 |              person("Yongfang", "Lu", email = "ylu@nanostring.com", role = c("aut")))
11 | Description: Insitutype is an algorithm for performing cell typing in single cell 
12 |              spatial transcriptomics data, such as is generated by the CosMx platform. 
13 |              It can perform supervised cell typing from reference profiles, unsupervised clustering,
14 |              or semi-supervised cell typing in which both reference cell types and de novo
15 |              clusters are fit. 
16 | Imports:
17 |   data.table,
18 |   dplyr,
19 |   fastglm,
20 |   ggplot2,
21 |   graphics,
22 |   grDevices,
23 |   irlba,
24 |   lsa,
25 |   magrittr,
26 |   Matrix,
27 |   mclust,
28 |   methods,
29 |   Rcpp (>= 1.0.9),
30 |   rlang,
31 |   scales,
32 |   SingleCellExperiment,
33 |   sparseMatrixStats,
34 |   spatstat.geom,
35 |   stats,
36 |   SummarizedExperiment,
37 |   tibble,
38 |   umap,
39 |   utils,
40 |   uwot
41 | License: NanoString Technologies, Inc. Software License Agreement for Non-Commercial Use
42 | Encoding: UTF-8
43 | LazyData: true
44 | Suggests:
45 |     rmarkdown,
46 |     knitr,
47 |     testthat
48 | VignetteBuilder: knitr
49 | Depends:
50 |   R (>= 3.5.0)
51 | RoxygenNote: 7.3.1
52 | LinkingTo: Rcpp, RcppArmadillo
53 | 


--------------------------------------------------------------------------------
/FAQs.md:
--------------------------------------------------------------------------------
  1 | # FAQs and advanced methods
  2 | 
  3 | #### Topics
  4 | 
  5 | - [Workflow overview](#workflow-overview)
  6 | - [Choosing the n_clust argument](#choosing-nclust)
  7 | - [Updating reference profiles](#updating-reference-profiles)
  8 | - [On confidence scores](#confidence-scores)
  9 | - [Which genes to use](#which-genes-to-use)
 10 | - [Interpreting clustering results](#interpreting-clustering-results)
 11 | - [Targeted subclustering](#targeted-subclustering)
 12 | 
 13 | ## Workflow overview
 14 | The broad Insitutype workflow is as follows:
 15 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/45d89004-dc46-40a1-bde8-33d204e0f0b8)
 16 | 
 17 | 
 18 | ## Unsupervised vs. Supervised vs. Semi-supervised cell typing
 19 | InSituType runs in 3 modes:
 20 | - Supervised: call only cell types defined in reference profiles. Set `nclust = 0` to run in fully supervised mode. 
 21 | - Unsupervised: de novo clustering, with no reference cell types. Set `reference_profiles = NULL` to run in unsupervised mode. 
 22 | - Semi-supervised: find new clusters while also calling reference cell types. 
 23 | 
 24 | Considerations for choosing a workflow:
 25 | - Supervised is most convenient if you are confident that your reference profiles contain all the cell types in your dataset. 
 26 | However, many reference profiles from scRNA-seq don't fit spatial data well, so using reference profiles can be challenging. 
 27 | - Semi-supervised mode is the most powerful but most challenging workflow. We use this in >80% of analyses. 
 28 |  Success hinges on how well the reference profiles are calibrated to spatial data. InSituType tries to
 29 |  perform this calibration using anchor cells, but this does not always succeed. 
 30 | - We recommend trying semi-supervised cell typing first, assuming there are new clusters you expect to discover. 
 31 | - Unsupervised has no difficulty with poorly-calibrated reference profiles, but it requires you to name each cluster, 
 32 |  which can be onerous. It may also fail to define distinctions that are important to you.
 33 | 
 34 | ## Choosing reference profiles
 35 | Keep in mind the following when selecting reference profiles:
 36 | - Quality of scRNA-seq references varies greatly. Finding mis-annotated cell types is not uncommon,
 37 | and for smaller datasets, profiles of rare cell types will be noisy. Exercise some skepticism. 
 38 | - Large platform effects separate scRNA-seq and spatial platforms. When possible, use a reference from the same platform as your data.
 39 | - A large collection of single cell references can be found here: https://github.com/Nanostring-Biostats/cellprofilelibrary
 40 | - A growing collection of CosMx references is here: https://github.com/Nanostring-Biostats/CosMx-Cell-Profiles
 41 | 
 42 | 
 43 | ## Choosing nclust
 44 | We recommend choosing a slightly generous value of `nclust`, then using `refineClusters` to condense the resulting clusters. For example, if you're running semi-supervised cell typing and you expect to find 5 new clusters, set `nclust = 8`. Or for unsupervised clustering with an expectation of 12 cell types, set `nclust = 16`. 
 45 | It's generally easy to tell when two clusters come from the same cell type: they'll be adjacent in UMAP space, and the flightpath plot will show them frequently confused with each other. 
 46 | 
 47 | Final note: Insitutype splits big clusters with higher counts more aggressively than other clusters. For example, in a tumor study, it will subcluster tumor cells many times before it subclusters e.g. fibroblasts. The simplest solution is to increase nclust as needed, then condense the over-clustered cell type as desired. 
 48 | 
 49 | 
 50 | ## Updating reference profiles
 51 | 
 52 | Cell typing's biggest challenge is using a reference dataset from a different platform. Platform effects between scRNA-seq and spatial platforms can be profound. 
 53 | Insitutype has 3 treatments for reference profiles:
 54 | 1. Use the reference profile matrix as-is
 55 | 2. Choose anchor cells, then rescale genes based on estimated platform effects. (Less aggressive, only fits gene-level effects.)
 56 | 3. Choose anchor cells, then refit the reference profiles entirely. (Most aggressive, fits a new value for every gene x cell type.)
 57 | 
 58 | We suggest using the below flowchart to choose from among these options:
 59 | 
 60 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/824dec47-2221-4fe8-92a0-15693c749d55)
 61 | 
 62 | For more on starting with a coarse reference then subclustering, see the "Targeted subclustering" discussion further on. 
 63 | 
 64 | ## Confidence Scores
 65 | Insitutype returns a posterior probability for each cell type call. In practice, we have found these probabilities to be overconfident. 
 66 | Below is an image from the preprint demonstrating this phenomenon. For various posterior probability bins, it shows the accuracy rate actually achieved (with a confidence interval). 
 67 | 
 68 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/f02df11d-405b-411d-8049-4ab3d021d0a4)
 69 | 
 70 | So 100% confident probabilities appear to be accurate, but lower probabilities are overconfident. 
 71 | Also, remember that these probabilities are based on all the information available to the model. They don't consider that the model might be missing cell types, or that the reference profiles could be incorrect. 
 72 | 
 73 | In short, the posterior probabilities are useful for differentiating strong from weak cell typing calls, but you should be conservative when choosing a threshold. We often use a threshold of 80%, calling cells below that confidence as "unclassified". 
 74 | 
 75 | ## Which genes to use
 76 | 
 77 | Insitutype was designed using 1000-plex CosMx data, where we found it most powerful to use all genes in the panel. 
 78 | In our new 6000-plex data, it's worth considering using Insitutype on a well-chosen subset of genes. As a rule of thumb, genes should be retained if either of the following applies: 
 79 | 1. They have solidly above-background expression in the CosMx data
 80 | 2. They have moderate-to-high expression in at least one reference profile
 81 | 
 82 | For typical 6000plex experiments, we speculate that cell typing using somewhere between 3000-5000 genes would be optimal. 
 83 | 
 84 | 
 85 | ## Interpreting clustering results
 86 | 
 87 | Once Insitutype has run, take time to scrutinize the results. You'll need to:
 88 | 1. Confirm cell types from the reference profiles are correct
 89 | 2. Interpret new clusters
 90 | 
 91 | First, we recommend the following QC plots:
 92 | 
 93 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/aa2c47ba-8c4e-412d-b790-5205ae9739fc)
 94 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/f1f1694c-c0df-41fe-a823-ca34a16d553b)
 95 | 
 96 | Example code for generating the above profiles heatmap:
 97 | ```
 98 | pdf("<writehere.pdf>", height = 20, width = 6)
 99 | mat <- res$profiles  # ("res" is the insitutype output)
100 | mat <- sweep(mat, 1, pmax(apply(mat, 1 ,max), 0.1), "/")
101 | pheatmap(mat, col = colorRampPalette(c("white", "darkblue"))(100),
102 |          fontsize_row = 5)
103 | dev.off()
104 | ```
105 | 
106 | We have found the below workflows to be effective and efficient:
107 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/3adda877-53e7-48ca-8781-927e77739943)
108 | 
109 | ![image](https://github.com/Nanostring-Biostats/InSituType/assets/4357938/24a28e1b-e1bf-4be1-bf38-0c4ebeb574d4)
110 | 
111 | 
112 | 
113 | ## Targeted subclustering
114 | 
115 | This is an advanced method. Sometimes it can be hard to subcluster a cell type if many of its genes are impacted by contamination from segmentation errors. Immune cells in the context of tumors are a good example.
116 | To subcluster say T-cells in a tumor, you might initially call a single T-cell cluster. Then, considering just these cells and just the genes unlikely to be contaminated in T-cells (genes with high T-cell expression or with low expression in surrounding cell types), run unsupervised Insitutype. 
117 | 
118 | 
119 | 
120 | 
121 | 


--------------------------------------------------------------------------------
/InSituType.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | NanoString Technologies, Inc.
 2 | Software License Agreement for Non-Commercial Use
 3 | By downloading, installing, accessing, modifying or otherwise making use of the Program (defined below), you agree to be bound by the terms and conditions of this Software License Agreement for Non-Commercial Use (this “License”).
 4 | 1.	DEFINITIONS
 5 | 1.1.	“Affiliate” means, with respect to an individual or entity, another individual or entity: (i) on whose behalf such individual or entity is acting, or (ii) that exercises control, is controlled by, or is under common control with such individual or entity. For the purposes of this definition, the term “control” means the right, whether by ownership, exercise of voting rights, contract, or otherwise, to direct the actions of an individual or entity.
 6 | 1.2.	“Distribute” means to distribute, share, make available, or otherwise provide the Program or Modified Program, as applicable, or access thereto (including via a computer network) to any third party.
 7 | 1.3.	“Licensor” means the individual or entity licensing the rights granted in this License.
 8 | 1.4.	“Licensee” or “you” means the individual or entity receiving or exercising the rights granted under this License, provided that the individual or entity is not a NanoString Competitor.
 9 | 1.5.	“Non-Commercial Use” means any use where profit or other commercial benefit is not a direct or indirect motive or intended result.
10 | 1.6.	“Modified Program” means a derivative work of, or a work that is based on, uses or incorporates, the Program (whether or not in combination with other works, materials or content).
11 | 1.7.	“NanoString” means NanoString Technologies, Inc.
12 | 1.8.	“NanoString Competitor” means any individual or entity that directly or indirectly competes with NanoString or any of NanoString’s Affiliates or whose Affiliate directly or indirectly competes with NanoString or any of NanoString’s Affiliates.
13 | 1.9.	“Program” means the copyrightable work of authorship, program, code, or software licensed under this License.
14 | 2.	LICENSE 
15 | 2.1.	Grant. Subject to the terms and conditions of this License, Licensor hereby grants to Licensee a worldwide, royalty-free, non-exclusive, revocable license to: (a) use, Distribute, and reproduce the Program, and (b) use, create, Distribute, and reproduce Modified Programs, in each case, solely for your internal, Non-Commercial Use. No rights are granted to NanoString Competitors.
16 | 2.2.	No Endorsement. Nothing in this License may be construed as permission to assert or imply that Licensor, NanoString, or other contributors to the Program sponsors, endorses, or is otherwise connected with the Licensee or the entity or institution that Licensee represents.
17 | 2.3.	Trademarks. Trademark rights are not licensed to you under this License.
18 | 2.4.	Grant of Patent License. Subject to the terms and conditions of this License, NanoString hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, import, and otherwise transfer the Program, where such license applies only to those patent claims licensable by NanoString that are necessarily infringed by Licensee alone or by combination of its modification(s) to the Program or Modified Program to which such modification(s) was submitted. If you institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Program, Modified Program, or a modification incorporated within the Program or a Modified Program constitutes direct or contributory patent infringement, then any patent licenses granted to you under this License for the Program or any such Modified Program shall terminate as of the date such litigation is filed.
19 | 3.	CONDITIONS TO THE RIGHT TO DISTRIBUTE
20 | 3.1.	Notices. If you Distribute the Program or a Modified Program in any form, you must also provide to the recipient:
21 | 3.1.1.	a copy of this License; and 
22 | 3.1.2.	for Modified Programs, prominent notices identifying the portions of the Modified Program that have been modified, stating that you have modified the Program.
23 | 3.2.	Attribution. Except as otherwise expressly permitted under this License, you must keep intact, and you may not modify or remove, any notices, disclaimers, or attributions included in or provided with the Program. In addition, you must also include a prominent hypertext link back to NanoString’s website at www.nanostring.com. 
24 | 3.3.	License. You may only Distribute the Program or the Modified Program under the terms of this License (or any later version, at your election). You may not offer or impose any additional or different terms or conditions that, or take any measures to, restrict the exercise of the rights granted under this License.
25 | 4.	NO REPRESENTATIONS OR WARRANTIES; LIMITATIONS OF LIABILITY
26 | 4.1.	Disclaimer. UNLESS OTHERWISE AGREED BY LICENSOR IN WRITING, TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, LICENSOR OFFERS THE PROGRAM AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND WITH REGARD TO THE PROGRAM, WHETHER EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. THE LICENSOR DOES NOT REPRESENT OR WARRANT THAT THE PROGRAM WILL BE ERROR FREE AND DOES NOT PROMISE THAT ANY SUCH ERRORS WILL BE CORRECTED.
27 | SOME JURISDICTIONS DO NOT ALLOW FOR THE EXCLUSION OF IMPLIED WARRANTIES, SO THE FOREGOING MAY NOT APPLY TO YOU.
28 | 4.2.	Limitation of Liability. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL THE LICENSOR OR NANOSTRING BE LIABLE TO YOU UNDER ANY LEGAL THEORY FOR ANY DAMAGES OF ANY KIND, INCLUDING ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF OR RELATED TO THE PROGRAM OR USE THEREOF, EVEN IF LICENSOR OR NANOSTRING HAS BEEN ADVISED OF THE POSSIBILITY OR LIKELIHOOD OF SUCH DAMAGES.
29 | 5.	MISCELLANEOUS
30 | 5.1.	Right to Enforce. NanoString is an express third-party beneficiary of this License and will be entitled to enforce the provisions of this License as if it were a party hereto. 
31 | 5.2.	Waiver; Amendment. No term or provision hereof will be considered waived by the Licensor, and no breach excused by Licensor, unless such waiver or consent is in writing and signed by an authorized representative of Licensor.  The waiver by Licensor of, or consent by Licensor to, a breach of any provision of this License by the Licensee, will not constitute, operate or be construed as a waiver of, consent to, or excuse of any other or subsequent breach by Licensee.  This License may be amended or modified only by an agreement in writing signed by an authorized representative of each of Licensor and Licensee.
32 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(Estep)
 4 | export(Mstep)
 5 | export(chooseClusterNumber)
 6 | export(choose_anchors_from_stats)
 7 | export(colorCellTypes)
 8 | export(estimatePlatformEffects)
 9 | export(fastCohorting)
10 | export(find_anchor_cells)
11 | export(flightpath_layout)
12 | export(flightpath_plot)
13 | export(getProteinParameters)
14 | export(getRNAprofiles)
15 | export(getSpatialContext)
16 | export(get_anchor_stats)
17 | export(insitutype)
18 | export(insitutypeML)
19 | export(lls_protein)
20 | export(lls_rna)
21 | export(numCores)
22 | export(refineAnchors)
23 | export(refineClusters)
24 | export(spatialUpdate)
25 | export(updateProfilesFromAnchors)
26 | export(updateReferenceProfiles)
27 | exportMethods(insitutype)
28 | exportMethods(insitutypeML)
29 | exportPattern("^[[:alpha:]]+")
30 | import(ggplot2)
31 | importFrom(Matrix,colSums)
32 | importFrom(Matrix,rowMeans)
33 | importFrom(Matrix,rowSums)
34 | importFrom(Matrix,sparseMatrix)
35 | importFrom(Matrix,t)
36 | importFrom(Rcpp,evalCpp)
37 | importFrom(SingleCellExperiment,SingleCellExperiment)
38 | importFrom(SummarizedExperiment,assay)
39 | importFrom(data.table,data.table)
40 | importFrom(data.table,melt)
41 | importFrom(data.table,rbindlist)
42 | importFrom(dplyr,filter)
43 | importFrom(dplyr,group_by)
44 | importFrom(dplyr,summarise_all)
45 | importFrom(grDevices,col2rgb)
46 | importFrom(grDevices,colors)
47 | importFrom(graphics,lines)
48 | importFrom(graphics,par)
49 | importFrom(graphics,plot)
50 | importFrom(irlba,irlba)
51 | importFrom(irlba,prcomp_irlba)
52 | importFrom(lsa,cosine)
53 | importFrom(magrittr,"%>%")
54 | importFrom(mclust,Mclust)
55 | importFrom(mclust,mclustBIC)
56 | importFrom(mclust,predict.Mclust)
57 | importFrom(methods,as)
58 | importFrom(methods,is)
59 | importFrom(rlang,.data)
60 | importFrom(scales,alpha)
61 | importFrom(spatstat.geom,closepairs)
62 | importFrom(spatstat.geom,nncross)
63 | importFrom(spatstat.geom,nndist)
64 | importFrom(spatstat.geom,nnwhich)
65 | importFrom(spatstat.geom,ppp)
66 | importFrom(stats,dnbinom)
67 | importFrom(stats,lm)
68 | importFrom(stats,qnorm)
69 | importFrom(stats,rnorm)
70 | importFrom(tibble,column_to_rownames)
71 | importFrom(tibble,rownames_to_column)
72 | importFrom(umap,umap)
73 | importFrom(utils,data)
74 | importFrom(uwot,umap_transform)
75 | useDynLib(InSituType, .registration = TRUE)
76 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # InSituType 2.0.0
 2 | 
 3 | * Enable use in protein datasets via the assay_type argument. This required a major overhaul under the hood, but has little impact on existing RNA workflows. 
 4 | * More advanced methods for updating reference profiles via anchor cells, implemented in `updateReferenceProfiles`.
 5 | * New function `spatialUpdate` for using alternative data types (e.g. space or immunofluorescence) and the Insitutype likelihood framework to update cell typing results from any method. 
 6 | * New function `getSpatialContext` for conveniently calculating cells' spatial contexts / neighborhood expression. 
 7 | * New functions `getRNAprofiles` and `getProteinParameters`, which serve as user-facing tools for getting profile matrices. 
 8 | 
 9 | # InSituType 1.2.3
10 | 
11 | * handle collinearity issues with fastCohorting:
12 | Reduce to 2 PC's.
13 | If this fails, then try successively smaller # of cohorts with the 2 pc's.
14 | 
15 | # InSituType 1.2.2
16 | 
17 | * Add Compatibility of assay_type and platform effect correction 
18 | 
19 | # InSituType 1.2.1
20 | 
21 | * Create "undefined" profile for cells with zero counts
22 | 
23 | # InSituType 1.2.0
24 | 
25 | * Also cluster continuous data from protein assay
26 | 
27 | # InSituType 1.1.1
28 | 
29 | * Support platform effect correction
30 | * Support anchor refinement via UMAP projection 
31 | 
32 | # InSituType 1.1.0
33 | 
34 | * Support matrices with more than 4B elements
35 | 
36 | # InSituType 1.0.0
37 | 
38 | * License updated
39 | * lldist parallelized with OpenMP
40 | 
41 | # InSituType 0.99.4
42 | 
43 | * Re-submission to Bioconductor
44 | 
45 | # InSituType 0.99.3
46 | 
47 | * Merge subclustering fix
48 | 
49 | # InSituType 0.99.2
50 | 
51 | * Optionally use SingleCellExperiment class
52 | 
53 | # InSituType 0.99.1
54 | 
55 | * Added reference to CosMx paper and dataset
56 | 
57 | # InSituType 0.99.0
58 | 
59 | * Submission to Bioconductor 3.16
60 | 
61 | # InSituType 1.1.1
62 | 
63 | * Updated `flightpath_layout.R` to save the plot in a temp folder in the current work directory
64 | 
65 | # InSituType 1.1.0
66 | 
67 | * Fix several places counts matrix was being converted to dense to calculate a statistic
68 |   * Revert conversion to `sparse matrix` of `dense` `mu` matrix and result from `dnbinom`
69 | 
70 | # InSituType 1.0.0
71 | 
72 | * Integrated rcpp support for the package
73 | * Added `dnbinom` for `sparse matrices`
74 | * Updated the `unit tests`
75 | * Removed `.o` files in `src` folder
76 | 
77 | # InSituType 0.1.2
78 | 
79 | * Updated package dependencies
80 | 
81 | # InSituType 0.1.1
82 | 
83 | * Added `lsa`, `SpatialDecon`, `irlba`, `mclust`, `rmarkdown` to the `DESCRIPTION` file
84 | * Fixed a Roxygen example for the R function `geoSketch` where it was trying to use `Ptolemy` and `Giotto` packages that are not being used within the package
85 | 
86 | # InSituType 0.1.0
87 | 
88 | * Added a `NEWS.md` file to track changes to the package.
89 | * Added BioConductor package dependencies (notably SpatialDecon and lsa)
90 | * Renamed vignettes to allow for compilation
91 | * Deleted old vignettes (labelled OLD)
92 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #' Row sums of negative binomial densities
 5 | #'
 6 | #' Thin wrapper: forwards its arguments unchanged to the compiled C++ routine via \code{.Call}.
 7 | #'
 8 | #' @param mat dgCMatrix of expression counts
 9 | #' @param bgsub Vector of background expression per cell
10 | #' @param x Numeric expression values for the reference profiles
11 | #' @param bg Numeric background level
12 | #' @param size_dnb Integer dispersion parameter for the negative binomial density
13 | #'
14 | #' @return Row sums of the matrix of densities
15 | #' @useDynLib InSituType, .registration = TRUE
16 | #' @importFrom Rcpp evalCpp
17 | #' @exportPattern "^[[:alpha:]]+" 
18 | #' @export
19 | lls_rna <- function(mat, bgsub, x, bg, size_dnb) {
20 |     .Call(`_InSituType_lls_rna`, mat, bgsub, x, bg, size_dnb)
21 | }
22 | 
23 | #' Row sums of Gaussian densities
24 | #'
25 | #' Thin wrapper: forwards its arguments unchanged to the compiled C++ routine via \code{.Call}.
26 | #'
27 | #' @param mat dgCMatrix expression matrix
28 | #' @param bgsub Vector of background expression per cell
29 | #' @param x Numeric expression values for the reference profiles
30 | #' @param xsd Numeric standard deviations (SDs) for the reference profiles
31 | #' 
32 | #' @return Row sums of the matrix of densities
33 | #' @useDynLib InSituType, .registration = TRUE
34 | #' @importFrom Rcpp evalCpp
35 | #' @exportPattern "^[[:alpha:]]+" 
36 | #' @export
37 | lls_protein <- function(mat, bgsub, x, xsd) {
38 |     .Call(`_InSituType_lls_protein`, mat, bgsub, x, xsd)
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/R/chooseClusterNumber.R:
--------------------------------------------------------------------------------
  1 | #' Estimate the correct number of clusters using a subset of the data
  2 | #'
  3 | #' For a subset of the data, perform clustering under a range of cluster numbers.
  4 | #'  Report on loglikelihood vs. number of clusters, and suggest a best choice.
  5 | #' @param counts Counts matrix, cells * genes. 
  6 | #' @param neg Vector of mean negprobe counts per cell (default = "rna")
  7 | #' @param assay_type Assay type of RNA, protein 
  8 | #' @param bg Expected background
  9 | #' @param fixed_profiles Matrix of cluster profiles to hold unchanged throughout iterations.
 10 | #' @param fixed_sds Matrix of SDs expression of genes x cell types,to hold unchanged throughout iterations. Only for assay_type of protein
 11 | #' @param cohort Vector of cells' cohort assignments. 
 12 | #' @param init_clust Vector of initial cluster assignments.
 13 | #' @param n_clusts Vector giving a range of cluster numbers to consider.
 14 | #' @param max_iters Number of iterations in each clustering attempt. Recommended to choose
 15 | #'  a smaller number for a quicker, approximate clustering.
 16 | #' @param subset_size Number of cells to include in clustering.
 17 | #' @param align_genes Logical, for whether to align the genes in fixed_profiles with the colnames in count
 18 | #' @param plotresults Logical, for whether to plot the results.
 19 | #' @param nb_size The size parameter to assume for the NB distribution.
 20 | #' @param pct_drop the decrease in percentage of cell types with a valid switchover to 
 21 | #'  another cell type compared to the last iteration. Default value: 1/10000. A valid 
 22 | #'  switchover is only applicable when a cell has changed the assigned cell type with its
 23 | #'  highest cell type probability increased by min_prob_increase. 
 24 | #' @param min_prob_increase the threshold of probability used to determine a valid cell 
 25 | #'  type switchover
 26 | #' @param ... Arguments passed to nbclust.
 27 | #' @export
 28 | #'
 29 | #' @importFrom graphics plot
 30 | #' @importFrom graphics lines
 31 | #' @importFrom graphics par
 32 | #' @importFrom stats lm
 33 | #'
 34 | #' @return A list, with the following elements:
 35 | #' \itemize{
 36 | #'  \item
 37 | #' }
 38 | #' @examples
 39 | #' data("mini_nsclc")
 40 | #' chooseClusterNumber(mini_nsclc$counts, Matrix::rowMeans(mini_nsclc$neg), assay_type="RNA",
 41 | #'  n_clust = 2:5)
 42 | 
 43 | chooseClusterNumber <-
 44 |   function(counts,
 45 |            neg,
 46 |            assay_type = c("rna", "protein"),
 47 |            bg = NULL,
 48 |            fixed_profiles = NULL,
 49 |            fixed_sds = NULL,
 50 |            cohort = NULL,
 51 |            init_clust = NULL,
 52 |            n_clusts = 2:12,
 53 |            max_iters = 10,
 54 |            subset_size = 1000,
 55 |            align_genes = TRUE,
 56 |            plotresults = FALSE,
 57 |            nb_size = 10,
 58 |            pct_drop = 0.005,
 59 |            min_prob_increase = 0.05,
 60 |            ...) {
 61 |     assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))  
 62 | 
 63 |   # infer bg if not provided: assume background is proportional to the scaling factor s
 64 |   s <- rowSums(counts)
 65 |   if (is.null(bg)) {
 66 |     bgmod <- stats::lm(neg ~ s - 1)
 67 |     bg <- bgmod$fitted
 68 |   } 
 69 | 
 70 |   # subset the data:
 71 |   use <- sample(seq_len(nrow(counts)), subset_size)
 72 |   counts <- counts[use, ]
 73 |   s <- s[use]
 74 |   neg <- neg[use]
 75 |   bg <- bg[use]
 76 |   if (!is.null(init_clust)) {
 77 |     init_clust <- init_clust[use]
 78 |   }
 79 | 
 80 |   if (length(n_clusts) <= 0) {
 81 |     stop("n_clusts needs to be more than one value.")
 82 |   } else if (!all(sapply(n_clusts, function(x) x > 0 && x / as.integer(x) == 1))) {
 83 |     stop("n_clusts need to be a vector of positive integers.")
 84 |   }
 85 | 
 86 |   # align genes in fixed_profiles:
 87 |   if (align_genes && !is.null(fixed_profiles)) {
 88 |     sharedgenes <- intersect(rownames(fixed_profiles), colnames(counts))
 89 |     counts <- counts[, sharedgenes]
 90 |     fixed_profiles <- fixed_profiles[sharedgenes, ]
 91 |     fixed_sds <- fixed_sds[sharedgenes, ]
 92 |   }  
 93 |   # cluster under each value of n_clusts, and save loglik:
 94 |   totallogliks <- sapply(n_clusts, function(x) {
 95 |     
 96 |     # get init clust:
 97 |     tempinit <- rep(letters[seq_len(x)], each = ceiling(nrow(counts) / x))[
 98 |       seq_len(nrow(counts))]
 99 |    
100 |     # run nbclust:
101 |     message(sprintf("Clustering with n_clust = %s", x))
102 |     tempclust <- nbclust(
103 |       counts = counts, 
104 |       neg = neg, 
105 |       bg = bg, 
106 |       fixed_profiles = fixed_profiles,
107 |       fixed_sds = fixed_sds, 
108 |       cohort = cohort,
109 |       init_clust = tempinit,
110 |       nb_size = nb_size,
111 |       assay_type=assay_type,
112 |       pct_drop = pct_drop,
113 |       min_prob_increase = min_prob_increase,
114 |       max_iters = max_iters)  
115 | 
116 |     # get the loglik of the clustering result:
117 |     loglik_thisclust <- lldist(x = tempclust$profiles,
118 |                                mat = counts,
119 |                                xsd = tempclust$sds,
120 |                                bg = bg,
121 |                                size = nb_size,
122 |                                assay_type = assay_type)
123 | 
124 |     total_loglik_this_clust <- sum(apply(loglik_thisclust, 1, max))
125 |     return(total_loglik_this_clust)
126 |   })
127 | 
128 |   # report goodness-of-fit
129 |   n_parameters <- n_clusts * ncol(counts)
130 |   aic <- n_parameters * 2 - 2 * totallogliks
131 |   bic <- n_parameters * log(nrow(counts)) - 2 * totallogliks
132 | 
133 |   best_clust_number <- n_clusts[order(aic)[1]]
134 | 
135 |   if (plotresults) {
136 |     original_par <- par()$mfrow
137 |     graphics::par(mfrow = c(2, 1))
138 |     graphics::plot(n_clusts, totallogliks, xlab = "Number of clusters", ylab = "Log-likelihood")
139 |     graphics::lines(n_clusts, totallogliks)
140 |     graphics::plot(n_clusts, aic, xlab = "Number of clusters", ylab = "AIC")
141 |     graphics::lines(n_clusts, aic)
142 |     par(mfrow = original_par)
143 |   }
144 | 
145 |   out <- list(best_clust_number = best_clust_number,
146 |               n_clusts = n_clusts,
147 |               loglik = totallogliks,
148 |               aic = aic,
149 |               bic = bic)
150 |   return(out)
151 | }
152 | 


--------------------------------------------------------------------------------
/R/colorCellTypes.R:
--------------------------------------------------------------------------------
  1 | #' Function to choose colors for cell types
  2 | #'
  3 | #' Uses Giotto::getDistinctColors to begin with. Orders colors so the most
  4 | #' common cell types get the lightest colors. Removes colors that are too light
  5 | #' (sum of rgb values > 600)
  6 | #' @param names Vector of cell type names
  7 | #' @param freqs Optional, named vector of cell type abundance (e.g. c(T = 1000,
  8 | #'   tumor = 15000...))
  9 | #' @param init_colors Optional, a named vector of cell colors. This will be used
 10 | #'   for all cell types in the "names" vector that match names(init_colors).
 11 | #'   Intended for use with the iocolors vector (found in the Ptolemy package
 12 | #'   data).
 13 | #' @param max_sum_rgb Don't return any colors with total rgb values above this
 14 | #'   level. (Removes excessively light colors.)
 15 | #' @param palette One of "tableau20", "brewers" or "earthplus".
 16 | #' @return A named color vector
 17 | #' @importFrom grDevices col2rgb colors
 18 | #' @export
 19 | #' @examples
 20 | #' data("mini_nsclc")
 21 | #' unsup <- insitutype(
 22 | #'  x = mini_nsclc$counts,
 23 | #'  neg = Matrix::rowMeans(mini_nsclc$neg),
 24 | #'  n_clusts = 8,
 25 | #'  n_phase1 = 200,
 26 | #'  n_phase2 = 500,
 27 | #'  n_phase3 = 2000,
 28 | #'  n_starts = 1,
 29 | #'  max_iters = 5,
 30 | #'  assay_type="RNA"
 31 | #' ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended.
 32 | #' colorCellTypes(freqs = table(unsup$clust), palette = "brewers")
 33 | 
 34 | colorCellTypes <- function(names = NULL, freqs = NULL, init_colors = NULL, max_sum_rgb = 600, 
 35 |                            palette = "earthplus") {
 36 |   
 37 |   if (is.null(freqs) && is.null(names)) {
 38 |     stop("must specify either names or freqs")
 39 |   } 
 40 |   
 41 |   if (is.null(freqs) && palette == "earthplus") {
 42 |     warning("this palette is best used when cell frequencies are known.")
 43 |   }
 44 |   
 45 |   if (is.null(freqs)) {
 46 |     # format names into freqs, then work with freqs henceforth
 47 |     freqs <- rep(1, length(names))
 48 |     names(freqs) <- names
 49 |   }
 50 |   
 51 |   ### "brewers" version: increasingly bright Rcolorbrewer paletted:
 52 |   if (palette == "brewers") {
 53 |     # start with R colorbrewer pallettes, then add a ton of filler colors:
 54 |     cols <- c('#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462','#B3DE69','#FCCDE5',
 55 |               '#D9D9D9','#BC80BD','#CCEBC5','#FFED6F','#66C2A5','#FC8D62','#8DA0CB','#E78AC3',
 56 |               '#A6D854','#FFD92F','#E5C494','#B3B3B3','#E41A1C','#377EB8','#4DAF4A','#984EA3',
 57 |               '#FF7F00','#FFFF33','#A65628','#F781BF','#999999','firebrick','darkorange2','tan3',
 58 |               'magenta','wheat4','palevioletred2','dodgerblue4','tomato3','mediumspringgreen',
 59 |               'grey26','antiquewhite4','red1','blue2','olivedrab4','lightyellow1','rosybrown3',
 60 |               'lightsteelblue4','rosybrown','rosybrown2','snow1','pink4','ghostwhite','ivory4',
 61 |               'lightgoldenrod','royalblue1','deeppink1','white','violetred2','hotpink2',
 62 |               'lightblue3','chartreuse4','azure2','plum','springgreen2','lemonchiffon1',
 63 |               'goldenrod2','grey6','darkorchid','palevioletred4','green4','lightsalmon1',
 64 |               'saddlebrown','rosybrown1','antiquewhite1','whitesmoke','plum4','cyan2',
 65 |               'forestgreen','burlywood3','lightyellow4','firebrick1','khaki3','salmon3',
 66 |               'sienna2','coral1','tan1','mediumvioletred','springgreen1','lemonchiffon',
 67 |               'lightgoldenrod4','darkred','navajowhite1','lightcoral','mediumturquoise',
 68 |               'lavenderblush','mistyrose1','indianred2','darkgoldenrod4','lightgoldenrod1',
 69 |               'lightsalmon3','lavender','magenta4','tomato2','seashell3','purple','tan2',
 70 |               'palevioletred3','coral3','lightblue1','darkorange4','orange1','darkolivegreen',
 71 |               'maroon1','skyblue3','cadetblue2','mediumorchid3','gold3','violetred1',
 72 |               'ivory2','snow4','aquamarine','darkgrey','darkolivegreen3','turquoise4',
 73 |               'sienna4','springgreen4','peachpuff4','seashell','violet','turquoise',
 74 |               'bisque2','lightsteelblue2','honeydew','lightsteelblue3','lawngreen',
 75 |               'tomato4','lightsalmon4','chocolate2','black','lightpink4','deepskyblue4',
 76 |               'aquamarine3','dodgerblue1','salmon1','yellow3','wheat','skyblue4','navajowhite4',
 77 |               'purple2','lavenderblush1','darkorange1','khaki2','aquamarine1','honeydew2',
 78 |               'cornsilk','lightskyblue4','mediumpurple2','paleturquoise1','seashell1',
 79 |               'darkcyan','orchid','royalblue','darkseagreen2','seagreen4','darkmagenta',
 80 |               'lightblue','mediumblue','chocolate3','yellow','darkgoldenrod2','mediumorchid4',
 81 |               'palegreen2','olivedrab','darkslateblue','chocolate1','maroon2','grey36',
 82 |               'orangered','goldenrod1','bisque3','deeppink3','peachpuff3','darkgreen',
 83 |               'royalblue4','darkgoldenrod1','blanchedalmond','mistyrose4','turquoise2',
 84 |               'ivory3','orchid1','limegreen','mediumpurple1','darkorange3','lemonchiffon4',
 85 |               'palevioletred1','magenta2','blue4','cyan1','thistle4','peru','grey56','cornsilk4',
 86 |               'mediumorchid2','green2','lightblue4','salmon4','burlywood4','burlywood1','orange',
 87 |               'burlywood','purple4','plum1','violetred3','khaki4','lightgoldenrodyellow',
 88 |               'lavenderblush3','lightpink3','azure4','orangered4','yellow2','mistyrose2',
 89 |               'deepskyblue2','mediumaquamarine','slateblue1','orange2','coral2','darkorchid4',
 90 |               'lightsalmon','gold2','darkseagreen')
 91 |     cols <- cols[!duplicated(cols)]
 92 |     
 93 |     # remove colors that are too light:
 94 |     sum_rgb <- colSums(grDevices::col2rgb(cols))
 95 |     cols <- cols[sum_rgb < max_sum_rgb]
 96 |     # add more colors if needed:
 97 |     n_removed <- sum(sum_rgb >= max_sum_rgb)
 98 |     if (n_removed > 0) {
 99 |       newcols <-  sample(colors()[!grepl("grey", colors())], length(freqs) * 2)[length(freqs) + seq_len(length(freqs))]
100 |       newcols <- newcols[colSums(grDevices::col2rgb(newcols)) < max_sum_rgb]
101 |       cols <- c(cols, newcols[seq_len(n_removed)])
102 |     }
103 |     
104 |     # order so the most common cells have lighter colors:
105 |     cols <- cols[seq_along(freqs)]
106 |     names(cols) <- names(freqs)[order(freqs, decreasing = TRUE)]
107 |   }
108 |   
109 |   ### "tableau20" palette: start with the tablueau20 colors:
110 |   if (palette == "tableau20") {
111 |     tab20 <- c('#aec7e8','#ffbb78','#98df8a','#ff9896','#c5b0d5','#c7c7c7',
112 |                '#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b',
113 |                '#e377c2','#17becf','#7f7f7f',
114 |                '#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462',
115 |                '#B3DE69','#FCCDE5','#D9D9D9','#BC80BD','#CCEBC5','#FFED6F',
116 |                sample(colors()[!grepl("grey", colors())], 200, replace = FALSE))
117 |     cols <- tab20[seq_along(freqs)]
118 |     names(cols) <- names(freqs)[order(freqs)]
119 |   }
120 |   
121 |   ### "earthplus" palette: earthtones for common cells, radiant colors for rare cells:
122 |   if (palette == "earthplus") {
123 |     # step 1: top least common cells, as long as <1% freq, get "radiant" colors:
124 |     radiantcolors <-
125 |       c(
126 |         "#FF0000",
127 |         "#00CCFF",
128 |         "#00FF00",
129 |         "#FFFF00",
130 |         "#FF00CC",
131 |         "#00FFFF",
132 |         "#FF3300",
133 |         "#CC00FF",
134 |         "#CCFF00",
135 |         "#66FF33"
136 |       )
137 |     richcolors <- c("#660099", "#006600", "#000000", "#000066")
138 |     nlow <- min(sum(freqs < 0.01), 14)
139 |     lowcols <- c(radiantcolors, richcolors)[seq_len(nlow)]
140 |     
141 |     # step 2: most common cells, as long as >10% freq, get "earth" colors:
142 |     earthtones <- c('#D9AF6B','#AF6458','#526A83','#68855C','#9C9C5E','#855C75')
143 |     nhigh <- min(sum(freqs > 0.1), length(earthtones)) 
144 |     highcols <- earthtones[seq_len(nhigh)]
145 |     
146 |     # step 3: remainder get mid-range colors
147 |     moderatecolors <- c('#1D6996','#73AF48','#E17C05','#94346E','#EDAD08','#38A6A5', 
148 |                         '#CC503E','#0F8554','#5F4690',   
149 |                         '#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462',
150 |                         '#B3DE69','#FCCDE5','#D9D9D9','#BC80BD','#CCEBC5','#FFED6F',
151 |                         sample(colors()[!grepl("grey", colors())], 200, replace = FALSE))
152 |     nmid <- length(freqs) - nlow - nhigh
153 |     if (nmid < length(moderatecolors)) {
154 |       midcols <- moderatecolors[seq_len(nmid)]
155 |     } else {
156 |       stop("too many cell types")
157 |     }
158 |     cols <- c(lowcols, midcols, highcols)
159 |     names(cols) <- names(freqs)[order(freqs)]
160 |   }
161 |   
162 |   # if init_colors are provided, use them when possible:
163 |   if (!is.null(init_colors)) {
164 |     overlap <- intersect(names(cols), names(init_colors))
165 |     cols[overlap] <- init_colors[overlap]
166 |   }
167 |   
168 |   return(cols)
169 | }
170 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Small example SMI data from a NSCLC tumor
 3 | #'
 4 | #' A 2000-cell excerpt from a 1000-plex SMI study of a NSCLC tumor. 
 5 | #'
 6 | #' @format A list with the following elements:
 7 | #'  \itemize{
 8 | #'  \item counts A matrix of raw counts, with cells in rows and genes in columns
 9 | #'  \item neg A matrix of negprobe counts, with cells in rows and negprobes in columns
10 | #'  \item x x positions
11 | #'  \item y y position
12 | #'  \item umap umap projection
13 | #'  }
14 | "mini_nsclc"
15 | 
16 | 
17 | 
18 | #' Matrix of immune cell profiles
19 | #'
20 | #' A matrix of gene * cell type expected expression values
21 | #'
22 | #' @format A matrix of 27161 genes x 16 cell types. 
23 | "ioprofiles"
24 | 
25 | #' Default colors for the cell types in the ioprofiles matrix
26 | #'
27 | #' A named vector of colors, giving colors for the cell types of the ioprofiles
28 | #'  matrix.
29 | #'
30 | #' @format A named vector
31 | "iocolors"
32 | 
33 | 
34 | #' Small example SMI protein data from a tonsil tissue
35 | #'
36 | #' A 21844-cells excerpt from a 68-plex SMI study of a tonsil tissue. 
37 | #'
38 | #' @format A list with the following elements:
39 | #'  \itemize{
40 | #'  \item counts A matrix of raw counts, with cells in rows and proteins in columns
41 | #'  \item negs A matrix of IgG counts, with cells in rows and IgGs in columns
42 | #'  \item xy_coord x and y positions
43 | #'  \item UMAP umap projection
44 | #'  }
45 | "tonsil_protein"
46 | 
47 | 
48 | 
49 | #' Reference profile examples from a tonsil tissue
50 | #'
51 | #' @format A list with the following elements:
52 | #'  \itemize{
53 | #'  \item tonsil_reference_profile A matrix of raw counts, with cells in rows and proteins in columns
54 | #'  \item counts A matrix of IgG counts, with cells in rows and IgGs in columns
55 | #'  \item xy_coord x and y positions
56 | #'  \item UMAP umap projection
57 | #'  }
58 | "tonsil_reference_profile"
59 | 
60 | 
61 | #' Matrix of anchor cells' annotation file
62 | #'  A matrix including cell_ID and cellType for anchors cells
63 | #' 
64 | #'  matrix.
65 | #'
66 | #' @format A matrix of 11844 cells and 2 columns
67 | "tonsil_annotation"
68 | 
69 | 
70 | #' Example human marker proteins 
71 | #'  For inputting into \code{gen_profiles_protein_expression()}
72 | #' 
73 | #'  data frame
74 | #'
75 | #' @format A matrix of 11844 cells and 2 columns
76 | "human_signature"
77 | 
78 | 
79 | #' Example mouse marker proteins 
80 | #'  For inputting into \code{gen_profiles_protein_expression()}
81 | #' 
82 | #'  data frame
83 | #'
84 | #' @format A matrix of 11844 cells and 2 columns
85 | "mouse_signature"
86 | 


--------------------------------------------------------------------------------
/R/fastCohorting.R:
--------------------------------------------------------------------------------
 1 | #' Quickly split cells into cohorts 
 2 | #' 
 3 | #' Quickly split cells into cohorts using non-RNA data like spatial context and immunofluorescence values.
 4 | #' Rule of thumb: include any variables that might be informative for cell typing, 
 5 | #'  *except* variables you'll want to analyze later. For example, if you'll later
 6 | #'  perform differential expression as a function of spatial context, then it's 
 7 | #'  safer to exclude spatial context from the cell typing exercise (and therefore 
 8 | #'  from this function).
 9 | #' @param mat Matrix of variables to be used in cohorting, cells in rows, and variables in columns.
10 | #'  Recommended to use < 20 variables. 
11 | #' @param n_cohorts Number of clusters to divide cells into
12 | #' @param gaussian_transform Whether to map each variable onto the quantiles of a normal distribution. 
13 | #' @return A vector of cohort assignments. 
14 | #' @export
15 | #' @importFrom mclust Mclust
16 | #' @importFrom mclust predict.Mclust
17 | #' @importFrom mclust mclustBIC
18 | #' @importFrom stats qnorm
19 | #' @examples
20 | #' data("mini_nsclc")
21 | #' ## simulate immunofluorescence data: 
22 | #' immunofluordata <- matrix(rpois(n = nrow(mini_nsclc$counts) * 4, lambda = 100), 
23 | #'                           nrow(mini_nsclc$counts))
24 | #' cohort <- fastCohorting(immunofluordata, gaussian_transform = TRUE)
25 | #' table(cohort)
26 | fastCohorting <- function(mat, n_cohorts = NULL, gaussian_transform = TRUE) {
27 |   
28 |   if (any(is.na(mat))) {
29 |     stop("NA's detected in mat. fastCohorting needs complete data.")
30 |   }
31 | 
32 |   # gaussian transform if called for:
33 |   if (gaussian_transform) {
34 |     for (i in seq_len(ncol(mat))) {
35 |       mat[, i] <- qnorm(rank(mat[, i]) / (nrow(mat) + 1))
36 |     }
37 |   }
38 |   
39 |   # choose number of cohorts:
40 |   if (is.null(n_cohorts)) {
41 |     n_cohorts <- 3
42 |     if (nrow(mat) > 10000) n_cohorts <- 10
43 |     if (nrow(mat) > 50000) n_cohorts <- 25
44 |     if (nrow(mat) > 100000) n_cohorts <- 50
45 |     if (nrow(mat) > 200000) n_cohorts <- 100
46 |   }
47 |  
48 |   # cluster in a subsample:
49 |   sub <- sample(seq_len(nrow(mat)), min(20000, nrow(mat)))
50 |   tryCatch({
51 |     mc <- mclust::Mclust(data = mat[sub, ], G = n_cohorts, modelNames = "EEE")
52 |     if(is.null(mc)) stop("Cohorting failed with ", ncohorts, " groups. Results in NULL mclust::Mclust object.")
53 |     # classify all cells:
54 |     cohort <- mclust::predict.Mclust(object = mc, newdata = mat)$classification
55 |   },error = function(e){
56 |     message("First attempt at autocohorting failed, possibly due to high collinearity of biomarkers. User should consider manually cohorting.") 
57 |     message("Automatically attempting to cohort in 2-PC space:") 
58 |     message(paste0("Projecting data to a lower dimensional 2-PC space for cohorting."))
59 |     message(paste0("Error in cohorting with ", n_cohorts, " groups."))
60 |     
61 |     ### project to 2-d pca space
62 |     pc2 <- irlba::prcomp_irlba(mat, n=min(ncol(mat), 2))
63 |     n_cohorts_try <- rev(c(2, 3, 10, 25, 50, 100))
64 |     n_cohorts_try <- n_cohorts_try[n_cohorts_try <= n_cohorts]
65 |     
66 |     for(ii in seq_along(n_cohorts_try)){
67 |       tryCatch({
68 |         # cluster in a subsample:
69 |         mc <<- mclust::Mclust(data = pc2$x[sub, ], G = n_cohorts_try[ii], modelNames = "EEE")
70 |         if(is.null(mc)) stop("Cohorting PC's with ", ncohorts_try[ii], " groups results in NULL mclust::Mclust object.")
71 |         break
72 |       }, error = function(e){
73 |         if(ii == length(n_cohorts_try)) stop("All attempts at cohorting have failed.  Please take a look at biomarkers used for cohorting to diagnose potential issues.")
74 |         message(paste0("Error in cohorting with ", n_cohorts_try[ii], " groups."))
75 |         message(paste0("Retrying with ", n_cohorts_try[ii + 1], " groups."))
76 |       })
77 |     }
78 |     # classify all cells:
79 |     cohort <<- mclust::predict.Mclust(object = mc, newdata = pc2$x)$classification
80 |   }) 
81 |   
82 |   names(cohort) <- rownames(mat)
83 |   return(cohort)
84 | }
85 | 


--------------------------------------------------------------------------------
/R/flightpath_layout.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #' "Flightpath" (umap-like) plot of clustering results
  4 | #'
  5 | #' Arrays cells in 2d space based on their probability of belonging to a given
  6 | #' cluster.
  7 | #' @param logliks Matrix of cells' log-likelihoods under each cluster. Must
  8 | #'   provide this or probs argument.
  9 | #' @param probs Matrix of cells' probabilities of belonging to each cluster.
 10 | #'   Must provide this or logliks argument.
 11 | #' @param profiles Matrix of cell type mean expression profiles. If provided,
 12 | #'   profiles rather than probs will be used to lay out the centroids.
 13 | #' @param cluster_xpos Vector of cluster centroids' x positions (i.e. where you
 14 | #'   want each cell type to appear in the plot)
 15 | #' @param cluster_ypos Vector of cluster centroids' y positions
 16 | #' @return A list with two elements: \enumerate{ \item clustpos: a matrix of
 17 | #'   cluster centroids * x,y positions in the flightpath plot \item cellpos: A
 18 | #'   matrix of cells * x,y positions in the flightpath plot }
 19 | #' @importFrom umap umap
 20 | #' @importFrom stats rnorm
 21 | #' @export
 22 | #' @examples
 23 | #' data("mini_nsclc")
 24 | #' unsup <- insitutype(
 25 | #'  x = mini_nsclc$counts,
 26 | #'  neg = Matrix::rowMeans(mini_nsclc$neg),
 27 | #'  assay_type = "RNA",
 28 | #'  n_clusts = 8,
 29 | #'  n_phase1 = 200,
 30 | #'  n_phase2 = 500,
 31 | #'  n_phase3 = 2000,
 32 | #'  n_starts = 1,
 33 | #'  max_iters = 5
 34 | #' ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended.
 35 | #' flightpath_layout(logliks = unsup$logliks, profiles = unsup$profiles)
 36 | flightpath_layout <- function(logliks = NULL, probs = NULL, profiles = NULL, cluster_xpos = NULL, cluster_ypos = NULL) {
 37 | 
 38 |   if (is.null(probs) && is.null(logliks)) {
 39 |     stop("Must provide either probs or logliks.")
 40 |   }
 41 |   if (is.null(probs) && !is.null(logliks)) {
 42 |     probs <- logliks2probs(logliks)
 43 |   }
 44 |   # force NA probs to 0:
 45 |   probs <- replace(probs, is.na(probs), 0)
 46 |   # get cluster centroid positions if not pre-specified:
 47 |   if (is.null(cluster_xpos) || is.null(cluster_ypos)) {
 48 |     # controls for a umap-based layout:
 49 |     conf <- umap::umap.defaults
 50 |     conf$min_dist <- 3
 51 |     conf$spread <- conf$min_dist * 1.1
 52 |     conf$n_neighbors <- ncol(probs)
 53 |     if (!is.null(profiles)) {
 54 |       clustum <- umap::umap(t(sqrt(profiles)), config = conf)$layout
 55 |     } else {
 56 |       clustum <- umap::umap(t(probs), config = conf)$layout
 57 |     }
 58 |     
 59 |     cluster_xpos <- clustum[, 1]
 60 |     cluster_ypos <- clustum[, 2]
 61 |   }
 62 | 
 63 |   # get cell xy positions as a weighted average of the umap positions
 64 |   ux <- probs %*% cluster_xpos
 65 |   uy <- probs %*% cluster_ypos
 66 |   
 67 |   # jitter the xy positions, jittering widely for prob = 1 cells and minimally for prob < 0.5 cells:
 68 |   jitterrange <- 0.01 * c(0.0005, 0.9) * max(diff(range(ux)), diff(range(uy))) 
 69 |   jitteramount <- jitterrange[1] + pmax((2 * apply(probs, 1, max) - 1), 0)  * jitterrange[2]
 70 |   ux <- ux + rnorm(length(ux), mean = 0, sd = jitteramount)
 71 |   uy <- uy + rnorm(length(ux), mean = 0, sd = jitteramount)
 72 | 
 73 |   out <- list(clustpos = cbind(cluster_xpos, cluster_ypos),
 74 |              cellpos = cbind(ux, uy),
 75 |              clust = colnames(probs)[apply(probs, 1, which.max)])
 76 |   colnames(out$clustpos) <- c("x", "y")
 77 |   colnames(out$cellpos) <- c("x", "y")
 78 |   
 79 |   # get clusters' mean confidence:
 80 |   out$meanconfidence <- getMeanClusterConfidence(probs)
 81 |   return(out)
 82 | }
 83 | 
 84 | 
 85 | 
 86 | 
 87 | #'Plot flightpath results
 88 | #'
 89 | #'@param flightpath_result The list output by the flightpath_layout function.
 90 | #'  Two elements: clustpos, cellpos. Must provide either this or
 91 | #'  insitutype_result.
 92 | #'@param insitutype_result The list output by insitutype or insitutypeML. Must
 93 | #'  provide either this or insitutype_result.
 94 | #'@param col Optional, a vector of cell colors, with length equal to the number
 95 | #'  of individual cells.
 96 | #'@param showclusterconfidence Logical, for whether to label clusters with the
 97 | #'  average posterior probability of the cells within them. Gives a readout of
 98 | #'  how distinct a cluster is from the others.
 99 | #'@importFrom utils data
100 | #'@importFrom scales alpha
101 | #'@import ggplot2
102 | #'@importFrom grDevices colors
103 | #'@importFrom rlang .data
104 | #'@return a ggplot object
105 | #'
106 | #'@export
107 | #'@examples 
108 | #' data("ioprofiles")
109 | #' unsup <- insitutype(
110 | #'  x = mini_nsclc$counts,
111 | #'  neg = Matrix::rowMeans(mini_nsclc$neg),
112 | #'  n_clusts = 8,
113 | #'  n_phase1 = 200,
114 | #'  n_phase2 = 500,
115 | #'  n_phase3 = 2000,
116 | #'  n_starts = 1,
117 | #'  max_iters = 5,
118 | #'  assay_type="RNA"
119 | #' ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended.
120 | #' flightpath_plot(insitutype_result = unsup)
121 | 
122 | flightpath_plot <- function(flightpath_result = NULL, insitutype_result = NULL, col = NULL, showclusterconfidence = TRUE){
123 |   
124 |   # get the flightpath results to use 
125 |   if (!is.null(flightpath_result) && !is.null(insitutype_result)) {
126 |     warning("flightpath_result and insitutype_result were both provided. Using only flightpath_result.")
127 |     insitutype_result <- NULL
128 |   }
129 |   if (is.null(flightpath_result) && is.null(insitutype_result)) {
130 |     stop("Must provide either flightpath_result or insitutype_result.")
131 |   }
132 |   if (is.null(flightpath_result)) {
133 |     flightpath_result <- flightpath_layout(logliks = insitutype_result$logliks, profiles = insitutype_result$profiles)
134 |   }
135 |   
136 |   # create color scheme if needed:
137 |   if (is.null(col)) {
138 |     utils::data("iocolors", package = "InSituType", envir = environment())
139 |     scols <- c('#8DD3C7','#FFFFB3','#BEBADA','#FB8072','#80B1D3','#FDB462','#B3DE69','#FCCDE5','#D9D9D9','#BC80BD',
140 |                '#CCEBC5','#FFED6F','#E41A1C','#377EB8','#4DAF4A','#984EA3','#FF7F00','#FFFF33','#A65628','#F781BF','#999999', 
141 |                sample(colors()[!grepl("grey", colors())], 100))[seq_along(unique(flightpath_result$clust))]
142 |     names(scols) <- unique(flightpath_result$clust)
143 |     iotypespresent <- intersect(names(environment()[['iocolors']]), names(scols))
144 |     scols[iotypespresent] <- environment()[['iocolors']][iotypespresent]
145 |     col <- scols[flightpath_result$clust]
146 |   }
147 | 
148 |   # prep data for plotting:
149 |   df <-
150 |     data.frame(
151 |       x = flightpath_result$cellpos[, 1],
152 |       y = flightpath_result$cellpos[, 2],
153 |       col = scales::alpha(col, 0.7)
154 |     )
155 |   df_text <- data.frame(x = flightpath_result$clustpos[, 1],
156 |                         y = flightpath_result$clustpos[, 2],
157 |                         group = rownames(flightpath_result$clustpos),
158 |                         col = "black")
159 |   
160 |   if (showclusterconfidence) {
161 |     confthresh <- 0.8
162 |     confidencecolors <- c('#FEB24C','#FD9D43','#FC863A','#FC6330','#F64226',
163 |                           '#E8251F','#D2111F','#B60224','#620015','#000000')
164 |     df_text$col <- confidencecolors[
165 |       1 + round(9 * (pmax(flightpath_result$meanconfidence, confthresh) - confthresh) / (1 - confthresh))]
166 |     
167 |     df_text$group <- paste0(df_text$group, "(", round(flightpath_result$meanconfidence, 2), ")")
168 |   }
169 |   p <- ggplot2::ggplot() +
170 |     ggplot2::geom_point(df, mapping  = ggplot2::aes(x = flightpath_result$cellpos[, 1], 
171 |                                                     y = flightpath_result$cellpos[, 2], 
172 |                                                     color = I(col),
173 |                                                     size = I(0.1))) +
174 |     ggplot2::scale_color_identity() +
175 |     ggplot2::geom_text(df_text,
176 |               mapping = ggplot2::aes(x = .data$x, y = .data$y, label = .data$group, col = I(col)),
177 |               size = 3) +
178 |     ggplot2::xlab("") +
179 |     ggplot2::ylab("") +
180 |     ggplot2::theme_bw() +
181 |     ggplot2::theme(legend.position = "none",
182 |           panel.grid = ggplot2::element_blank(),
183 |           axis.text = ggplot2::element_blank())
184 |   flightpath_plot_folder <- "./NBClust-Plots" # tempdir()
185 |   if (!dir.exists(flightpath_plot_folder)) dir.create(flightpath_plot_folder, showWarnings = FALSE, recursive = TRUE)
186 |   flightpath_plot_filename <- paste(format(Sys.time(), "%Y-%m-%d_%H-%M-%S-%Z"), "flightpath_plot.png", sep="-")
187 |   flightpath_plot_file <- paste(flightpath_plot_folder,flightpath_plot_filename , sep="/")
188 |   message("Saving flightpath_plot to: ", flightpath_plot_file)
189 |   ggsave(filename = flightpath_plot_filename, plot = p, device = "png", path = flightpath_plot_folder,
190 |          width = 7,
191 |          height = 7,
192 |          units="in")
193 | 
194 |   return(p)
195 | }
196 | 
197 | 
#' Summarize clusters' mean confidence
#'
#' Calculate the mean confidence of the cell calls from each cluster. A cell
#' counts towards a cluster when that cluster's probability equals the cell's
#' row-wise maximum, so tied cells count towards every tied cluster.
#' @param probs Matrix of probabilities, cells in rows and clusters in columns.
#'   The column names are used as cluster names.
#' @return A named numeric vector of mean confidences, one per column of
#'   \code{probs}, with values of 1 corresponding to clusters with only
#'   prob == 1. A cluster to which no cell is assigned gets \code{NaN}.
#' @examples
#' data("mini_nsclc")
#' probs <- sapply(rownames(mini_nsclc$counts), function(x) {a = runif(10); a/sum(a)})
#' dimnames(probs)[[1]] <- letters[1:10]
#' probs <- t(probs)
#' getMeanClusterConfidence(probs)
getMeanClusterConfidence <- function(probs) {

  # best probability of each cell; NA entries are ignored
  maxprobs <- apply(probs, 1, max, na.rm = TRUE)

  # vapply() guarantees a numeric vector whatever the number of clusters
  meanconfidence <- vapply(colnames(probs), function(name) {
    # which() discards NA comparisons, so a missing probability cannot turn
    # into an NA row index and poison the whole cluster mean
    thisclust <- which(probs[, name] == maxprobs)
    mean(probs[thisclust, name])
  }, numeric(1))

  return(meanconfidence)
}
219 | 


--------------------------------------------------------------------------------
/R/gen_profiles_protein.R:
--------------------------------------------------------------------------------
  1 | #' Generate the mean reference profile and its SD reference profile based on the data itself
  2 | #' This function is based on signature matrix included in CELESTA package 
  3 | #' First, we rebuild a nested cell typing lists based on the 2-D signature matrix
  4 | #' Second, we identify anchor cells ranked by their expression level for each cell type's protein marker
  5 | #' Third, we estimate averaged expression level and SDs for proteins and cell types using the anchors
  6 | #'
  7 | #' @param exp.mat a matrix of raw protein expression data. cells are in rows and proteins are in columns
  8 | #' @param sig_mat a signature matrix of cell types. cell types x protein markers 
  9 | #' @param cutoff a cutoff of quantile. e.g) cutoff=0.9 means that top 90 percentiles of cells are called anchors for the protein expression
 10 | #' @param min.num.cells a minimum number of cells each cell type to estimate its mean or SDs. default value is 30.
 11 | #' @param keep_marker_proteins whether just marker proteins from the signature matrix is kept. default value is FALSE, which returns all proteins included in the data
 12 | #' 
 13 | #' @importFrom magrittr %>%
 14 | #' @importFrom tibble rownames_to_column column_to_rownames
 15 | #' @importFrom dplyr summarise_all group_by filter
 16 | #' @return A list, with the following elements:
 17 | #' \enumerate{
 18 | #' \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins x cell types
 19 | #' \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins x cell types
 20 | #' \item anchors: a vector giving "anchor" cell types. Vector elements will be mainly NA's (for non-anchored cells)
 21 | #' }
 22 | #' @name gen_profiles_protein_expression
 23 | #' @examples 
 24 | #' data("tonsil_protein")
 25 | #' data("human_signature")
 26 | #' data("mouse_signature")
 27 | #' references <- gen_profiles_protein_expression(
 28 | #'  exp.mat=tonsil_protein$counts,
 29 | #'  sig_mat=NULL)
 30 | gen_profiles_protein_expression <- function(exp.mat, sig_mat=NULL, cutoff=0.9, min.num.cells=30, keep_marker_proteins=FALSE){
 31 | 
 32 |   if(is.null(sig_mat)){
 33 | 
 34 |     ## call the human's signature matrix
 35 |     sig_mat = InSituType::human_signature
 36 |     
 37 |     ## If the panel is for mouse, we call the mouse's signature matrix
 38 |     if(length(intersect(names(sig_mat), names(exp.mat)) == 0)){
 39 |       sig_mat = InSituType::mouse_signature
 40 |     }
 41 |   }
 42 | 
 43 |   markerProteins <- intersect(colnames(sig_mat), colnames(exp.mat))
 44 |   ## Split Lineage levels into columns
 45 |   sig_mat[is.na(sig_mat)] <- 0
 46 |   sig_mat$level1 <- lapply(strsplit(sig_mat$Lineage_level, "_"), function(x){x[1]}) %>% unlist()
 47 |   sig_mat$level2 <- lapply(strsplit(sig_mat$Lineage_level, "_"), function(x){x[2]}) %>% unlist()
 48 |   sig_mat$level3 <- lapply(strsplit(sig_mat$Lineage_level, "_"), function(x){x[3]}) %>% unlist()
 49 |   
 50 |   markerProtein_celltype_level <- vector("list", length=max(sig_mat$level1))
 51 |   for (i in 1:max(sig_mat$level1)){
 52 |     
 53 |     if(i ==1){
 54 |       markerProtein_celltype_level[[i]] <- data.frame(celltype = sig_mat[sig_mat$level1==i,]$celltype, 
 55 |                                                       marker_protein=apply(sig_mat[sig_mat$level1==i,], 1, function(x){colnames(sig_mat[sig_mat$level1==i,])[which(x==1)[1]]}),
 56 |                                                       upper_celltype = "Parent")
 57 |     }else{
 58 |       markerProtein_celltype_level[[i]] <- data.frame(celltype = sig_mat[sig_mat$level1==i,]$celltype, 
 59 |                                                       marker_protein=apply(sig_mat[sig_mat$level1==i,], 1, function(x){colnames(sig_mat[sig_mat$level1==i,])[which(x==1)[1]]}),
 60 |                                                       upper_celltype = sig_mat$celltype[which(sig_mat$level3==unique(sig_mat[sig_mat$level1==i,]$level2))])
 61 |     }
 62 |   }
 63 |   
 64 |   dat_mat_level <- vector("list", length=max(sig_mat$level1))
 65 |   for (i in 1:length(markerProtein_celltype_level)){
 66 |     if(i ==1){
 67 |       dat_mat_level[[i]] <- lapply(markerProtein_celltype_level[[i]]$marker_protein, function(x){
 68 |         if(max(exp.mat)<=1){
 69 |           cutoff <- 0.9
 70 |         }else{
 71 |           cutoff <- quantile(exp.mat[, x], prob=0.9)
 72 |         }
 73 |         rownames(exp.mat)[which(exp.mat[, x] > cutoff)]
 74 |       })
 75 |       names(dat_mat_level[[i]]) <- markerProtein_celltype_level[[i]]$celltype
 76 |     }else{
 77 |       
 78 |       dat_mat_level[[i]] <- vector("list", nrow(markerProtein_celltype_level[[i]]))
 79 |       names(dat_mat_level[[i]]) <- markerProtein_celltype_level[[i]]$celltype
 80 |       
 81 |       for(j in 1:length(markerProtein_celltype_level[[i]]$celltype)){
 82 |         
 83 |         if(!is.na(markerProtein_celltype_level[[i]][j,]$marker_protein)){
 84 |           
 85 |           ## Identify the upper level's cell type and where it is located in the signature matrix' lineage level
 86 |           for(k in 1:(i-1)){
 87 |             tempDD <- markerProtein_celltype_level[[k]] %>% filter(celltype==markerProtein_celltype_level[[i]]$upper_celltype[1])
 88 |             
 89 |             if(nrow(tempDD)==1){
 90 |               tempMar=tempDD
 91 |               idx_k=k
 92 |             }else{
 93 |               paste("pass")
 94 |             }
 95 |           }
 96 |           
 97 |           tempD <- exp.mat[rownames(exp.mat) %in% dat_mat_level[[idx_k]][[tempMar$celltype]], ]
 98 |           
 99 |           if(max(exp.mat)<=1){
100 |             cutoff <- 0.9
101 |           }else{
102 |             cutoff <- quantile(tempD[, markerProtein_celltype_level[[i]][j,]$marker_protein], prob=0.9)
103 |           }
104 |           
105 |           tempID <- rownames(tempD)[which(tempD[, markerProtein_celltype_level[[i]][j,]$marker_protein] > cutoff)]
106 |           
107 |           dat_mat_level[[idx_k]][[tempMar$celltype]] <- setdiff(dat_mat_level[[idx_k]][[tempMar$celltype]], tempID)       
108 |           dat_mat_level[[i]][[markerProtein_celltype_level[[i]]$celltype[j]]] <- tempID
109 |         }else{
110 |           break
111 |           
112 |         }
113 |       }
114 |     }
115 |   }
116 |   
117 |   markerProtein_celltype_all <- do.call("rbind", markerProtein_celltype_level)
118 |   marker_id_cell_type <- do.call(c, dat_mat_level)
119 |   marker_id_cell_type_insitu <- marker_id_cell_type[lapply(marker_id_cell_type, length)!=0]
120 |   marker_id_cell_type_insitu_df <- lapply(1:length(marker_id_cell_type_insitu), function(x){data.frame(cell_ID=marker_id_cell_type_insitu[[x]], 
121 |                                                                                                     celltype=rep(names(marker_id_cell_type_insitu[x]), length(marker_id_cell_type_insitu[[x]])))})
122 |   names(marker_id_cell_type_insitu_df) <- names(marker_id_cell_type_insitu)
123 |   anchors <- do.call("rbind", marker_id_cell_type_insitu_df) %>% as.data.frame()
124 |   anchors_duplicate <- anchors[which(duplicated(anchors$cell_ID)==TRUE),]$cell_ID
125 |   
126 |   marker_id_cell_type_unique <- lapply(marker_id_cell_type_insitu_df, 
127 |                                        function(x) {
128 |                                          tempV <- setdiff(x$cell_ID, anchors_duplicate)
129 |                                          if(length(tempV) > 20){
130 |                                            names(tempV) <- x[x$cell_ID %in% tempV,]$celltype
131 |                                            tempV <- tempV
132 |                                          }else{
133 |                                            tempV <- NULL
134 |                                          }
135 |                                          return(tempV)})
136 |   
137 |   # marker_id_cell_type_unique <- Filter(Negate(is.null), marker_id_cell_type_unique)
138 |   
139 |   anchors <- anchors[which(duplicated(anchors$cell_ID)==FALSE),] 
140 |   anchors <- anchors %>% filter(celltype %in% names(marker_id_cell_type_unique))
141 |   
142 |   anchors <- rbind(anchors, data.frame(cell_ID = setdiff(rownames(exp.mat), anchors$cell_ID), celltype=NA))
143 |   rownames(anchors) <- anchors$cell_ID
144 |   anchors$cell_ID <- NULL
145 |   anchors <- t(anchors)[1,]
146 |   
147 |   ############################ Estimate averaged protein expression each cell type with its anchor cells ######################################
148 |   protein_exp_means_list <- lapply(marker_id_cell_type_unique, function(x){
149 |     
150 |     mean.exp <- exp.mat[rownames(exp.mat) %in% x, ] %>% colMeans()
151 |     
152 |   })
153 |   
154 |   mean.ref.profile <- do.call("rbind", protein_exp_means_list) %>% t() %>% as.data.frame()
155 |   
156 |   protein_exp_SDs_list <- lapply(marker_id_cell_type_unique, function(x){
157 |     apply(exp.mat[rownames(exp.mat) %in% x, ], 2, sd )
158 |   })
159 |   names(protein_exp_SDs_list) <- names(marker_id_cell_type_unique)
160 |   SDs.ref.profile <- do.call("rbind", protein_exp_SDs_list) %>% t() %>% as.data.frame()
161 |   
162 |   if(keep_marker_proteins){
163 |     mean.ref.profile <- mean.ref.profile[markerProteins, ]
164 |     SDs.ref.profile <- SDs.ref.profile[markerProteins, ]
165 |   }
166 |   out <- list(mean.ref.profile=mean.ref.profile, SDs.ref.profile=SDs.ref.profile, anchors=anchors[rownames(exp.mat)])
167 |   return(out)
168 | }
169 | 
170 | 
#' Generate the mean reference profile and its SD reference profile from an annotation file
#'
#' This function is only for protein data set with known anchor cells and their cell types
#'
#' @param exp.mat a matrix of raw protein expression data. cells are in rows and proteins are in columns
#' @param anno a data frame or matrix of cell types for anchor cells or manually
#'   annotated cell typing information for some cells. Must include the columns
#'   \code{cell_ID} and \code{cellType}.
#' 
#' @return A list, with the following elements:
#' \enumerate{
#' \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins * cell types
#' \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins * cell types
#' \item anchors: a vector giving "anchor" cell types, named by and ordered like the rows of \code{exp.mat}. Elements are NA for cells that are not annotated.
#' }
gen_profiles_protein_annotation <- function(exp.mat, anno) {

  # coerce so that a matrix `anno` works too, and keep only the columns used
  anno <- as.data.frame(anno)[, c("cell_ID", "cellType")]

  # expression of the annotated cells only, with their cell type attached
  anno_ref_mat <- merge(exp.mat %>% as.data.frame() %>% rownames_to_column(var="cell_ID"), anno, by="cell_ID") %>% column_to_rownames(var="cell_ID")

  mean.ref.profile <- anno_ref_mat %>% group_by(cellType) %>% summarise_all(mean) %>% column_to_rownames(var="cellType") %>% t()
  SDs.ref.profile <- anno_ref_mat %>% group_by(cellType) %>% summarise_all(sd) %>% column_to_rownames(var="cellType") %>% t()

  ## Set NAs for non-anchor cells' cell types.
  ## match() yields NA for cells missing from `anno`, and unlike building a
  ## data.frame of the leftover cells it also works when every cell is annotated.
  anchors <- as.character(anno$cellType[match(rownames(exp.mat), anno$cell_ID)])
  names(anchors) <- rownames(exp.mat)

  out <- list(mean.ref.profile=mean.ref.profile,
              SDs.ref.profile=SDs.ref.profile,
              anchors=anchors)
  return(out)
}
203 | 


--------------------------------------------------------------------------------
/R/getProfiles.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Extract mean background-subtracted profiles of RNA data
 3 | #'
 4 | #' Given cell assignments and count data, estimate the mean
 5 | #'  profile of each cluster.
 6 | #' 
 7 | #' @param x Counts matrix, cells * genes.
 8 | #' @param clust Vector of cluster assignments, or a matrix of probabilities
 9 | #'   of cells (rows) belonging to clusters (columns).
10 | #' @param neg Vector of mean background counts (or a single value applied to all cells)
11 | #' @return A matrix of gene x cell type expression profiles. 
12 | #' @export
13 | getRNAprofiles <- function(x, neg, clust) {
14 |   if (length(neg) == 1) {
15 |     neg <- rep(neg, nrow(x))
16 |   }
17 |   temp <- Estep(counts = x, clust = clust, neg = neg, assay_type = "RNA")
18 |   return(temp$profiles)
19 | }
20 | 
#' Extract mean profiles and SDs of protein data
#'
#' Given cell assignments and protein expression data, estimate the
#'  parameters of each cluster by running \code{Estep} with
#'  \code{assay_type = "protein"}.
#' @param x Expression matrix, cells * proteins.
#' @param clust Vector of cluster assignments, or a matrix of probabilities
#'   of cells (rows) belonging to clusters (columns).
#' @return List with two elements: "profiles", a matrix of protein x cell type expression profiles, and "sds", a matrix of SD's.
#' @export
getProteinParameters <- function(x, clust) {
  # no background argument is passed on for protein data
  Estep(counts = x, clust = clust, assay_type = "protein")
}
35 | 


--------------------------------------------------------------------------------
/R/getSpatialContext.R:
--------------------------------------------------------------------------------
  1 | #' Get the neighborhood expression profile around all cells
  2 | #' @param counts Counts matrix
  3 | #' @param xy 2-column matrix of cells' xy positions
  4 | #' @param tissue vector of tissue IDs. Used to ensure cells for different tissues are never called neighbors
  5 | #' @param N number of neighbors to use. Specify this or \code{rad}. 
  6 | #' @param rad radius to use to define neighbors. Specify this or \code{N}. 
  7 | #' @param dim_reduce_to If entered, the neighborhood matrix will be reduced to this many PCs
  8 | #' @return A matrix of neighborhood expression, potentially by gene, or else by PCs if \code{dim_reduce_to} was set.
  9 | #' @export
 10 | #' @importFrom irlba prcomp_irlba
 11 | getSpatialContext <- function(counts, xy, tissue = NULL, N = 50, rad = NULL, dim_reduce_to = NULL) {
 12 |   
 13 |   # define neighbors:
 14 |   if (is.null(tissue)) {
 15 |     tissue = 1
 16 |   }
 17 |   if (!is.null(N)) {
 18 |     neighbors <- nearestNeighborGraph(x = xy[, 1], y = xy[, 2], N = N, subset = tissue) 
 19 |     rad <- NULL
 20 |   } 
 21 |   if (!is.null(rad)) {
 22 |     neighbors <- radiusBasedGraph(x = xy[, 1], y = xy[, 2], R = rad, subset = tissue) 
 23 |   } 
 24 |   
 25 |   # get neighborhood expression:
 26 |   neighborexpression <- get_neighborhood_expression(counts = counts, neighbors = neighbors) 
 27 |   
 28 |   # dimension reduce
 29 |   if (!is.null(dim_reduce_to)) {
 30 |     neighborexpression <- irlba::prcomp_irlba(neighborexpression, n = dim_reduce_to)$x
 31 |   }
 32 |   return(neighborexpression)
 33 | }
 34 | 
 35 | 
 36 | 
 37 | 
 38 | #' Create spatial network from N nearest neighbors
 39 | #'
 40 | #' For each cell identify \code{N} nearest neighbors in Euclidean space and
 41 | #' create an edge between them in graph structure, optionally subset cells (see
 42 | #' Details).
 43 | #'
 44 | #' Edges will only be created for cells that have the same \code{subset} value,
 45 | #' usually the slide column id but could also be a slide plus FOV id to only
 46 | #' create edges within an FOV.
 47 | #'
 48 | #' @param x spatial coordinate
 49 | #' @param y spatial coordinate
 50 | #' @param N number of nearest neighbors
 51 | #' @param subset same length as x,y (see Details)
 52 | #'
 53 | #' @return sparse adjacency matrix with distances
 54 | #' @importFrom data.table data.table
 55 | #' @importFrom data.table rbindlist
 56 | #' @importFrom spatstat.geom nnwhich
 57 | #' @importFrom spatstat.geom nndist
 58 | #' @importFrom Matrix sparseMatrix
 59 | nearestNeighborGraph <- function(x, y, N, subset=1) {
 60 |   DT <- data.table::data.table(x = x, y = y, subset = subset)
 61 |   nearestNeighbor <- function(i) {
 62 |     subset_dt <- DT[subset == i]
 63 |     idx <- which(DT[["subset"]] == i)
 64 |     ndist <- spatstat.geom::nndist(subset_dt[, .(x, y)],
 65 |                                    k=1:N)
 66 |     nwhich <- spatstat.geom::nnwhich(subset_dt[, .(x, y)],
 67 |                                      k=1:N)
 68 |     ij <- data.table::data.table(i = idx[1:nrow(subset_dt)],
 69 |                                  j = idx[as.vector(nwhich)],
 70 |                                  x = as.vector(ndist))
 71 |     return(ij)
 72 |   }
 73 |   ij <- data.table::rbindlist(lapply(unique(subset), nearestNeighbor))
 74 |   adj.m <- Matrix::sparseMatrix(i = ij$i, j = ij$j, x = ij$x, dims = c(nrow(DT), nrow(DT)))
 75 |   return(adj.m)
 76 | }
 77 | 
 78 | #' Create spatial network from neighbors within radius R
 79 | #'
 80 | #' For each cell identify neighbors within distance \code{R} in Euclidean space
 81 | #' and create an edge between them in graph structure, optionally subset cells
 82 | #' (see Details).
 83 | #'
 84 | #' Edges will only be created for cells that have the same \code{subset} value,
 85 | #' usually the slide column id but could also be a slide plus FOV id to only
 86 | #' create edges within an FOV.
 87 | #'
 88 | #' @param x spatial coordinate
 89 | #' @param y spatial coordinate
 90 | #' @param R radius
 91 | #' @param subset same length as x,y (see Details)
 92 | #'
 93 | #' @return sparse adjacency matrix with distances
 94 | #' @importFrom data.table data.table
 95 | #' @importFrom data.table rbindlist
 96 | #' @importFrom Matrix sparseMatrix
 97 | #' @importFrom spatstat.geom ppp
 98 | #' @importFrom spatstat.geom closepairs
 99 | radiusBasedGraph <- function(x, y, R, subset=1) {
100 |   DT <- data.table::data.table(x = x, y = y, subset = subset)
101 |   radiusNeighbor <- function(i) {
102 |     subset_dt <- DT[subset == i]
103 |     idx <- which(DT[["subset"]] == i)
104 |     pp <- spatstat.geom::ppp(subset_dt$x, subset_dt$y,
105 |                              range(subset_dt$x), range(subset_dt$y))
106 |     cp <- spatstat.geom::closepairs(pp, R)
107 |     ij <- data.table::data.table(i = idx[cp$i],
108 |                                  j = idx[cp$j],
109 |                                  x = cp$d)
110 |     return(ij)
111 |   }
112 |   ij <- data.table::rbindlist(lapply(unique(subset), radiusNeighbor))
113 |   adj.m <- Matrix::sparseMatrix(i = ij$i, j = ij$j, x = ij$x, dims = c(nrow(DT), nrow(DT)))
114 |   return(adj.m)
115 | }
116 | 
117 | 
118 | 
#' Calculate neighborhood expression
#'
#' Averages the expression of each cell's neighbors, as given by the
#' adjacency matrix, via \code{neighbor_colMeans}.
#' @param counts Single cell expression matrix
#' @param neighbors A neighbors adjacency matrix
#' @return A matrix in the same dimensions as \code{counts}, giving the expression profile of each cell's neighborhood.
get_neighborhood_expression <- function(counts, neighbors) {

  # every column of the adjacency matrix must correspond to a row of counts
  if (ncol(neighbors) != nrow(counts)) {
    stop("misalignment between nrow(counts) and ncol(neighbors)")
  }

  neighborhood_means <- neighbor_colMeans(counts, neighbors)
  rownames(neighborhood_means) <- rownames(neighbors)
  as.matrix(neighborhood_means)
}
137 | 
#' For each cell, get the column means of x over its neighbors
#'
#' The stored entries of \code{neighbors} are treated as 0/1 indicators; the
#' distances they hold are discarded.
#' @param x A matrix
#' @param neighbors A (probably sparse) adjacency matrix
neighbor_colMeans <- function(x, neighbors) {
  # turn every stored edge into a unit weight
  neighbors@x <- rep(1, length(neighbors@x))
  # scale each row by one over its neighbor count so that rows sum to 1
  inverse_counts <- 1 / Matrix::rowSums(neighbors)
  averaging <- Matrix::Diagonal(x = inverse_counts) %*% neighbors
  zero_weight <- averaging@x == 0
  averaging@x[zero_weight] <- 1
  averaging %*% x
}
148 | 
149 | 
150 | 
#' For each cell, get the column sums of x over its neighbors
#'
#' The stored entries of \code{neighbors} are treated as 0/1 indicators; the
#' distances they hold are discarded.
#' @param x A matrix
#' @param neighbors A (probably sparse) adjacency matrix
neighbor_colSums <- function(x, neighbors) {
  # turn every stored edge into a unit weight
  neighbors@x <- rep(1, length(neighbors@x))
  # multiply by an identity diagonal (unit row weights)
  unit_rows <- Matrix::Diagonal(x = rep(1, nrow(neighbors)))
  indicator <- unit_rows %*% neighbors
  zero_weight <- indicator@x == 0
  indicator@x[zero_weight] <- 1
  indicator %*% x
}
161 | 


--------------------------------------------------------------------------------
/R/insitutypeML.R:
--------------------------------------------------------------------------------
  1 | #' Classify cells based on reference profiles
  2 | #' 
  3 | #' Supervised classification of cells. Each cell is assigned to the cell type 
  4 | #'  under which its observed expression profile is most likely. 
  5 | #' @param x Counts matrix (or dgCMatrix), cells * genes.
  6 | #'
  7 | #'   Alternatively, a \linkS4class{SingleCellExperiment} object containing such
  8 | #'   a matrix.
  9 | #' @param neg Vector of mean negprobe counts per cell. Can be provided 
 10 | #' @param bg Expected background
 11 | #' @param cohort Vector of cells' cohort memberships
 12 | #' @param reference_profiles Matrix of expression profiles of pre-defined clusters,
 13 | #'  e.g. from previous scRNA-seq. These profiles will not be updated by the EM algorithm.
 14 | #'  Colnames must all be included in the init_clust variable.
 15 | #' @param reference_sds Matrix of standard deviation profiles of pre-defined
 16 | #'   clusters. These SD profiles also will not be updated by the EM algorithm. 
 17 | #'   Columns must all be included in the init_clust variable. This parameter should
 18 | #'   be defined if assay_type is protein. Default is NULL. 
 19 | #' @param nb_size The size parameter to assume for the NB distribution.
 20 | #' @param align_genes Logical, for whether to align the counts matrix and the reference_profiles by gene ID.
 21 | #' @param assay_type Assay type of RNA, protein (default = "rna") 
 22 | #' @param ... For the \linkS4class{SingleCellExperiment} method, additional
 23 | #'   arguments to pass to the ANY method.
 24 | #' @param assay.type A string specifying which assay values to use.
 25 | #' @return A list, with the following elements:
 26 | #' \enumerate{
 27 | #' \item clust: a vector given cells' cluster assignments
 28 | #' \item prob: a vector giving the confidence in each cell's cluster
 29 | #' \item profiles: Matrix of clusters' mean background-subtracted profiles
 30 | #' \item logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns.
 31 | #' }
 32 | #'
 33 | #' @name insitutypeML
 34 | #' @examples
 35 | #' data("mini_nsclc")
 36 | #' data("ioprofiles")
 37 | #' sup <- insitutypeML(
 38 | #'  x = mini_nsclc$counts,
 39 | #'  neg = Matrix::rowMeans(mini_nsclc$neg),
 40 | #'  reference_profiles = ioprofiles,
 41 | #'  assay_type = "RNA")
 42 | #' table(sup$clust)
 43 | NULL
 44 | 
 45 | .insitutypeML <- function(x, neg = NULL, bg = NULL, cohort = NULL, 
 46 |                           reference_profiles, 
 47 |                           reference_sds=NULL, 
 48 |                           nb_size = 10, 
 49 |                           assay_type = c("rna", "protein"), 
 50 |                           align_genes = TRUE) {
 51 |   assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))
 52 |   
 53 |   # get vector of expected background:
 54 |   bg <- estimateBackground(counts = x, neg = neg, bg = bg)
 55 |   
 56 |   # align genes:
 57 |   if (align_genes) {
 58 |     x <- alignGenes(counts = x, profiles = reference_profiles)
 59 |     reference_profiles <- reference_profiles[colnames(x), ]
 60 |     if (!is.null(reference_sds)) {
 61 |       reference_sds <- reference_sds[colnames(x), ]
 62 |     }
 63 |   }
 64 |   
 65 |   # prep cohort vector:
 66 |   if (is.null(cohort)) {
 67 |     cohort <- rep("all", length(bg))
 68 |   }
 69 |   
 70 |   logliks <- lldist(x = reference_profiles,
 71 |                     xsd = reference_sds,
 72 |                     mat = x,
 73 |                     bg =bg, 
 74 |                     size = nb_size,
 75 |                     assay_type=assay_type)
 76 |   
 77 | 
 78 |   # update logliks based on frequencies within cohorts:
 79 |   logliks <- update_logliks_with_cohort_freqs(logliks = logliks, 
 80 |                                               cohort = cohort, 
 81 |                                               minfreq = 1e-4, 
 82 |                                               nbaselinecells = 100) 
 83 |   if ("undefined" %in% colnames(logliks)) {
 84 |     logliks <- logliks[, -which(colnames(logliks) == "undefined")]
 85 |   }
 86 |   features <- intersect(rownames(reference_profiles), colnames(x))
 87 |   logliks <- cbind(logliks, ifelse(Matrix::rowSums(x[, features]) == 0, 0, -Inf))
 88 |   colnames(logliks)[ncol(logliks)] <- "undefined"
 89 |   
 90 |   # get remaining outputs
 91 |   clust <- colnames(logliks)[apply(logliks, 1, which.max)]
 92 |   names(clust) <- rownames(logliks)
 93 |   
 94 |   probs <- logliks2probs(logliks)
 95 |   prob <- apply(probs, 1, max)
 96 |   names(prob) <- names(clust)
 97 |   profiles_info <- Estep(counts=x, 
 98 |                          clust = clust,
 99 |                          neg = neg, 
100 |                          assay_type=assay_type)
101 |   
102 |   profiles <- profiles_info$profiles
103 |   sds <- profiles_info$sds
104 |   
105 |   # aligns profiles and logliks, removing lost clusters:
106 |   logliks_from_lost_celltypes <- logliks[, !is.element(colnames(logliks), unique(clust)), drop = FALSE]
107 |   logliks <- logliks[, is.element(colnames(logliks), clust), drop = FALSE]
108 |   profiles <- profiles[, colnames(logliks), drop = FALSE]
109 | 
110 |   if(identical(tolower(assay_type), "rna")){
111 |     sds <- NULL
112 |   }
113 |   
114 |   out <- list(clust = clust,
115 |              prob = prob,
116 |              profiles = profiles,
117 |              sds = sds,
118 |              logliks = round(logliks, 4),
119 |              logliks_from_lost_celltypes = round(logliks_from_lost_celltypes, 4))
120 |   return(out)    
121 | }
122 | 
123 | ############################
124 | # S4 method definitions 
125 | ############################
126 | 
# Generic: dispatches on the class of `x`.
#' @export
#' @rdname insitutypeML
setGeneric(
  name = "insitutypeML",
  def = function(x, ...) standardGeneric("insitutypeML")
)

# Default method: `x` is used directly as the cells * genes matrix.
#' @export
#' @rdname insitutypeML
setMethod(f = "insitutypeML", signature = "ANY", definition = .insitutypeML)

# SingleCellExperiment method: the chosen assay is transposed before it is
# handed to the matrix implementation.
#' @export
#' @rdname insitutypeML
#' @importFrom SummarizedExperiment assay
#' @importFrom SingleCellExperiment SingleCellExperiment
setMethod(
  f = "insitutypeML",
  signature = "SingleCellExperiment",
  definition = function(x, ..., assay.type="counts") {
    cells_by_genes <- t(assay(x, i = assay.type))
    .insitutypeML(cells_by_genes, ...)
  }
)
142 | 


--------------------------------------------------------------------------------
/R/refineClusters.R:
--------------------------------------------------------------------------------
  1 | #' Merge cell types in a clustering result
  2 | #'
  3 | #' Take a user-defined list of cells types to rename/combine, then re-compute
  4 | #' cluster assignments and probabilities under the merged cell types.
  5 | #' @param assay_type Assay type of RNA, protein (default = "rna")
  6 | #' @param merges A named vector in which the elements give new cluster names and
  7 | #'   the names give old cluster names. OK to omit cell types that aren't being
  8 | #'   merged.
  9 | #' @param to_delete A vector of cluster names to delete. All cells assigned to
 10 | #'   these clusters will be reassigned to the next best cluster.
 11 | #' @param subcluster A list, where each element's name is a cell type to
 12 | #'   subcluster, and the element itself is the cluster number(s) to use. E.g.
 13 | #'   list("macrophages" = 2, "cancer" = 2:3)
 14 | #' @param logliks Matrix of log-likelihoods output by insitutype, cells in rows,
 15 | #'   clusters in columns
 16 | #' @param counts Counts matrix, cells * genes. Only needed if subclustering is
 17 | #'   run.
 18 | #' @param neg Vector of mean negprobe counts per cell. Only needed if
 19 | #'   subclustering is run.
 20 | #' @param bg Expected background. Optional, and only used if subclustering is
 21 | #'   run.
 22 | #' @param cohort Vector of cells' cohort memberships. Optional, and only needed
 23 | #'   if subclustering is run.
 24 | #' @return A list with two elements: \enumerate{ \item clust: a vector of
 25 | #'   cluster assignments \item prob: Vector of posterior probabilities for each
 26 | #'   cell type \item logliks: a matrix of probabilities of all cells (rows)
 27 | #'   belonging to all clusters (columns) \item profiles: a matrix of the average
 28 | #'   background-subracted profile of each cell type after
 29 | #'   merging/deleting/subclustering }
 30 | #' @export
 31 | #' @examples
 32 | #' #example merges argument:
 33 | #' merges = c("macrophages" = "myeloid",  # merge 3 clusters
 34 | #'            "monocytes" = "myeloid",
 35 | #'            "mDC" = "myeloid",
 36 | #'            "B-cells" = "lymphoid")    # just rename 1 cluster
 37 | #' # example to_delete argument:
 38 | #' to_delete = c("neutrophils")
 39 | #' # example subcluster argument:
 40 | #' subcluster = list("Myofibroblast" = 2:3)
 41 | refineClusters <- function(assay_type = c("rna", "protein"), 
 42 |                            merges = NULL, to_delete = NULL, subcluster = NULL, 
 43 |                            logliks,
 44 |                            counts = NULL, 
 45 |                            neg = NULL, bg = NULL, 
 46 |                            cohort = NULL) {
 47 |   assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))
 48 |   
 49 |   # check that provided cell names are all in logliks:
 50 |   if (any(!is.element(names(merges), colnames(logliks)))) {
 51 |     mismatch <- setdiff(names(merges), colnames(logliks))
 52 |     stop(paste0("The following user-provided cluster name(s) in the merges argument are missing from colnames(logliks): ",
 53 |                 paste0(mismatch, collapse = ", ")))
 54 |   }
 55 |   if (any(!is.element(to_delete, colnames(logliks)))) {
 56 |     mismatch <- setdiff(to_delete, colnames(logliks))
 57 |     stop(paste0("The following user-provided cluster name(s) in the to_delete argument are missing from colnames(logliks): ",
 58 |                 paste0(mismatch, collapse = ", ")))
 59 |   }
 60 |   if (any(!is.element(names(subcluster), colnames(logliks)))) {
 61 |     mismatch <- setdiff(names(subcluster), colnames(logliks))
 62 |     stop(paste0("The following user-provided cluster name(s) in the merges argument are missing from colnames(logliks): ",
 63 |                 paste0(mismatch, collapse = ", ")))
 64 |   }
 65 |   if (length(setdiff(colnames(logliks), to_delete)) == 0) {
 66 |     stop("The to_delete argument is asking for all clusters to be deleted.")
 67 |   }
 68 |   # check that subcluster data is available:
 69 |   if (!is.null(subcluster)) {
 70 |     if (is.null(counts)) {
 71 |       stop("Must provide counts data to subcluster")
 72 |     }
 73 |     if (is.null(neg)) {
 74 |       stop("Must provide neg vector to subcluster")
 75 |     }
 76 |   }
 77 | 
 78 |   # delete those called for:
 79 |   logliks <- logliks[, !is.element(colnames(logliks), to_delete)]
 80 |   
 81 |   # prevent merging into existing names:
 82 |   duplicatednames <- intersect(merges, colnames(logliks))
 83 |   merges[is.element(merges, duplicatednames)] <- paste0(merges[is.element(merges, duplicatednames)], ".new")
 84 |   
 85 |   # get logliks under merged categories: each cell's "new" loglik in a merged cell type is
 86 |   #  its best loglik under the "old" celltype.
 87 |   newlogliks <- matrix(NA, nrow(logliks), length(unique(merges)),
 88 |                        dimnames = list(rownames(logliks), unique(merges)))
 89 |   newlogliks <- sapply(unique(merges), function(newname) {
 90 |     oldnames <- names(merges)[merges == newname]
 91 |     newlogliks[, newname] <- apply(logliks[, oldnames, drop = FALSE], 1, max, na.rm = TRUE)
 92 |   })
 93 |   if (length(newlogliks) > 0) {
 94 |     newlogliks <- cbind(newlogliks, logliks[, setdiff(colnames(logliks), names(merges)), drop = FALSE])
 95 |   } else {
 96 |     newlogliks <- logliks
 97 |   }
 98 |   
 99 |   # get new cluster assignments:
100 |   clust <- colnames(newlogliks)[apply(newlogliks, 1, which.max)]
101 |   names(clust) <- rownames(newlogliks)
102 | 
103 |   ## perform subclustering:
104 |   # subclustering logic:
105 |   # - run unsupervised clustering of the selected cell type 
106 |   # - record the subcluster logliks for the selected cells
107 |   # - for unselected cells, propagate the original supercluster loglik to the subclusters (to prevent unselected cells joining the subclusters)
108 |   # - unselected cells keep their cell type. selected cells go to whichever subcluster gives them the greatest loglik
109 |   for (name in names(subcluster)) {
110 |     message(paste0("Subclustering ", name))
111 |     use <- which(colnames(newlogliks)[apply(newlogliks, 1, which.max)] == name)
112 |     # run insitutype on just the named cell type:
113 |     temp <- insitutype(x = counts[use, ],
114 |                        assay_type = assay_type,
115 |                        neg = neg[use],
116 |                        bg = bg[use],
117 |                        cohort = cohort[use],
118 |                        n_clusts = subcluster[[name]],
119 |                        n_starts = 3, n_benchmark_cells = 5000,
120 |                        n_phase1 = 2000, n_phase2 = 10000, n_phase3 = 20000,
121 |                        n_chooseclusternumber = 2000)
122 |     
123 |     # make logliks matrix for all cells vs. the new clusters, with cells outside 
124 |     # the selected cell type retaining their original loglik for the cluster
125 |     subclustlogliks <- matrix(rep(newlogliks[, name], ncol(temp$logliks)), nrow(counts))
126 |     rownames(subclustlogliks) <- rownames(counts)
127 |     colnames(subclustlogliks) <- colnames(temp$logliks)
128 |     # for cells with subclustering results, overwrite the old logliks:
129 |     subclustlogliks[rownames(temp$logliks), colnames(temp$logliks)] <- temp$logliks  
130 |     # better names:
131 |     colnames(subclustlogliks) <- paste0(name, "_", seq_len(ncol(subclustlogliks)))
132 |     
133 |     # safeguard in case we've created a cell type name that already exists:
134 |     if (any(is.element(colnames(subclustlogliks), colnames(newlogliks)))) {
135 |       colnames(subclustlogliks) <- paste0(colnames(subclustlogliks), "subcluster")
136 |     }
137 |     
138 |     # update logliks matrix:
139 |     newlogliks <- newlogliks[, setdiff(colnames(newlogliks), name)]
140 |     newlogliks <- cbind(newlogliks, subclustlogliks)
141 |     
142 |     # update clust for the subclustered cells:
143 |     clust[use] <- colnames(subclustlogliks)[apply(subclustlogliks[use, ], 1, which.max)]
144 |   }
145 |   
146 |   # get new posterior probs:
147 |   probs <- logliks2probs(newlogliks)
148 |   prob <- apply(probs, 1, max)
149 |   names(prob) <- names(clust)
150 |   
151 |   # re-calculate profiles if available:
152 |   profiles <- NULL
153 |   sds <- NULL
154 |   if (!is.null(counts) && !is.null(neg)) {
155 |     profiles_info <- Estep(counts = counts,
156 |                            clust = clust,
157 |                            neg = neg,
158 |                            assay_type=assay_type)
159 |     profiles <- profiles_info$profiles
160 |     sds <- profiles_info$sds
161 |     
162 |   }
163 |   # aligns profiles and logliks, removing lost clusters:
164 |   logliks_from_lost_celltypes <- newlogliks[, !is.element(colnames(newlogliks), unique(clust)), drop = FALSE]
165 |   newlogliks <- newlogliks[, is.element(colnames(newlogliks), clust), drop = FALSE]
166 |   profiles <- profiles[, colnames(newlogliks), drop = FALSE]
167 |   
168 |   if(identical(tolower(assay_type), "protein")){
169 |     sds <- sds[, colnames(newlogliks), drop = FALSE]
170 |   }
171 |   
172 |   if(identical(tolower(assay_type), "rna")){
173 |     sds <- NULL
174 |   }
175 |   out <- list(clust = clust, prob = prob, logliks = round(newlogliks, 4), # (rounding logliks to save memory)
176 |               profiles = profiles, sds=sds, logliks_from_lost_celltypes = round(logliks_from_lost_celltypes, 4))  
177 |   return(out)
178 | }
179 | 
180 | 
#' Convert a probabilities matrix to log-likelihoods
#'
#' Applies the natural logarithm elementwise to \code{probs}.
#'
#' @param probs Probabilities (a matrix or vector).
#'
#' @return An object of the same shape as \code{probs} holding \code{log(probs)}.
#' @examples
#' a <- runif(10)
#' probs2logliks(a/sum(a))
probs2logliks <- function(probs) {
  logliks <- log(probs)
  logliks
}
192 | 
193 | 
#' Convert log-likelihoods to probabilities
#'
#' From cell x cluster log-likelihoods, calculate cell x cluster probabilities.
#' Each row is shifted by its maximum before exponentiating, then rescaled so
#' that the row sums to 1.
#' @param logliks Matrix of log-likelihoods, as output by insitutype. Cells in rows, clusters in columns.
#' @return A matrix of probabilities, in the same dimensions as logliks.
#' @examples
#' data("mini_nsclc")
#' unsup <- insitutype(
#'  x = mini_nsclc$counts,
#'  neg = Matrix::rowMeans(mini_nsclc$neg),
#'  n_clusts = 8,
#'  n_phase1 = 200,
#'  n_phase2 = 500,
#'  n_phase3 = 2000,
#'  n_starts = 1,
#'  max_iters = 5,
#'  assay_type="RNA"
#' ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
#' logliks2probs(unsup$logliks)
#'
logliks2probs <- function(logliks) {
  # subtract each row's best loglik so the largest term becomes exp(0) = 1:
  row_best <- apply(logliks, 1, max, na.rm = TRUE)
  shifted <- sweep(logliks, MARGIN = 1, STATS = row_best, FUN = "-")
  # relative likelihoods on the linear scale:
  rel_liks <- exp(shifted)
  # normalize each row to sum to 1:
  row_totals <- rowSums(rel_liks, na.rm = TRUE)
  sweep(rel_liks, MARGIN = 1, STATS = row_totals, FUN = "/")
}
222 | 


--------------------------------------------------------------------------------
/R/spatialUpdate.R:
--------------------------------------------------------------------------------
 1 | #' @title Update cell typing results with spatial context or other alternative data
 2 | #' 
 3 | #' @description
 4 | #' Takes cell typing results, then updates it based on alternative data types, 
 5 | #' e.g. spatial context, morphology, or protein expression. Existing cell typing results are 
 6 | #' put into Insitutype's likelihood framework, which then can use alternative data
 7 | #' as a prior to be updated by the expression data to get a new posterior probability 
 8 | #' of cell type.
 9 | #' Performs this operation by 
10 | #' \enumerate{
11 | #' \item deriving cell type profiles using InSituType:::Estep(), 
12 | #' \item assigning cells to "cohorts" (clusters) derived from their alternative data
13 | #' \item  Inputing the output of steps (1) and (2) into InSituType::insitutype() to 
14 | #'  re-calculate cell type. 
15 | #' }
16 | #' Paths for using alternative data in priority order (choose one; if multiple are input, only the most downstream option will be used):
17 | #' \enumerate{
18 | #' \item Input \code{xy} positions (and possibly \code{tissue}). Then cells will be clustered 
19 | #'  into cohorts based on the expression pattern of their 50 nearest neighboring cells.
20 | #' \item Input a matrix of alternative data (\code{altdata}) to be automatically clustered into cohorts. This supersedes 
21 | #'  the altdata matrix derived from the \code{xy} argument.
22 | #' \item Input your own \code{cohort} vector. This supersedes the above inputs. 
23 | #' }
24 | #' @param celltype Vector of cell type assignments to be updated
25 | #' @param counts Counts matrix (or dgCMatrix), cells * genes.
26 | #' @param neg Vector of mean negprobe counts per cell
27 | #' @param cohort Vector of cells' cohort memberships. Output of a spatial clustering algorithm makes for good cohorts. 
28 | #' @param altdata Matrix of cells' alternative data values
29 | #' @param xy 2-column matrix of cells' xy positions. 
30 | #' @param tissue Vector giving cells' tissue IDs. Used to separate tissue with overlapping xy coordinates.
31 | #' @param nb_size The size parameter to assume for the NB distribution.
32 | #' @param assay_type A string specifying which assay values to use.
33 | #' @importFrom irlba irlba
34 | #' @export
35 | spatialUpdate <- function(celltype, counts, neg, 
36 |                           cohort = NULL, altdata = NULL, xy = NULL, tissue = NULL,
37 |                           nb_size = 10, assay_type = c("rna", "protein")) {
38 |   
39 |   assay_type <- match.arg(tolower(assay_type), c("rna", "protein"))
40 |   
41 |   ## check alternative data args:
42 |   if(all(sapply(c(cohort, altdata, xy), is.null))) {
43 |     stop("Must supply cohort, altdata or xy")
44 |   }
45 |   
46 |   ## process alternative data, obtaining cohort vector:
47 |   if (is.null(cohort)) {
48 |     if (is.null(altdata)) {
49 |       # make altdata from cells' neighborhoods:
50 |       altdata <- getSpatialContext(counts = counts, xy = xy, tissue = tissue, 
51 |                                    N = 50, rad = NULL, dim_reduce_to = 20) 
52 |     }
53 |     # cluster altdata to get cohort:
54 |     cohort <- fastCohorting(mat = altdata, 
55 |                             gaussian_transform = TRUE) 
56 |   }
57 |   
58 |   ## derive reference profiles from initial cell type vector:
59 |   profiles <- Estep(counts = counts, 
60 |                     clust = celltype, 
61 |                     neg = neg,
62 |                     assay_type = assay_type)
63 |   print(str(profiles))
64 |   ## Run supervised cell typing with InSituType
65 |   res <- insitutype(x = counts,
66 |                     cohort = cohort,
67 |                     neg = neg, 
68 |                     reference_profiles = profiles$profiles,
69 |                     reference_sds = profiles$sds,
70 |                     n_clusts = 0,
71 |                     update_reference_profiles = FALSE,
72 |                     assay_type = assay_type)
73 |   res$cohort <- cohort
74 |   return(res)
75 | }
76 | 
77 | 
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/R/utilities.R:
--------------------------------------------------------------------------------
  1 | #' Prepare bg data for other functions 
  2 | #' 
  3 | #' Process neg data or bg to get background for each cell
  4 | #' @param counts Counts matrix, cells * genes.
  5 | #' @param neg Vector of mean negprobe counts per cell
  6 | #' @param bg Expected background
  7 | #' @return A named vector for the estimated background of each cell
  8 | 
  9 | estimateBackground <- function(counts, neg, bg = NULL){
 10 |   # infer bg if not provided: assume background is proportional to the scaling factor s
 11 |   if (is.null(bg) && is.null(neg)) {
 12 |     stop("Must provide either bg or neg")
 13 |   }
 14 |   
 15 |   if (is.null(bg)) {
 16 |     ## get neg in condition 
 17 |     if (is.null(names(neg))) {
 18 |       names(neg) <- rownames(counts)
 19 |     }
 20 |     if (length(neg) != nrow(counts)) {
 21 |       stop("length of neg should equal nrows of counts.")
 22 |     }
 23 |     
 24 |     s <- Matrix::rowMeans(counts)
 25 |     bgmod <- stats::lm(neg ~ s - 1)
 26 |     bg <- bgmod$fitted
 27 |   }
 28 |   if (length(bg) == 1) {
 29 |     bg <- rep(bg, nrow(counts))
 30 |     names(bg) <- rownames(counts)
 31 |   }
 32 |   
 33 |   # overwrite if non-positive bg
 34 |   bg[bg <=0] <- min(1e-5, bg[bg>0])
 35 |   
 36 |   return(bg)
 37 |   
 38 | }
 39 | 
 40 | 
 41 | #' align genes in counts to profiles for other functions 
 42 | #' 
 43 | #' Process counts to have genes shared with profiles
 44 | #' @param counts Counts matrix, cells * genes.
 45 | #' @param profiles Matrix of reference profiles holding mean expression of genes x cell types. 
 46 | #'  Input linear-scale expression, with genes in rows and cell types in columns.
 47 | #' @return a cells * genes count matrix for shared genes only
 48 | alignGenes <- function(counts, profiles){
 49 |   sharedgenes <- intersect(rownames(profiles), colnames(counts))
 50 |   if (length(sharedgenes) < 10) {
 51 |     stop("Profiles have fewer than 10 genes in common with panel, use different profiles or re-run InSituType in unsupervised mode.")
 52 |   }
 53 |   lostgenes <- setdiff(colnames(counts), rownames(profiles))
 54 |   
 55 |   # subset:
 56 |   counts <- counts[, sharedgenes]
 57 |   
 58 |   # warn about genes being lost:
 59 |   if ((length(lostgenes) > 0) && length(lostgenes < 50)) {
 60 |     message(
 61 |       paste0(
 62 |         "The following genes in the count data are missing from fixed_profiles and will be omitted from downstream: ",
 63 |         paste0(lostgenes, collapse = ",")
 64 |       )
 65 |     )
 66 |   }
 67 |   if (length(lostgenes) > 50) {
 68 |     message(
 69 |       paste0(
 70 |         length(lostgenes),
 71 |         " genes in the count data are missing from fixed_profiles and will be omitted from downstream"
 72 |       )
 73 |     )
 74 |   }
 75 | 
 76 |   return(counts)
 77 | }
 78 | 
 79 | 
 80 | #' Get number of cores for parallelized operations
 81 | #'
 82 | #' @param percentCores percent of cores to use for parallelization [0-1]
 83 | #' @param minNotUsedCores minimum number of cores to leave for background processes
 84 | #' 
 85 | #' @return number of cores to use for mclapply
 86 | #' @export
 87 | numCores <- function(percentCores = 0.9, minNotUsedCores = 2) {
 88 |   if(percentCores > 1 & percentCores <= 0){
 89 |     stop("percentCores is not a valid number, must be between 0-1")
 90 |   }
 91 |   
 92 |   num_cores <- 1
 93 |   if (.Platform$OS.type == "unix") {
 94 |     if (is.null(getOption("mc.cores"))) {
 95 |       num_cores <- parallel::detectCores()
 96 |       if(num_cores <= minNotUsedCores){
 97 |         stop("minNotUsedCores must be fewer than available cores")
 98 |       }
 99 |       num_cores <- min(floor(num_cores*percentCores), num_cores-minNotUsedCores)
100 |     } else {
101 |       num_cores <- getOption("mc.cores") 
102 |     }
103 |   }
104 |   return(num_cores)
105 | }
106 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # InSituType
 2 |  An R package for performing cell typing in SMI and other single cell data
 3 | 
 4 | **Manuscript**: https://www.biorxiv.org/content/10.1101/2022.10.19.512902v1.abstract
 5 | 
 6 | **Citing Insitutype**: Danaher P, Zhao E, Yang Z, Ross D, Gregory M, Reitz Z, Kim TK, Baxter S, Jackson S, He S, Henderson DA. Insitutype: likelihood-based cell typing for single cell spatial transcriptomics. bioRxiv. 2022 Jan 1.
 7 | 
 8 | ### System requirements
 9 | - R (>= 3.5.0)
10 | - UNIX, Mac or Windows
11 | - Rcpp library (>= 1.0.9)
12 | - see DESCRIPTION for full dependencies
13 | 
14 | ### Demo
15 | See the "vignettes" folder. Vignettes should run in <5 minutes. 
16 | 
17 | ### Instructions for use
18 | Run "insitutype" for unsupervised or semi-supervised clustering. Run "insitutypeML" for supervised cell typing. See the vignettes for example workflows. 
19 | 
20 | ### FAQs and tips:
21 | [https://github.com/Nanostring-Biostats/InSituType/blob/main/FAQs.md](https://github.com/Nanostring-Biostats/InSituType/blob/main/FAQs.md)
22 | 
23 | ### Reproduction instructions
24 | The full results of the Insitutype manuscript can be reproduced with the code in this repo: https://github.com/Nanostring-Biostats/InSituType-manuscript-analyses
25 | 
26 | ### Installation
27 | ```
28 | # Make sure Matrix and irlba are both up to date (otherwise versioning issues cause prcomp_irlba to error out):
29 | # (This is required as of Feb 2024; with any luck these packages will fix their versioning issues soon and this will not be necessary.)
30 | install.packages("Matrix", type = "source")
31 | install.packages("irlba", type = "source")
32 | 
33 | # Install Insitutype:
34 | devtools::install_github("https://github.com/Nanostring-Biostats/InSituType")
35 | ```
36 | Installation should take < 2 mins on a normal desktop computer. 
37 | 
38 | 
39 | ### Function dependencies:
40 | ![image](https://user-images.githubusercontent.com/4357938/200046292-ba3e3453-b201-4776-b5f5-6bf3dfce6ec6.png)
41 | 


--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
 1 | trigger: none
 2 | 
 3 | parameters:
 4 |   - name: imageTag
 5 |     displayName: 'Enter nanopipeline-build-environment image tag from last build https://dev.azure.com/Nanostring/Gemini/_build?definitionId=525&_a=summary'
 6 |     default: '1.3.2-94008'
 7 |     type: string
 8 | 
 9 | resources:
10 | - repo: self
11 | 
12 | variables:
13 |   group: smida-build-pipeline-devnext-variables
14 | 
15 | stages:
16 | - stage: Build
17 |   displayName: Build Docker image
18 |   jobs:
19 |   - job: Build_image
20 |     displayName: Build Docker image
21 |     pool:
22 |       vmImage: ubuntu-latest
23 |     variables:
24 |       - group: smida-build-pipeline-devnext-variables
25 |     steps:
26 |     - task: ECRPullImage@1
27 |       displayName: Pull NanoPipeline Build environment image from DEVNEXT HUB ECR
28 |       inputs:
29 |         awsCredentials: 'atomx-aws-devnext-hub'
30 |         regionName: '$(AWS_REGION)'
31 |         repository: 'ecr-cac1-devnext-cosmx_da_repo-devnext-hub-smida-nanopipeline-build-environment'
32 |         imageSource: 'imagetag'
33 |         imageTag: ${{ parameters.imageTag }}
34 |     - task: Bash@3
35 |       displayName: Run smida-nanopipeline-build-environment Docker Image
36 |       inputs:
37 |         targetType: 'inline'
38 |         script: |
39 |           docker run --rm -v "$(Build.SourcesDirectory)":"/source" $(DEVNEXT_HUB_ACCOUNT_ID).dkr.ecr.$(AWS_REGION).amazonaws.com/ecr-cac1-devnext-cosmx_da_repo-devnext-hub-smida-nanopipeline-build-environment:${{ parameters.imageTag }}
40 |     - task: Bash@3
41 |       displayName: Copy .tar.gz file as latest
42 |       inputs:
43 |         targetType: 'inline'
44 |         script: |
45 |           cp *.tar.gz smiCellTyping_latest.tar.gz
46 |     - task: S3Upload@1
47 |       displayName: Upload CellTyping .tar.gz files to S3 on DEVNEXT HUB
48 |       inputs:
49 |         awsCredentials: 'atomx-aws-devnext-hub'
50 |         regionName: '$(AWS_REGION)'
51 |         bucketName: 's3-cac1-devnext-smida-assets-global-devnext'
52 |         sourceFolder: '$(Build.SourcesDirectory)'
53 |         globExpressions: '+(smiCellTyping|InSituType)_*.tar.gz'
54 |         targetFolder: 'smida/assets/nanopipeline'


--------------------------------------------------------------------------------
/data/human_signature.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/human_signature.RData


--------------------------------------------------------------------------------
/data/iocolors.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/iocolors.RData


--------------------------------------------------------------------------------
/data/ioprofiles.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/ioprofiles.RData


--------------------------------------------------------------------------------
/data/mini_nsclc.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/mini_nsclc.RData


--------------------------------------------------------------------------------
/data/mouse_signature.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/mouse_signature.RData


--------------------------------------------------------------------------------
/data/tonsil_annotation.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/tonsil_annotation.RData


--------------------------------------------------------------------------------
/data/tonsil_protein.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/tonsil_protein.RData


--------------------------------------------------------------------------------
/data/tonsil_reference_profile.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Nanostring-Biostats/InSituType/f3ef0dd0814318d74675e6a613fad3ef7f1e23a1/data/tonsil_reference_profile.RData


--------------------------------------------------------------------------------
/man/Estep.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/nbclust.R
 3 | \name{Estep}
 4 | \alias{Estep}
 5 | \title{E step: estimate each cluster's mean profile}
 6 | \usage{
 7 | Estep(counts, clust, neg, assay_type = c("rna", "protein"))
 8 | }
 9 | \arguments{
10 | \item{counts}{Counts matrix, cells * genes.}
11 | 
12 | \item{clust}{Vector of cluster assignments, or a matrix of probabilities
13 | of cells (rows) belonging to clusters (columns).}
14 | 
15 | \item{neg}{Vector of mean background counts}
16 | 
17 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
18 | }
19 | \value{
20 | A list with two elements: 1.  A matrix of cluster profiles, genes * clusters. 
21 |         2. In protein mode, a matrix holding SDs, also genes * clusters. NULL in RNA mode.
22 | }
23 | \description{
24 | Given cell assignments (or posterior probabilities), estimate the mean
25 |  profile of each cluster.
26 | }
27 | \examples{
28 | data("ioprofiles")
29 | unsup <- insitutype(
30 |  x = mini_nsclc$counts,
31 |  neg = Matrix::rowMeans(mini_nsclc$neg),
32 |  n_clusts = 8,
33 |  n_phase1 = 200,
34 |  n_phase2 = 500,
35 |  n_phase3 = 2000,
36 |  n_starts = 1,
37 |  max_iters = 5,
38 |  assay_type="RNA",
39 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults in recommended.
40 | Estep(counts = mini_nsclc$counts, clust = unsup$clust, neg = Matrix::rowMeans(mini_nsclc$neg), assay_type="RNA")
41 | }
42 | 


--------------------------------------------------------------------------------
/man/Mstep.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/nbclust.R
 3 | \name{Mstep}
 4 | \alias{Mstep}
 5 | \title{M step}
 6 | \usage{
 7 | Mstep(
 8 |   counts,
 9 |   means,
10 |   sds = NULL,
11 |   cohort,
12 |   bg = 0.01,
13 |   size = 10,
14 |   digits = 2,
15 |   return_loglik = FALSE,
16 |   assay_type = c("rna", "protein")
17 | )
18 | }
19 | \arguments{
20 | \item{counts}{Counts matrix, cells * genes.}
21 | 
22 | \item{means}{Matrix of mean cluster profiles,
23 | with genes in rows and clusters in columns.}
24 | 
25 | \item{sds}{Matrix of standard deviation cluster profiles,
26 | with genes in rows and clusters in columns.}
27 | 
28 | \item{cohort}{a vector of cells' "cohort" assignment, used to update logliks 
29 | based on cluster frequencies within a cohort.}
30 | 
31 | \item{bg}{Expected background}
32 | 
33 | \item{size}{NB size parameter}
34 | 
35 | \item{digits}{Round the output to this many digits (saves memory)}
36 | 
37 | \item{return_loglik}{If TRUE, logliks will be returned. If FALSE, probabilities will be returned.}
38 | 
39 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
40 | }
41 | \value{
42 | Matrix of probabilities of each cell belonging to each cluster
43 | }
44 | \description{
45 | Compute probability that each cell belongs to a given cluster
46 | }
47 | \examples{
48 | data("mini_nsclc")
49 | data("ioprofiles")
50 | sharedgenes <- intersect(rownames(ioprofiles), colnames(mini_nsclc$counts))
51 | Mstep(mini_nsclc$counts, ioprofiles[sharedgenes, ], bg = Matrix::rowMeans(mini_nsclc$neg), cohort = NULL, assay_type="RNA")
52 | }
53 | 


--------------------------------------------------------------------------------
/man/alignGenes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utilities.R
 3 | \name{alignGenes}
 4 | \alias{alignGenes}
 5 | \title{align genes in counts to profiles for other functions}
 6 | \usage{
 7 | alignGenes(counts, profiles)
 8 | }
 9 | \arguments{
10 | \item{counts}{Counts matrix, cells * genes.}
11 | 
12 | \item{profiles}{Matrix of reference profiles holding mean expression of genes x cell types. 
13 | Input linear-scale expression, with genes in rows and cell types in columns.}
14 | }
15 | \value{
16 | a cells * genes count matrix for shared genes only
17 | }
18 | \description{
19 | Process counts to have genes shared with profiles
20 | }
21 | 


--------------------------------------------------------------------------------
/man/chooseClusterNumber.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/chooseClusterNumber.R
 3 | \name{chooseClusterNumber}
 4 | \alias{chooseClusterNumber}
 5 | \title{Estimate the correct number of clusters using a subset of the data}
 6 | \usage{
 7 | chooseClusterNumber(
 8 |   counts,
 9 |   neg,
10 |   assay_type = c("rna", "protein"),
11 |   bg = NULL,
12 |   fixed_profiles = NULL,
13 |   fixed_sds = NULL,
14 |   cohort = NULL,
15 |   init_clust = NULL,
16 |   n_clusts = 2:12,
17 |   max_iters = 10,
18 |   subset_size = 1000,
19 |   align_genes = TRUE,
20 |   plotresults = FALSE,
21 |   nb_size = 10,
22 |   pct_drop = 0.005,
23 |   min_prob_increase = 0.05,
24 |   ...
25 | )
26 | }
27 | \arguments{
28 | \item{counts}{Counts matrix, cells * genes.}
29 | 
30 | \item{neg}{Vector of mean negprobe counts per cell}
31 | 
32 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
33 | 
34 | \item{bg}{Expected background}
35 | 
36 | \item{fixed_profiles}{Matrix of cluster profiles to hold unchanged throughout iterations.}
37 | 
38 | \item{fixed_sds}{Matrix of SDs expression of genes x cell types,to hold unchanged throughout iterations. Only for assay_type of protein}
39 | 
40 | \item{cohort}{Vector of cells' cohort assignments.}
41 | 
42 | \item{init_clust}{Vector of initial cluster assignments.}
43 | 
44 | \item{n_clusts}{Vector giving a range of cluster numbers to consider.}
45 | 
46 | \item{max_iters}{Number of iterations in each clustering attempt. Recommended to choose
47 | a smaller number for a quicker, approximate clustering.}
48 | 
49 | \item{subset_size}{Number of cells to include in clustering.}
50 | 
51 | \item{align_genes}{Logical, for whether to align the genes in fixed_profiles with the colnames in count}
52 | 
53 | \item{plotresults}{Logical, for whether to plot the results.}
54 | 
55 | \item{nb_size}{The size parameter to assume for the NB distribution.}
56 | 
57 | \item{pct_drop}{the decrease in percentage of cell types with a valid switchover to 
58 | another cell type compared to the last iteration. Default value: 0.005. A valid 
59 | switchover is only applicable when a cell has changed the assigned cell type with its
60 | highest cell type probability increased by min_prob_increase.}
61 | 
62 | \item{min_prob_increase}{the threshold of probability used to determine a valid cell 
63 | type switchover}
64 | 
65 | \item{...}{Arguments passed to nbclust.}
66 | }
67 | \value{
68 | A list, with the following elements:
69 | \itemize{
70 |  \item
71 | }
72 | }
73 | \description{
74 | For a subset of the data, perform clustering under a range of cluster numbers.
75 |  Report on loglikelihood vs. number of clusters, and suggest a best choice.
76 | }
77 | \examples{
78 | data("mini_nsclc")
79 | chooseClusterNumber(mini_nsclc$counts, Matrix::rowMeans(mini_nsclc$neg), assay_type="RNA",
80 |  n_clust = 2:5)
81 | }
82 | 


--------------------------------------------------------------------------------
/man/choose_anchors_from_stats.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/find_anchor_cells.R
 3 | \name{choose_anchors_from_stats}
 4 | \alias{choose_anchors_from_stats}
 5 | \title{Choose anchor cells given anchor stats}
 6 | \usage{
 7 | choose_anchors_from_stats(
 8 |   counts,
 9 |   neg = NULL,
10 |   bg,
11 |   anchorstats = NULL,
12 |   cos = NULL,
13 |   llr = NULL,
14 |   n_cells = 500,
15 |   min_cosine = 0.3,
16 |   min_scaled_llr = 0.01,
17 |   insufficient_anchors_thresh = 20,
18 |   assay_type = c("rna", "protein")
19 | )
20 | }
21 | \arguments{
22 | \item{counts}{Counts matrix, cells * genes.}
23 | 
24 | \item{neg}{Vector of mean negprobe counts per cell}
25 | 
26 | \item{bg}{Expected background}
27 | 
28 | \item{anchorstats}{Output from get_anchor_stats. Must provide either this or
29 | both cos and llr matrices.}
30 | 
31 | \item{cos}{Matrix of cosine distances from reference profiles. Cells in rows,
32 | cell types in columns.}
33 | 
34 | \item{llr}{Matrix of log likelihood ratios from reference profiles. Cells in
35 | rows, cell types in columns.}
36 | 
37 | \item{n_cells}{Up to this many cells will be taken as anchor points}
38 | 
39 | \item{min_cosine}{Cells must have at least this much cosine similarity to a
40 | fixed profile to be used as an anchor}
41 | 
42 | \item{min_scaled_llr}{Cells must have (log-likelihood ratio / totalcounts)
43 | above this threshold to be used as an anchor}
44 | 
45 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than
46 | this many anchors will be discarded.}
47 | 
48 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
49 | }
50 | \value{
51 | A vector holding anchor cell assignments (or NA) for each cell in the
52 |   counts matrix
53 | }
54 | \description{
55 | Starting with cosine distances and log likelihood ratios, choose anchor
56 | cells.
57 | }
58 | \examples{
59 | data("ioprofiles")
60 | data("mini_nsclc")
61 | counts <- mini_nsclc$counts
62 | astats <- get_anchor_stats(counts = counts,
63 |                          neg = Matrix::rowMeans(mini_nsclc$neg),
64 |                          sds=NULL, assay_type = "RNA",
65 |                          profiles = ioprofiles)
66 | 
67 | ## estimate per-cell bg as a fraction of total counts:
68 | negmean.per.totcount <- mean(rowMeans(mini_nsclc$neg)) / mean(rowSums(counts))
69 | per.cell.bg <- rowSums(counts) * negmean.per.totcount
70 | 
71 | # now choose anchors:
72 | choose_anchors_from_stats(counts = counts, 
73 |                           neg = Matrix::rowMeans(mini_nsclc$neg),
74 |                           bg = per.cell.bg,
75 |                           anchorstats = astats, 
76 |                           # a very low value chosen for the mini
77 |                           # dataset. Typically hundreds of cells
78 |                           # would be better.
79 |                           n_cells = 50, 
80 |                           min_cosine = 0.4, 
81 |                           min_scaled_llr = 0.03, 
82 |                           insufficient_anchors_thresh = 5,
83 |                           assay_type="RNA")
84 | }
85 | 


--------------------------------------------------------------------------------
/man/colorCellTypes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/colorCellTypes.R
 3 | \name{colorCellTypes}
 4 | \alias{colorCellTypes}
 5 | \title{Function to choose colors for cell types}
 6 | \usage{
 7 | colorCellTypes(
 8 |   names = NULL,
 9 |   freqs = NULL,
10 |   init_colors = NULL,
11 |   max_sum_rgb = 600,
12 |   palette = "earthplus"
13 | )
14 | }
15 | \arguments{
16 | \item{names}{Vector of cell type names}
17 | 
18 | \item{freqs}{Optional, named vector of cell type abundance (e.g. c(T = 1000,
19 | tumor = 15000...))}
20 | 
21 | \item{init_colors}{Optional, a named vector of cell colors. This will be used
22 | for all cell types in the "names" vector that match names(init_colors).
23 | Intended for use with the iocolors vector (found in the InSituType package
24 | data).}
25 | 
26 | \item{max_sum_rgb}{Don't return any colors with total rgb values above this
27 | level. (Removes excessively light colors.)}
28 | 
29 | \item{palette}{One of "tableau20", "brewers" or "earthplus".}
30 | }
31 | \value{
32 | A named color vector
33 | }
34 | \description{
35 | Uses Giotto::getDistinctColors to begin with. Orders colors so the most
36 | common cell types get the lightest colors. Removes colors that are too light
37 | (sum of rgb values > 600)
38 | }
39 | \examples{
40 | data("mini_nsclc")
41 | unsup <- insitutype(
42 |  x = mini_nsclc$counts,
43 |  neg = Matrix::rowMeans(mini_nsclc$neg),
44 |  n_clusts = 8,
45 |  n_phase1 = 200,
46 |  n_phase2 = 500,
47 |  n_phase3 = 2000,
48 |  n_starts = 1,
49 |  max_iters = 5,
50 |  assay_type="RNA"
51 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
52 | colorCellTypes(freqs = table(unsup$clust), palette = "brewers")
53 | }
54 | 


--------------------------------------------------------------------------------
/man/estimateBackground.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utilities.R
 3 | \name{estimateBackground}
 4 | \alias{estimateBackground}
 5 | \title{Prepare bg data for other functions}
 6 | \usage{
 7 | estimateBackground(counts, neg, bg = NULL)
 8 | }
 9 | \arguments{
10 | \item{counts}{Counts matrix, cells * genes.}
11 | 
12 | \item{neg}{Vector of mean negprobe counts per cell}
13 | 
14 | \item{bg}{Expected background}
15 | }
16 | \value{
17 | A named vector for the estimated background of each cell
18 | }
19 | \description{
20 | Process neg data or bg to get background for each cell
21 | }
22 | 


--------------------------------------------------------------------------------
/man/estimatePlatformEffects.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rescaleProfiles.R
 3 | \name{estimatePlatformEffects}
 4 | \alias{estimatePlatformEffects}
 5 | \title{Platform effect adjustment on reference profiles based on the expression profiles of anchors}
 6 | \usage{
 7 | estimatePlatformEffects(
 8 |   counts,
 9 |   neg,
10 |   assay_type = c("rna", "protein"),
11 |   bg = NULL,
12 |   anchors,
13 |   profiles,
14 |   sds = NULL,
15 |   blacklist = NULL
16 | )
17 | }
18 | \arguments{
19 | \item{counts}{Counts matrix, cells * genes.}
20 | 
21 | \item{neg}{Vector of mean negprobe counts per cell}
22 | 
23 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
24 | 
25 | \item{bg}{Expected background}
26 | 
27 | \item{anchors}{Vector giving "anchor" cell types, for use in semi-supervised
28 | clustering. Vector elements will be mainly NA's (for non-anchored cells)
29 | and cell type names for cells to be held constant throughout iterations.}
30 | 
31 | \item{profiles}{Matrix of reference profiles holding mean expression of genes x cell types. 
32 | Input linear-scale expression, with genes in rows and cell types in columns.}
33 | 
34 | \item{sds}{Matrix of reference profiles holding SDs expression of genes x cell types. 
35 | Input linear-scale expression, with genes in rows and cell types in columns. Only for assay_type of protein}
36 | 
37 | \item{blacklist}{vector of user-defined genes to be excluded for cell typing (default = NULL)}
38 | }
39 | \value{
40 | A list with five elements: 
41 | \describe{
42 |     \item{rescaled_profiles}{genes * cell types Matrix of rescaled reference profiles with platform effect corrected }
43 |     \item{platformEff_statsDF}{a data.frame for statistics on platform effect estimation with genes in rows and columns for `Gene`, `Beta`, `beta_SE`.}
44 |     \item{anchors}{a named vector of anchors used for platform effect estimation}
45 |     \item{blacklist}{a vector of genes excluded from cell typing, including both outliers identified in platform effect estimation and the user-defined genes}
46 |     \item{lostgenes}{a vector of genes excluded from platform effect estimation due to insufficient evidence}
47 | }
48 | }
49 | \description{
50 | The general workflow would be: (1) extract the anchor cells from input; 
51 | (2) Run Poisson regression with anchor cells; (3) Filter user-defined genes (if any) 
52 | and genes with extreme betas, outside [0.01, 100]; (4) Re-scale Profile with Beta estimates.
53 | }
54 | \details{
55 | Calculates gene-wise scaling factor between reference profiles and the observed profiles of the provided anchors.
56 | }
57 | 


--------------------------------------------------------------------------------
/man/fastCohorting.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/fastCohorting.R
 3 | \name{fastCohorting}
 4 | \alias{fastCohorting}
 5 | \title{Quickly split cells into cohorts}
 6 | \usage{
 7 | fastCohorting(mat, n_cohorts = NULL, gaussian_transform = TRUE)
 8 | }
 9 | \arguments{
10 | \item{mat}{Matrix of variables to be used in cohorting, cells in rows, and variables in columns.
11 | Recommended to use < 20 variables.}
12 | 
13 | \item{n_cohorts}{Number of clusters to divide cells into}
14 | 
15 | \item{gaussian_transform}{Whether to map each variable onto the quantiles of a normal distribution.}
16 | }
17 | \value{
18 | A vector of cohort assignments.
19 | }
20 | \description{
21 | Quickly split cells into cohorts using non-RNA data like spatial context and immunofluorescence values.
22 | Rule of thumb: include any variables that might be informative for cell typing, 
23 |  *except* variables you'll want to analyze later. For example, if you'll later
24 |  perform differential expression as a function of spatial context, then it's 
25 |  safer to exclude spatial context from the cell typing exercise (and therefore 
26 |  from this function).
27 | }
28 | \examples{
29 | data("mini_nsclc")
30 | ## simulate immunofluorescence data: 
31 | immunofluordata <- matrix(rpois(n = nrow(mini_nsclc$counts) * 4, lambda = 100), 
32 |                           nrow(mini_nsclc$counts))
33 | cohort <- fastCohorting(immunofluordata, gaussian_transform = TRUE)
34 | table(cohort)
35 | }
36 | 


--------------------------------------------------------------------------------
/man/find_anchor_cells.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/find_anchor_cells.R
 3 | \name{find_anchor_cells}
 4 | \alias{find_anchor_cells}
 5 | \title{Choose anchor cells}
 6 | \usage{
 7 | find_anchor_cells(
 8 |   counts,
 9 |   neg = NULL,
10 |   bg = NULL,
11 |   align_genes = TRUE,
12 |   profiles,
13 |   sds = NULL,
14 |   size = 10,
15 |   assay_type = c("rna", "protein"),
16 |   n_cells = 500,
17 |   min_cosine = 0.3,
18 |   min_scaled_llr = 0.01,
19 |   insufficient_anchors_thresh = 20,
20 |   refinement = FALSE
21 | )
22 | }
23 | \arguments{
24 | \item{counts}{Counts matrix, cells * genes.}
25 | 
26 | \item{neg}{Vector of mean negprobe counts per cell}
27 | 
28 | \item{bg}{Expected background}
29 | 
30 | \item{align_genes}{Logical, for whether to align the columns of the counts
31 | matrix and the rows of the profiles matrix based on their names.}
32 | 
33 | \item{profiles}{Matrix of reference profiles holding mean expression of genes
34 | x cell types. Input linear-scale expression, with genes in rows and cell
35 | types in columns.}
36 | 
37 | \item{sds}{Matrix of reference profiles holding SDs expression of genes x cell types. 
38 | Input linear-scale expression, with genes in rows and cell types in columns. Only for assay_type of protein}
39 | 
40 | \item{size}{Negative binomial size parameter to be used in likelihood calculation. Only for assay_type of RNA}
41 | 
42 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
43 | 
44 | \item{n_cells}{Up to this many cells will be taken as anchor points}
45 | 
46 | \item{min_cosine}{Cells must have at least this much cosine similarity to a
47 | fixed profile to be used as an anchor}
48 | 
49 | \item{min_scaled_llr}{Cells must have (log-likelihood ratio / totalcounts)
50 | above this threshold to be used as an anchor}
51 | 
52 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than
53 | this many anchors will be discarded.}
54 | 
55 | \item{refinement}{flag to further refine the anchors via UMAP projection (default = FALSE)}
56 | }
57 | \value{
58 | A vector holding anchor cell assignments (or NA) for each cell in the
59 |   counts matrix
60 | }
61 | \description{
62 | Finds cells with very good fits to the reference profiles, and saves these
63 | cells for use as "anchors" in the semi-supervised learning version of
64 | nbclust. The function would first pick anchor cell candidates through stats 
65 | and then refine anchors based on umap projection.
66 | }
67 | \examples{
68 | data("ioprofiles")
69 | data("mini_nsclc")
70 | sharedgenes <- intersect(colnames(mini_nsclc$counts), rownames(ioprofiles))
71 | find_anchor_cells(counts = mini_nsclc$counts[, sharedgenes], 
72 |                   assay_type="RNA", 
73 |                   sds=NULL,
74 |                   neg = Matrix::rowMeans(mini_nsclc$neg),
75 |                   profiles = ioprofiles)
76 | }
77 | 


--------------------------------------------------------------------------------
/man/flightpath_layout.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/flightpath_layout.R
 3 | \name{flightpath_layout}
 4 | \alias{flightpath_layout}
 5 | \title{"Flightpath" (umap-like) plot of clustering results}
 6 | \usage{
 7 | flightpath_layout(
 8 |   logliks = NULL,
 9 |   probs = NULL,
10 |   profiles = NULL,
11 |   cluster_xpos = NULL,
12 |   cluster_ypos = NULL
13 | )
14 | }
15 | \arguments{
16 | \item{logliks}{Matrix of cells' log-likelihoods under each cluster. Must
17 | provide this or probs argument.}
18 | 
19 | \item{probs}{Matrix of cells' probabilities of belonging to each cluster.
20 | Must provide this or logliks argument.}
21 | 
22 | \item{profiles}{Matrix of cell type mean expression profiles. If provided,
23 | profiles rather than probs will be used to lay out the centroids.}
24 | 
25 | \item{cluster_xpos}{Vector of cluster centroids' x positions (i.e. where you
26 | want each cell type to appear in the plot)}
27 | 
28 | \item{cluster_ypos}{Vector of cluster centroids' y positions}
29 | }
30 | \value{
31 | A list with two elements: \enumerate{ \item clustpos: a matrix of
32 |   cluster centroids * x,y positions in the flightpath plot \item cellpos: A
33 |   matrix of cells * x,y positions in the flightpath plot }
34 | }
35 | \description{
36 | Arrays cells in 2d space based on their probability of belonging to a given
37 | cluster.
38 | }
39 | \examples{
40 | data("mini_nsclc")
41 | unsup <- insitutype(
42 |  x = mini_nsclc$counts,
43 |  neg = Matrix::rowMeans(mini_nsclc$neg),
44 |  assay_type = "RNA",
45 |  n_clusts = 8,
46 |  n_phase1 = 200,
47 |  n_phase2 = 500,
48 |  n_phase3 = 2000,
49 |  n_starts = 1,
50 |  max_iters = 5
51 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
52 | flightpath_layout(logliks = unsup$logliks, profiles = unsup$profiles)
53 | }
54 | 


--------------------------------------------------------------------------------
/man/flightpath_plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/flightpath_layout.R
 3 | \name{flightpath_plot}
 4 | \alias{flightpath_plot}
 5 | \title{Plot flightpath results}
 6 | \usage{
 7 | flightpath_plot(
 8 |   flightpath_result = NULL,
 9 |   insitutype_result = NULL,
10 |   col = NULL,
11 |   showclusterconfidence = TRUE
12 | )
13 | }
14 | \arguments{
15 | \item{flightpath_result}{The list output by the flightpath_layout function.
16 | Two elements: clustpos, cellpos. Must provide either this or
17 | insitutype_result.}
18 | 
19 | \item{insitutype_result}{The list output by insitutype or insitutypeML. Must
20 | provide either this or flightpath_result.}
21 | 
22 | \item{col}{Optional, a vector of cell colors, with length equal to the number
23 | of individual cells.}
24 | 
25 | \item{showclusterconfidence}{Logical, for whether to label clusters with the
26 | average posterior probability of the cells within them. Gives a readout of
27 | how distinct a cluster is from the others.}
28 | }
29 | \value{
30 | a ggplot object
31 | }
32 | \description{
33 | Plot flightpath results
34 | }
35 | \examples{
36 | data("mini_nsclc")
37 | unsup <- insitutype(
38 |  x = mini_nsclc$counts,
39 |  neg = Matrix::rowMeans(mini_nsclc$neg),
40 |  n_clusts = 8,
41 |  n_phase1 = 200,
42 |  n_phase2 = 500,
43 |  n_phase3 = 2000,
44 |  n_starts = 1,
45 |  max_iters = 5,
46 |  assay_type="RNA"
47 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
48 | flightpath_plot(insitutype_result = unsup)
49 | }
50 | 


--------------------------------------------------------------------------------
/man/gen_profiles_protein_annotation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_profiles_protein.R
 3 | \name{gen_profiles_protein_annotation}
 4 | \alias{gen_profiles_protein_annotation}
 5 | \title{Generate the mean reference profile and its SD reference profile from an annotation file
 6 | This function is only for protein data set with known anchor cells and their cell types}
 7 | \usage{
 8 | gen_profiles_protein_annotation(exp.mat, anno)
 9 | }
10 | \arguments{
11 | \item{exp.mat}{a matrix of raw protein expression data. cells are in rows and proteins are in columns}
12 | 
13 | \item{anno}{a data frame or matrix of cell types for anchor cells or manually annotated cell typing information for some cells. Should include cell_ID and celltype at least.}
14 | }
15 | \value{
16 | A list, with the following elements:
17 | \enumerate{
18 | \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins * cell types
19 | \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins * cell types
20 | \item anchors: a vector giving "anchor" cell types. Vector elements will be mainly NA's (for non-anchored cells)
21 | }
22 | }
23 | \description{
24 | Generate the mean reference profile and its SD reference profile from an annotation file.
25 | This function is only for protein data sets with known anchor cells and their cell types.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/gen_profiles_protein_expression.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gen_profiles_protein.R
 3 | \name{gen_profiles_protein_expression}
 4 | \alias{gen_profiles_protein_expression}
 5 | \title{Generate the mean reference profile and its SD reference profile based on the data itself
 6 | This function is based on signature matrix included in CELESTA package 
 7 | First, we rebuild a nested cell typing lists based on the 2-D signature matrix
 8 | Second, we identify anchor cells ranked by their expression level for each cell type's protein marker
 9 | Third, we estimate averaged expression level and SDs for proteins and cell types using the anchors}
10 | \usage{
11 | gen_profiles_protein_expression(
12 |   exp.mat,
13 |   sig_mat = NULL,
14 |   cutoff = 0.9,
15 |   min.num.cells = 30,
16 |   keep_marker_proteins = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{exp.mat}{a matrix of raw protein expression data. cells are in rows and proteins are in columns}
21 | 
22 | \item{sig_mat}{a signature matrix of cell types. cell types x protein markers}
23 | 
24 | \item{cutoff}{a cutoff of quantile. e.g) cutoff=0.9 means that top 90 percentiles of cells are called anchors for the protein expression}
25 | 
26 | \item{min.num.cells}{the minimum number of cells per cell type required to estimate its mean or SDs. Default value is 30.}
27 | 
28 | \item{keep_marker_proteins}{whether only marker proteins from the signature matrix are kept. Default value is FALSE, which returns all proteins included in the data}
29 | }
30 | \value{
31 | A list, with the following elements:
32 | \enumerate{
33 | \item mean.ref.profile: a matrix of cluster-specific expression profiles. proteins x cell types
34 | \item SDs.ref.profile: a matrix of standard deviation profiles of pre-defined clusters. proteins x cell types
35 | \item anchors: a vector giving "anchor" cell types. Vector elements will be mainly NA's (for non-anchored cells)
36 | }
37 | }
38 | \description{
39 | Generate the mean reference profile and its SD reference profile based on the data itself.
40 | This function is based on the signature matrix included in the CELESTA package.
41 | First, we rebuild a nested cell typing list based on the 2-D signature matrix.
42 | Second, we identify anchor cells ranked by their expression level for each cell type's protein marker.
43 | Third, we estimate the average expression level and SDs for proteins and cell types using the anchors.
44 | }
45 | \examples{
46 | data("tonsil_protein")
47 | data("human_signature")
48 | data("mouse_signature")
49 | references <- gen_profiles_protein_expression(
50 |  exp.mat=tonsil_protein$counts,
51 |  sig_mat=NULL)
52 | }
53 | 


--------------------------------------------------------------------------------
/man/geoSketch.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geoSketch.R
 3 | \name{geoSketch}
 4 | \alias{geoSketch}
 5 | \title{Function for creating a biased sample of a given dataset with the aim of retaining cells with unique expression vectors}
 6 | \usage{
 7 | geoSketch(
 8 |   X,
 9 |   N,
10 |   alpha = 0.1,
11 |   max_iter = 200,
12 |   returnBins = FALSE,
13 |   minCellsPerBin = 1
14 | )
15 | }
16 | \arguments{
17 | \item{X}{feature matrix with cellIDs as rows and featureIDs as columns (can be counts, normalized expression, PCA, UMAP, etc.)}
18 | 
19 | \item{N}{desired sample size}
20 | 
21 | \item{alpha}{defines the acceptable minimum number of bins to sample from as `(1-alpha)*N`}
22 | 
23 | \item{max_iter}{maximum number of iterations used to achieve an acceptable minimum number of bins}
24 | 
25 | \item{returnBins}{determines whether or not to pass back bin labels for each cell}
26 | 
27 | \item{minCellsPerBin}{the minimum number of cells required for a bin to be considered for sampling}
28 | }
29 | \value{
30 | sampledCells, a vector of cellIDs sampled using the geometric sketching method
31 | 
32 | Plaid, a named vector of binIDs where names correspond to cellIDs
33 | }
34 | \description{
35 | Function for creating a biased sample of a given dataset with the aim of retaining cells with unique expression vectors
36 | }
37 | \examples{
38 | data("mini_nsclc")
39 | geoSketch(mini_nsclc$counts, 200)
40 | }
41 | 


--------------------------------------------------------------------------------
/man/geoSketch_get_plaid.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geoSketch.R
 3 | \name{geoSketch_get_plaid}
 4 | \alias{geoSketch_get_plaid}
 5 | \title{Bin cells into "plaids"}
 6 | \usage{
 7 | geoSketch_get_plaid(
 8 |   X,
 9 |   N,
10 |   alpha = 0.1,
11 |   max_iter = 200,
12 |   returnBins = FALSE,
13 |   minCellsPerBin = 1
14 | )
15 | }
16 | \arguments{
17 | \item{X}{feature matrix with cellIDs as rows and featureIDs as columns (can
18 | be counts, normalized expression, PCA, UMAP, etc.)}
19 | 
20 | \item{N}{desired sample size}
21 | 
22 | \item{alpha}{defines the acceptable minimum number of bins to sample from as
23 | `(1-alpha)*N`}
24 | 
25 | \item{max_iter}{maximum number of iterations used to achieve an acceptable
26 | minimum number of bins}
27 | 
28 | \item{returnBins}{determines whether or not to pass back bin labels for each
29 | cell}
30 | 
31 | \item{minCellsPerBin}{the minimum number of cells required for a bin to be
32 | considered for sampling}
33 | }
34 | \value{
35 | Plaid, a named vector of binIDs where names correspond to cellIDs
36 | }
37 | \description{
38 | Assign cells to "plaids", very rough clusters.
39 | }
40 | \examples{
41 | data("mini_nsclc")
42 | geoSketch_get_plaid(mini_nsclc$counts, 100)
43 | }
44 | 


--------------------------------------------------------------------------------
/man/geoSketch_sample_from_plaids.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geoSketch.R
 3 | \name{geoSketch_sample_from_plaids}
 4 | \alias{geoSketch_sample_from_plaids}
 5 | \title{Subsample from plaids}
 6 | \usage{
 7 | geoSketch_sample_from_plaids(Plaid, N)
 8 | }
 9 | \arguments{
10 | \item{Plaid}{Vector of cells' plaid IDs}
11 | 
12 | \item{N}{desired sample size}
13 | }
14 | \value{
15 | Plaid, a named vector of binIDs where names correspond to cellIDs
16 | 
17 | sampledCells, a vector of cellIDs sampled using the geometric sketching method
18 | }
19 | \description{
20 | Sample cells, trying to give each plaid equal representation
21 | }
22 | \examples{
23 | data("mini_nsclc")
24 | plaids <- geoSketch_get_plaid(mini_nsclc$counts, 100)
25 | geoSketch_sample_from_plaids(plaids, 5)
26 | }
27 | 


--------------------------------------------------------------------------------
/man/getMeanClusterConfidence.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/flightpath_layout.R
 3 | \name{getMeanClusterConfidence}
 4 | \alias{getMeanClusterConfidence}
 5 | \title{Summarize clusters' mean confidence}
 6 | \usage{
 7 | getMeanClusterConfidence(probs)
 8 | }
 9 | \arguments{
10 | \item{probs}{Matrix of probabilities}
11 | }
12 | \value{
13 | a vector of mean confidences, with values of 1 corresponding to clusters with only prob == 1
14 | }
15 | \description{
16 | Calculate the mean confidence of the cell calls from each cluster
17 | }
18 | \examples{
19 | data("mini_nsclc")
20 | probs <- sapply(rownames(mini_nsclc$counts), function(x) {a = runif(10); a/sum(a)})
21 | dimnames(probs)[[1]] <- letters[1:10]
22 | probs <- t(probs)
23 | getMeanClusterConfidence(probs)
24 | }
25 | 


--------------------------------------------------------------------------------
/man/getProteinParameters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getProfiles.R
 3 | \name{getProteinParameters}
 4 | \alias{getProteinParameters}
 5 | \title{Extract mean profiles and SDs of protein data}
 6 | \usage{
 7 | getProteinParameters(x, clust)
 8 | }
 9 | \arguments{
10 | \item{x}{Expression matrix, cells * proteins.}
11 | 
12 | \item{clust}{Vector of cluster assignments, or a matrix of probabilities
13 | of cells (rows) belonging to clusters (columns).}
14 | 
15 | \item{neg}{Vector of mean background counts}
16 | }
17 | \value{
18 | List with two elements: "profiles", a matrix of protein x cell type expression profiles, and "sds", a matrix of SD's.
19 | }
20 | \description{
21 | Given cell assignments and count data, estimate the mean
22 |  profile of each cluster.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/getRNAprofiles.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getProfiles.R
 3 | \name{getRNAprofiles}
 4 | \alias{getRNAprofiles}
 5 | \title{Extract mean background-subtracted profiles of RNA data}
 6 | \usage{
 7 | getRNAprofiles(x, neg, clust)
 8 | }
 9 | \arguments{
10 | \item{x}{Counts matrix, cells * genes.}
11 | 
12 | \item{neg}{Vector of mean background counts (or a single value applied to all cells)}
13 | 
14 | \item{clust}{Vector of cluster assignments, or a matrix of probabilities
15 | of cells (rows) belonging to clusters (columns).}
16 | }
17 | \value{
18 | A matrix of gene x cell type expression profiles.
19 | }
20 | \description{
21 | Given cell assignments and count data, estimate the mean
22 |  profile of each cluster.
23 | }
24 | 


--------------------------------------------------------------------------------
/man/getSpatialContext.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getSpatialContext.R
 3 | \name{getSpatialContext}
 4 | \alias{getSpatialContext}
 5 | \title{Get the neighborhood expression profile around all cells}
 6 | \usage{
 7 | getSpatialContext(
 8 |   counts,
 9 |   xy,
10 |   tissue = NULL,
11 |   N = 50,
12 |   rad = NULL,
13 |   dim_reduce_to = NULL
14 | )
15 | }
16 | \arguments{
17 | \item{counts}{Counts matrix}
18 | 
19 | \item{xy}{2-column matrix of cells' xy positions}
20 | 
21 | \item{tissue}{vector of tissue IDs. Used to ensure cells from different tissues are never called neighbors}
22 | 
23 | \item{N}{number of neighbors to use. Specify this or \code{rad}.}
24 | 
25 | \item{rad}{radius to use to define neighbors. Specify this or \code{N}.}
26 | 
27 | \item{dim_reduce_to}{If entered, the neighborhood matrix will be reduced to this many PCs}
28 | }
29 | \value{
30 | A matrix of neighborhood expression, potentially by gene, or else by PCs if \code{dim_reduce_to} was set.
31 | }
32 | \description{
33 | Get the neighborhood expression profile around all cells
34 | }
35 | 


--------------------------------------------------------------------------------
/man/get_anchor_stats.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/find_anchor_cells.R
 3 | \name{get_anchor_stats}
 4 | \alias{get_anchor_stats}
 5 | \title{Get anchor stats}
 6 | \usage{
 7 | get_anchor_stats(
 8 |   counts,
 9 |   neg = NULL,
10 |   bg = NULL,
11 |   align_genes = TRUE,
12 |   profiles,
13 |   sds = NULL,
14 |   size = 10,
15 |   assay_type = c("rna", "protein"),
16 |   min_cosine = 0.3
17 | )
18 | }
19 | \arguments{
20 | \item{counts}{Counts matrix, cells * genes.}
21 | 
22 | \item{neg}{Vector of mean negprobe counts per cell}
23 | 
24 | \item{bg}{Expected background}
25 | 
26 | \item{align_genes}{Logical, for whether to align the columns of the counts matrix and the rows of
27 | the profiles matrix based on their names.}
28 | 
29 | \item{profiles}{Matrix of reference profiles holding mean expression of genes x cell types. 
30 | Input linear-scale expression, with genes in rows and cell types in columns.}
31 | 
32 | \item{sds}{Matrix of reference profiles holding SDs expression of genes x cell types. 
33 | Input linear-scale expression, with genes in rows and cell types in columns. Only for assay_type of protein}
34 | 
35 | \item{size}{Negative binomial size parameter to be used in likelihood calculation.}
36 | 
37 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
38 | 
39 | \item{min_cosine}{Cells must have at least this much cosine similarity to a fixed profile to be used as an anchor.}
40 | }
41 | \value{
42 | A list with two elements: cos, the matrix of cosine distances;
43 |  and llr, the matrix of log likelihood ratios of each cell under each cell type vs. the 2nd best cell type.
44 | }
45 | \description{
46 | Compute the statistics used in finding anchor cells.
47 | Often the anchor cell selection process will involve some trial-and-error. 
48 | This function performs the computationally-expensive steps that only need to 
49 | happen once.
50 | }
51 | \examples{
52 | data("ioprofiles")
53 | data("mini_nsclc")
54 | get_anchor_stats(counts = mini_nsclc$counts,
55 |                  neg = Matrix::rowMeans(mini_nsclc$neg),
56 |                  profiles = ioprofiles,
57 |                  sds=NULL, 
58 |                  assay_type = "RNA")
59 | }
60 | 


--------------------------------------------------------------------------------
/man/get_neighborhood_expression.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getSpatialContext.R
 3 | \name{get_neighborhood_expression}
 4 | \alias{get_neighborhood_expression}
 5 | \title{Calculate neighborhood expression}
 6 | \usage{
 7 | get_neighborhood_expression(counts, neighbors)
 8 | }
 9 | \arguments{
10 | \item{counts}{Single cell expression matrix}
11 | 
12 | \item{neighbors}{A neighbors adjacency matrix}
13 | }
14 | \value{
15 | A matrix in the same dimensions as \code{counts}, giving the expression profile of each cell's neighborhood.
16 | }
17 | \description{
18 | Calculates the expression profile of each cell's neighborhood
19 | }
20 | 


--------------------------------------------------------------------------------
/man/human_signature.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{human_signature}
 5 | \alias{human_signature}
 6 | \title{Example human marker proteins 
 7 |  For inputting into \code{gen_profiles_protein_expression()}}
 8 | \format{
 9 | A matrix of 11844 cells and 2 columns
10 | }
11 | \usage{
12 | human_signature
13 | }
14 | \description{
15 | data frame
16 | }
17 | \keyword{datasets}
18 | 


--------------------------------------------------------------------------------
/man/insitutype.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/insitutype.R
  3 | \name{insitutype}
  4 | \alias{insitutype}
  5 | \alias{insitutype,ANY-method}
  6 | \alias{insitutype,SingleCellExperiment-method}
  7 | \title{Run insitutype.}
  8 | \usage{
  9 | insitutype(x, ...)
 10 | 
 11 | \S4method{insitutype}{ANY}(
 12 |   x,
 13 |   neg,
 14 |   assay_type = c("rna", "protein"),
 15 |   bg = NULL,
 16 |   anchors = NULL,
 17 |   cohort = NULL,
 18 |   n_clusts,
 19 |   reference_profiles = NULL,
 20 |   reference_sds = NULL,
 21 |   update_reference_profiles = TRUE,
 22 |   sketchingdata = NULL,
 23 |   align_genes = TRUE,
 24 |   nb_size = 10,
 25 |   init_clust = NULL,
 26 |   n_starts = 10,
 27 |   n_benchmark_cells = 10000,
 28 |   n_phase1 = 10000,
 29 |   n_phase2 = 20000,
 30 |   n_phase3 = 1e+05,
 31 |   n_chooseclusternumber = 2000,
 32 |   pct_drop = 1/10000,
 33 |   min_prob_increase = 0.05,
 34 |   max_iters = 40,
 35 |   n_anchor_cells = 2000,
 36 |   min_anchor_cosine = 0.3,
 37 |   min_anchor_llr = 0.03,
 38 |   insufficient_anchors_thresh = 20,
 39 |   refinement = FALSE,
 40 |   rescale = TRUE,
 41 |   refit = TRUE
 42 | )
 43 | 
 44 | \S4method{insitutype}{SingleCellExperiment}(x, ..., assay.type = "counts")
 45 | }
 46 | \arguments{
 47 | \item{x}{Counts matrix (or dgCMatrix), cells * genes.
 48 | 
 49 |   Alternatively, a \linkS4class{SingleCellExperiment} object containing such
 50 |   a matrix.}
 51 | 
 52 | \item{...}{For the \linkS4class{SingleCellExperiment} method, additional
 53 | arguments to pass to the ANY method.}
 54 | 
 55 | \item{neg}{Vector of mean negprobe counts per cell}
 56 | 
 57 | \item{assay_type}{Assay type of rna, protein (default = "rna")}
 58 | 
 59 | \item{bg}{Expected background}
 60 | 
 61 | \item{anchors}{Vector giving "anchor" cell types, for use in semi-supervised
 62 | clustering. Vector elements will be mainly NA's (for non-anchored cells)
 63 | and cell type names for cells to be held constant throughout iterations.}
 64 | 
 65 | \item{cohort}{Vector of cells' cohort memberships}
 66 | 
 67 | \item{n_clusts}{Number of clusters, in addition to any pre-specified cell
 68 | types. Enter 0 to run purely supervised cell typing from fixed profiles.
 69 | Enter a range of integers to automatically select the optimal number of
 70 | clusters.}
 71 | 
 72 | \item{reference_profiles}{Matrix of mean expression profiles of pre-defined
 73 | clusters, e.g. from previous scRNA-seq. These profiles will not be updated
 74 | by the EM algorithm. Columns must all be included in the init_clust
 75 | variable.}
 76 | 
 77 | \item{reference_sds}{Matrix of standard deviation profiles of pre-defined
 78 | clusters. These SD profiles also will not be updated by the EM algorithm. 
 79 | Columns must all be included in the init_clust variable. This parameter should
 80 | be defined if assay_type is protein. Default is NULL.}
 81 | 
 82 | \item{update_reference_profiles}{Logical, for whether to use the data to
 83 | update the reference profiles. Default and strong recommendation is TRUE.
 84 | (However, if the reference profiles are from the same platform as the
 85 | study, then FALSE could be better.)}
 86 | 
 87 | \item{sketchingdata}{Optional matrix of data for use in non-random sampling
 88 | via "sketching". If not provided, then the data's first 20 PCs will be
 89 | used.}
 90 | 
 91 | \item{align_genes}{Logical, for whether to align the counts matrix and the
 92 | reference_profiles by gene ID.}
 93 | 
 94 | \item{nb_size}{The size parameter to assume for the NB distribution. This 
 95 | parameter is only for RNA.}
 96 | 
 97 | \item{init_clust}{Vector of initial cluster assignments. If NULL, initial
 98 | assignments will be automatically inferred.}
 99 | 
100 | \item{n_starts}{Number of random starts to run in phase 1.}
101 | 
102 | \item{n_benchmark_cells}{the number of cells for benchmarking}
103 | 
104 | \item{n_phase1}{Subsample size for phase 1 (random starts)}
105 | 
106 | \item{n_phase2}{Subsample size for phase 2 (refining in a larger subset)}
107 | 
108 | \item{n_phase3}{Subsample size for phase 3 (getting final solution in a very
109 | large subset)}
110 | 
111 | \item{n_chooseclusternumber}{Subsample size for choosing an optimal number of
112 | clusters}
113 | 
114 | \item{pct_drop}{the decrease in percentage of cell types with a valid
115 | switchover to another cell type compared to the last iteration. Default
116 | value: 1/10000. A valid switchover is only applicable when a cell has
117 | changed the assigned cell type with its highest cell type probability
118 | increased by min_prob_increase.}
119 | 
120 | \item{min_prob_increase}{the threshold of probability used to determine a
121 | valid cell type switchover}
122 | 
123 | \item{max_iters}{Maximum number of iterations.}
124 | 
125 | \item{n_anchor_cells}{For semi-supervised learning. Maximum number of anchor
126 | cells to use for each cell type.}
127 | 
128 | \item{min_anchor_cosine}{For semi-supervised learning. Cells must have at
129 | least this much cosine similarity to a fixed profile to be used as an
130 | anchor.}
131 | 
132 | \item{min_anchor_llr}{For semi-supervised learning. Cells must have
133 | (log-likelihood ratio / totalcounts) above this threshold to be used as an
134 | anchor}
135 | 
136 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than
137 | this many anchors after anchor selection will be discarded.}
138 | 
139 | \item{refinement}{Logical, flag for further anchor refinement, used when update_reference_profiles = TRUE (default = FALSE)}
140 | 
141 | \item{rescale}{Logical, flag for platform effect correction, used when update_reference_profiles = TRUE (default = TRUE)}
142 | 
143 | \item{refit}{Logical, flag for fitting reference profiles to anchors, used when update_reference_profiles = TRUE (default = TRUE)}
144 | 
145 | \item{assay.type}{A string specifying which assay values to use.}
146 | }
147 | \value{
148 | A list, with the following elements: \enumerate{ \item clust: a
149 |   vector giving cells' cluster assignments \item prob: a vector giving the
150 |   confidence in each cell's cluster \item logliks: Matrix of cells'
151 |   log-likelihoods under each cluster. Cells in rows, clusters in columns.
152 |   \item profiles: a matrix of cluster-specific expression profiles \item
153 |   anchors: from semi-supervised clustering: a vector giving the identities
154 |   and cell types of anchor cells }
155 | }
156 | \description{
157 | A wrapper for nbclust, to manage subsampling and multiple random starts.
158 | }
159 | \examples{
160 | data("mini_nsclc")
161 | unsup <- insitutype(
162 |  x = mini_nsclc$counts,
163 |  neg = Matrix::rowMeans(mini_nsclc$neg),
164 |  assay_type = "rna",
165 |  n_clusts = 8,
166 |  n_phase1 = 200,
167 |  n_phase2 = 500,
168 |  n_phase3 = 2000,
169 |  n_starts = 1,
170 |  max_iters = 5
171 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
172 | table(unsup$clust)
173 | }
174 | 


--------------------------------------------------------------------------------
/man/insitutypeML.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/insitutypeML.R
 3 | \name{insitutypeML}
 4 | \alias{insitutypeML}
 5 | \alias{insitutypeML,ANY-method}
 6 | \alias{insitutypeML,SingleCellExperiment-method}
 7 | \title{Classify cells based on reference profiles}
 8 | \usage{
 9 | insitutypeML(x, ...)
10 | 
11 | \S4method{insitutypeML}{ANY}(
12 |   x,
13 |   neg = NULL,
14 |   bg = NULL,
15 |   cohort = NULL,
16 |   reference_profiles,
17 |   reference_sds = NULL,
18 |   nb_size = 10,
19 |   assay_type = c("rna", "protein"),
20 |   align_genes = TRUE
21 | )
22 | 
23 | \S4method{insitutypeML}{SingleCellExperiment}(x, ..., assay.type = "counts")
24 | }
25 | \arguments{
26 | \item{x}{Counts matrix (or dgCMatrix), cells * genes.
27 | 
28 |   Alternatively, a \linkS4class{SingleCellExperiment} object containing such
29 |   a matrix.}
30 | 
31 | \item{...}{For the \linkS4class{SingleCellExperiment} method, additional
32 | arguments to pass to the ANY method.}
33 | 
34 | \item{neg}{Vector of mean negprobe counts per cell. Optional (default NULL).}
35 | 
36 | \item{bg}{Expected background}
37 | 
38 | \item{cohort}{Vector of cells' cohort memberships}
39 | 
40 | \item{reference_profiles}{Matrix of expression profiles of pre-defined clusters,
41 | e.g. from previous scRNA-seq. These profiles will not be updated by the EM algorithm.
42 | Colnames must all be included in the init_clust variable.}
43 | 
44 | \item{reference_sds}{Matrix of standard deviation profiles of pre-defined
45 | clusters. These SD profiles also will not be updated by the EM algorithm. 
46 | Columns must all be included in the init_clust variable. This parameter should
47 | be defined if assay_type is protein. Default is NULL.}
48 | 
49 | \item{nb_size}{The size parameter to assume for the NB distribution.}
50 | 
51 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
52 | 
53 | \item{align_genes}{Logical, for whether to align the counts matrix and the reference_profiles by gene ID.}
54 | 
55 | \item{assay.type}{A string specifying which assay values to use.}
56 | }
57 | \value{
58 | A list, with the following elements:
59 | \enumerate{
60 | \item clust: a vector giving cells' cluster assignments
61 | \item prob: a vector giving the confidence in each cell's cluster
62 | \item profiles: Matrix of clusters' mean background-subtracted profiles
63 | \item logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns.
64 | }
65 | }
66 | \description{
67 | Supervised classification of cells. Each cell is assigned to the cell type 
68 |  under which its observed expression profile is most likely.
69 | }
70 | \examples{
71 | data("mini_nsclc")
72 | data("ioprofiles")
73 | sup <- insitutypeML(
74 |  x = mini_nsclc$counts,
75 |  neg = Matrix::rowMeans(mini_nsclc$neg),
76 |  reference_profiles = ioprofiles,
77 |  assay_type = "RNA")
78 | table(sup$clust)
79 | }
80 | 


--------------------------------------------------------------------------------
/man/iocolors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{iocolors}
 5 | \alias{iocolors}
 6 | \title{Default colors for the cell types in the ioprofiles matrix}
 7 | \format{
 8 | A named vector
 9 | }
10 | \usage{
11 | iocolors
12 | }
13 | \description{
14 | A named vector of colors, giving colors for the cell types of the ioprofiles
15 |  matrix.
16 | }
17 | \keyword{datasets}
18 | 


--------------------------------------------------------------------------------
/man/ioprofiles.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{ioprofiles}
 5 | \alias{ioprofiles}
 6 | \title{Matrix of immune cell profiles}
 7 | \format{
 8 | A matrix of 27161 genes x 16 cell types.
 9 | }
10 | \usage{
11 | ioprofiles
12 | }
13 | \description{
14 | A matrix of gene * cell type expected expression values
15 | }
16 | \keyword{datasets}
17 | 


--------------------------------------------------------------------------------
/man/ismax.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/nbclust.R
 3 | \name{ismax}
 4 | \alias{ismax}
 5 | \title{For a numeric object, return a logical object of whether each element is the max or not.}
 6 | \usage{
 7 | ismax(x)
 8 | }
 9 | \arguments{
10 | \item{x}{a vector of values}
11 | }
12 | \value{
13 | a vector of logical values
14 | }
15 | \description{
16 | For a numeric object, return a logical object of whether each element is the max or not.
17 | }
18 | \examples{
19 | ismax(c(3, 5, 5, 2))
20 | }
21 | 


--------------------------------------------------------------------------------
/man/lldist.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/nbclust.R
 3 | \name{lldist}
 4 | \alias{lldist}
 5 | \title{Calculate the likelihood of the expression mat
 6 |   using the reference profiles of x}
 7 | \usage{
 8 | lldist(
 9 |   x,
10 |   xsd = NULL,
11 |   mat,
12 |   bg = 0.01,
13 |   size = 10,
14 |   digits = 2,
15 |   assay_type = c("rna", "protein")
16 | )
17 | }
18 | \arguments{
19 | \item{x}{a vector of a reference mean profile for the cell type}
20 | 
21 | \item{xsd}{a vector of a reference standard deviation profile for the cell type}
22 | 
23 | \item{mat}{a matrix of expression levels in all cells: for Protein data, we use raw data for calculating the scaling factor}
24 | 
25 | \item{bg}{background level (default: 0.01)}
26 | 
27 | \item{size}{the parameters for dnbinom function (default: 10)}
28 | 
29 | \item{digits}{the number of digits for rounding}
30 | 
31 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
32 | }
33 | \value{
34 | likelihood for profile
35 | 
36 | cells x profiles matrix of log likelihoods
37 | }
38 | \description{
39 | Calculate the likelihood of the expression mat
40 |   using the reference profiles of x
41 | }
42 | \examples{
43 | data("mini_nsclc")
44 | data("ioprofiles")
45 | bg <- Matrix::rowMeans(mini_nsclc$neg)
46 | genes <- intersect(dimnames(mini_nsclc$counts)[[2]], dimnames(ioprofiles)[[1]])
47 | mat <- mini_nsclc$counts[, genes]
48 | x <- ioprofiles[genes, ]
49 | lldist(x = x, mat = mini_nsclc$counts, bg = bg, assay_type="RNA")
50 | 
51 | }
52 | 


--------------------------------------------------------------------------------
/man/lls_protein.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{lls_protein}
 4 | \alias{lls_protein}
 5 | \title{sum from Gaussian density function}
 6 | \usage{
 7 | lls_protein(mat, bgsub, x, xsd)
 8 | }
 9 | \arguments{
10 | \item{mat}{dgCMatrix expression matrix}
11 | 
12 | \item{bgsub}{vector of background expression per cell}
13 | 
14 | \item{x}{numeric expression for reference profiles}
15 | 
16 | \item{xsd}{numeric expression for reference SD profiles}
17 | }
18 | \value{
19 | rowSums for matrix of densities
20 | }
21 | \description{
22 | Probability density function of the Gaussian distribution (written in C++)
23 | }
24 | 


--------------------------------------------------------------------------------
/man/lls_rna.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/RcppExports.R
 3 | \name{lls_rna}
 4 | \alias{lls_rna}
 5 | \title{sum from negative binomial density function}
 6 | \usage{
 7 | lls_rna(mat, bgsub, x, bg, size_dnb)
 8 | }
 9 | \arguments{
10 | \item{mat}{dgCMatrix expression counts}
11 | 
12 | \item{bgsub}{vector of background expression per cell}
13 | 
14 | \item{x}{numeric expression for reference profiles}
15 | 
16 | \item{bg}{numeric background level}
17 | 
18 | \item{size_dnb}{int Dispersion parameter}
19 | }
20 | \value{
21 | rowSums for matrix of densities
22 | }
23 | \description{
24 | Probability density function of the negative binomial distribution (written in C++)
25 | }
26 | 


--------------------------------------------------------------------------------
/man/logliks2probs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/refineClusters.R
 3 | \name{logliks2probs}
 4 | \alias{logliks2probs}
 5 | \title{convert logliks to probabilities}
 6 | \usage{
 7 | logliks2probs(logliks)
 8 | }
 9 | \arguments{
10 | \item{logliks}{Matrix of loglikelihoods, as output by insitutype. Cells in rows, clusters in columns.}
11 | }
12 | \value{
13 | A matrix of probabilities, in the same dimensions as logliks.
14 | }
15 | \description{
16 | From cell x cluster log-likelihoods, calculate cell x cluster probabilities
17 | }
18 | \examples{
19 | data("mini_nsclc")
20 | unsup <- insitutype(
21 |  x = mini_nsclc$counts,
22 |  neg = Matrix::rowMeans(mini_nsclc$neg),
23 |  n_clusts = 8,
24 |  n_phase1 = 200,
25 |  n_phase2 = 500,
26 |  n_phase3 = 2000,
27 |  n_starts = 1,
28 |  max_iters = 5,
29 |  assay_type="RNA"
30 | ) # choosing inadvisably low numbers to speed the vignette; using the defaults is recommended.
31 | logliks2probs(unsup$logliks)
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/man/mini_nsclc.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{mini_nsclc}
 5 | \alias{mini_nsclc}
 6 | \title{Small example SMI data from a NSCLC tumor}
 7 | \format{
 8 | A list with the following elements:
 9 |  \itemize{
10 |  \item counts A matrix of raw counts, with cells in rows and genes in columns
11 |  \item neg A matrix of negprobe counts, with cells in rows and negprobes in columns
12 |  \item x x positions
13 |  \item y y positions
14 |  \item umap umap projection
15 |  }
16 | }
17 | \usage{
18 | mini_nsclc
19 | }
20 | \description{
21 | A 2000-cell excerpt from a 1000-plex SMI study of a NSCLC tumor.
22 | }
23 | \keyword{datasets}
24 | 


--------------------------------------------------------------------------------
/man/mouse_signature.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{mouse_signature}
 5 | \alias{mouse_signature}
 6 | \title{Example mouse marker proteins 
 7 |  For inputting into \code{gen_profiles_protein_expression()}}
 8 | \format{
 9 | A matrix of 11844 cells and 2 columns
10 | }
11 | \usage{
12 | mouse_signature
13 | }
14 | \description{
15 | data frame
16 | }
17 | \keyword{datasets}
18 | 


--------------------------------------------------------------------------------
/man/nbclust.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/nbclust.R
 3 | \name{nbclust}
 4 | \alias{nbclust}
 5 | \title{Cluster via EM algorithm based on cell logliks}
 6 | \usage{
 7 | nbclust(
 8 |   counts,
 9 |   neg,
10 |   assay_type = c("rna", "protein"),
11 |   bg = NULL,
12 |   fixed_profiles = NULL,
13 |   fixed_sds = NULL,
14 |   init_profiles = NULL,
15 |   init_sds = NULL,
16 |   init_clust = NULL,
17 |   nb_size = 10,
18 |   cohort = NULL,
19 |   pct_drop = 1/10000,
20 |   min_prob_increase = 0.05,
21 |   max_iters = 40,
22 |   logresults = FALSE
23 | )
24 | }
25 | \arguments{
26 | \item{counts}{Counts matrix, cells * genes.}
27 | 
28 | \item{neg}{Vector of mean negative probe counts per cell.}
29 | 
30 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
31 | 
32 | \item{bg}{Expected background}
33 | 
34 | \item{fixed_profiles}{Matrix of mean expression profiles to hold unchanged throughout iterations. genes * cell types}
35 | 
36 | \item{fixed_sds}{Matrix of standard deviation profiles of pre-defined
37 | clusters to hold unchanged throughout iterations. 
38 | Columns must all be included in the init_clust variable. This parameter is 
39 | only for assay_type of protein.}
40 | 
41 | \item{init_profiles}{Matrix of cluster mean profiles under which to begin iterations.
42 | If NULL, initial assignments will be automatically inferred, using init_clust 
43 | if available, and using random clusters if not.}
44 | 
45 | \item{init_sds}{Matrix of cluster SDs profiles under which to begin iterations.
46 | If NULL, initial assignments will be automatically inferred, using init_clust 
47 | if available, and using random clusters if not. Only for assay_type of protein}
48 | 
49 | \item{init_clust}{Vector of initial cluster assignments.
50 | If NULL, initial assignments will be automatically inferred.}
51 | 
52 | \item{nb_size}{The size parameter to assume for the NB distribution. Only for assay_type of RNA.}
53 | 
54 | \item{cohort}{Vector of cells' "cohort" assignments, used to assess frequencies in each cluster.}
55 | 
56 | \item{pct_drop}{the decrease in percentage of cell types with a valid switchover to 
57 | another cell type compared to the last iteration. Default value: 1/10000. A valid 
58 | switchover is only applicable when a cell has changed the assigned cell type with its
59 | highest cell type probability increased by min_prob_increase.}
60 | 
61 | \item{min_prob_increase}{the threshold of probability used to determine a valid cell 
62 | type switchover}
63 | 
64 | \item{max_iters}{Maximum number of iterations}
65 | 
66 | \item{logresults}{Logical, for whether to populate clusterlog in the returned list.}
69 | }
70 | \value{
71 | A list, with the following elements:
72 | \enumerate{
73 | \item probs: a matrix of probabilities of all cells (rows) belonging to all clusters (columns)
74 | \item profiles: a matrix of cluster-specific expression profiles
75 | }
76 | }
77 | \description{
78 | Cluster single cell gene expression data using an EM algorithm.
79 | }
80 | \examples{
81 | data("ioprofiles")
82 | data("mini_nsclc")
83 | sharedgenes <- intersect(colnames(mini_nsclc$counts), rownames(ioprofiles))
84 | nbclust(counts = mini_nsclc$counts[, sharedgenes],
85 |        neg =  Matrix::rowMeans(mini_nsclc$neg), 
86 |        assay_type = "RNA",
87 |        bg = NULL,
88 |        fixed_profiles = ioprofiles[sharedgenes, 1:3],
89 |        init_profiles = NULL, 
90 |        init_clust = rep(c("a", "b"), 
91 |        nrow(mini_nsclc$counts) / 2),
92 |        nb_size = 10,
93 |        cohort = rep("a", nrow(mini_nsclc$counts)),
94 |        pct_drop = 1/10000,
95 |        min_prob_increase = 0.05, 
96 |        max_iters = 3, 
97 |        logresults = FALSE)
98 | }
99 | 


--------------------------------------------------------------------------------
/man/nearestNeighborGraph.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getSpatialContext.R
 3 | \name{nearestNeighborGraph}
 4 | \alias{nearestNeighborGraph}
 5 | \title{Create spatial network from N nearest neighbors}
 6 | \usage{
 7 | nearestNeighborGraph(x, y, N, subset = 1)
 8 | }
 9 | \arguments{
10 | \item{x}{spatial coordinate}
11 | 
12 | \item{y}{spatial coordinate}
13 | 
14 | \item{N}{number of nearest neighbors}
15 | 
16 | \item{subset}{same length as x,y (see Details)}
17 | }
18 | \value{
19 | sparse adjacency matrix with distances
20 | }
21 | \description{
22 | For each cell identify \code{N} nearest neighbors in Euclidean space and
23 | create an edge between them in graph structure, optionally subset cells (see
24 | Details).
25 | }
26 | \details{
27 | Edges will only be created for cells that have the same \code{subset} value,
28 | usually the slide column id but could also be a slide plus FOV id to only
29 | create edges within an FOV.
30 | }
31 | 


--------------------------------------------------------------------------------
/man/neighbor_colMeans.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getSpatialContext.R
 3 | \name{neighbor_colMeans}
 4 | \alias{neighbor_colMeans}
 5 | \title{for each cell, get the colMeans of x over its neighbors:}
 6 | \usage{
 7 | neighbor_colMeans(x, neighbors)
 8 | }
 9 | \arguments{
10 | \item{x}{A matrix}
11 | 
12 | \item{neighbors}{A (probably sparse) adjacency matrix}
13 | }
14 | \description{
15 | for each cell, get the colMeans of x over its neighbors:
16 | }
17 | 


--------------------------------------------------------------------------------
/man/neighbor_colSums.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getSpatialContext.R
 3 | \name{neighbor_colSums}
 4 | \alias{neighbor_colSums}
 5 | \title{for each cell, get the colSums of x over its neighbors:}
 6 | \usage{
 7 | neighbor_colSums(x, neighbors)
 8 | }
 9 | \arguments{
10 | \item{x}{A matrix}
11 | 
12 | \item{neighbors}{A (probably sparse) adjacency matrix}
13 | }
14 | \description{
15 | for each cell, get the colSums of x over its neighbors:
16 | }
17 | 


--------------------------------------------------------------------------------
/man/numCores.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utilities.R
 3 | \name{numCores}
 4 | \alias{numCores}
 5 | \title{Get number of cores for parallelized operations}
 6 | \usage{
 7 | numCores(percentCores = 0.9, minNotUsedCores = 2)
 8 | }
 9 | \arguments{
10 | \item{percentCores}{percent of cores to use for parallelization [0-1]}
11 | 
12 | \item{minNotUsedCores}{minimum number of cores to leave for background processes}
13 | }
14 | \value{
15 | number of cores to use for mclapply
16 | }
17 | \description{
18 | Get number of cores for parallelized operations
19 | }
20 | 


--------------------------------------------------------------------------------
/man/prepDataForSketching.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/geoSketch.R
 3 | \name{prepDataForSketching}
 4 | \alias{prepDataForSketching}
 5 | \title{Prepare data for geoSketch}
 6 | \usage{
 7 | prepDataForSketching(counts, assay_type = c("rna", "protein"))
 8 | }
 9 | \arguments{
10 | \item{counts}{Counts matrix: cells x genes}
11 | 
12 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
13 | }
14 | \value{
15 | A matrix of data for geoSketch, with cells in rows and features in columns
16 | }
17 | \description{
18 | Process raw counts data for input into geoSketching.
19 | }
20 | \examples{
21 | data("mini_nsclc")
22 | prepDataForSketching(counts=mini_nsclc$counts, assay_type="RNA")
23 | }
24 | 


--------------------------------------------------------------------------------
/man/probs2logliks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/refineClusters.R
 3 | \name{probs2logliks}
 4 | \alias{probs2logliks}
 5 | \title{Get a logliks matrix from a probabilities matrix}
 6 | \usage{
 7 | probs2logliks(probs)
 8 | }
 9 | \arguments{
10 | \item{probs}{probability matrix}
11 | }
12 | \value{
13 | log-transformed matrix
14 | }
15 | \description{
16 | Get a logliks matrix from a probabilities matrix
17 | }
18 | \examples{
19 | a <- runif(10)
20 | probs2logliks(a/sum(a))
21 | }
22 | 


--------------------------------------------------------------------------------
/man/radiusBasedGraph.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/getSpatialContext.R
 3 | \name{radiusBasedGraph}
 4 | \alias{radiusBasedGraph}
 5 | \title{Create spatial network from neighbors within radius R}
 6 | \usage{
 7 | radiusBasedGraph(x, y, R, subset = 1)
 8 | }
 9 | \arguments{
10 | \item{x}{spatial coordinate}
11 | 
12 | \item{y}{spatial coordinate}
13 | 
14 | \item{R}{radius}
15 | 
16 | \item{subset}{same length as x,y (see Details)}
17 | }
18 | \value{
19 | sparse adjacency matrix with distances
20 | }
21 | \description{
22 | For each cell identify neighbors within distance \code{R} in Euclidean space
23 | and create an edge between them in graph structure, optionally subset cells
24 | (see Details).
25 | }
26 | \details{
27 | Edges will only be created for cells that have the same \code{subset} value,
28 | usually the slide column id but could also be a slide plus FOV id to only
29 | create edges within an FOV.
30 | }
31 | 


--------------------------------------------------------------------------------
/man/refineAnchors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/find_anchor_cells.R
 3 | \name{refineAnchors}
 4 | \alias{refineAnchors}
 5 | \title{Filter anchor candidates via projection of reference profiles to anchor-derived UMAP}
 6 | \usage{
 7 | refineAnchors(
 8 |   counts,
 9 |   neg = NULL,
10 |   bg = NULL,
11 |   align_genes = TRUE,
12 |   profiles,
13 |   anchor_candidates,
14 |   nn_cells = 500,
15 |   insufficient_anchors_thresh = 20
16 | )
17 | }
18 | \arguments{
19 | \item{counts}{Counts matrix, cells * genes.}
20 | 
21 | \item{neg}{Vector of mean negprobe counts per cell}
22 | 
23 | \item{bg}{Expected background}
24 | 
25 | \item{align_genes}{Logical, for whether to align the columns of the counts matrix and the rows of
26 | the profiles matrix based on their names.}
27 | 
28 | \item{profiles}{Matrix of reference profiles holding mean expression of genes x cell types. 
29 | Input linear-scale expression, with genes in rows and cell types in columns.}
30 | 
31 | \item{anchor_candidates}{Named vector of anchor candidates with cell_ID in name and corresponding cell type in values.}
32 | 
33 | \item{nn_cells}{Number of top nearest neighbors to the projected reference profiles to be selected as final anchor cells.}
34 | 
35 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than this many anchors will be discarded.}
36 | }
37 | \value{
38 | anchors, a named vector for the final anchor cells
39 | }
40 | \description{
41 | Calculates expression UMAP model for anchor candidates, then projects reference 
42 | profiles to the anchor-derived UMAP and select anchor candidates within top 
43 | nearest neighbors of the projected reference profiles of same cell type in the 
44 | UMAP as the final anchor cells.
45 | }
46 | 


--------------------------------------------------------------------------------
/man/refineClusters.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/refineClusters.R
 3 | \name{refineClusters}
 4 | \alias{refineClusters}
 5 | \title{Merge cell types in a clustering result}
 6 | \usage{
 7 | refineClusters(
 8 |   assay_type = c("rna", "protein"),
 9 |   merges = NULL,
10 |   to_delete = NULL,
11 |   subcluster = NULL,
12 |   logliks,
13 |   counts = NULL,
14 |   neg = NULL,
15 |   bg = NULL,
16 |   cohort = NULL
17 | )
18 | }
19 | \arguments{
20 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
21 | 
22 | \item{merges}{A named vector in which the elements give new cluster names and
23 | the names give old cluster names. OK to omit cell types that aren't being
24 | merged.}
25 | 
26 | \item{to_delete}{A vector of cluster names to delete. All cells assigned to
27 | these clusters will be reassigned to the next best cluster.}
28 | 
29 | \item{subcluster}{A list, where each element's name is a cell type to
30 | subcluster, and the element itself is the cluster number(s) to use. E.g.
31 | list("macrophages" = 2, "cancer" = 2:3)}
32 | 
33 | \item{logliks}{Matrix of log-likelihoods output by insitutype, cells in rows,
34 | clusters in columns}
35 | 
36 | \item{counts}{Counts matrix, cells * genes. Only needed if subclustering is
37 | run.}
38 | 
39 | \item{neg}{Vector of mean negprobe counts per cell. Only needed if
40 | subclustering is run.}
41 | 
42 | \item{bg}{Expected background. Optional, and only used if subclustering is
43 | run.}
44 | 
45 | \item{cohort}{Vector of cells' cohort memberships. Optional, and only needed
46 | if subclustering is run.}
47 | }
48 | \value{
49 | A list with four elements: \enumerate{ \item clust: a vector of
50 |   cluster assignments \item prob: Vector of posterior probabilities for each
51 |   cell type \item logliks: a matrix of log-likelihoods of all cells (rows)
52 |   under all clusters (columns) \item profiles: a matrix of the average
53 |   background-subtracted profile of each cell type after
54 |   merging/deleting/subclustering }
55 | }
56 | \description{
57 | Take a user-defined list of cell types to rename/combine, then re-compute
58 | cluster assignments and probabilities under the merged cell types.
59 | }
60 | \examples{
61 | #example merges argument:
62 | merges = c("macrophages" = "myeloid",  # merge 3 clusters
63 |            "monocytes" = "myeloid",
64 |            "mDC" = "myeloid",
65 |            "B-cells" = "lymphoid")    # just rename 1 cluster
66 | # example to_delete argument:
67 | to_delete = c("neutrophils")
68 | # example subcluster argument:
69 | subcluster = list("Myofibroblast" = 2:3)
70 | }
71 | 


--------------------------------------------------------------------------------
/man/spatialUpdate.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/spatialUpdate.R
 3 | \name{spatialUpdate}
 4 | \alias{spatialUpdate}
 5 | \title{Update cell typing results with spatial context or other alternative data}
 6 | \usage{
 7 | spatialUpdate(
 8 |   celltype,
 9 |   counts,
10 |   neg,
11 |   cohort = NULL,
12 |   altdata = NULL,
13 |   xy = NULL,
14 |   tissue = NULL,
15 |   nb_size = 10,
16 |   assay_type = c("rna", "protein")
17 | )
18 | }
19 | \arguments{
20 | \item{celltype}{Vector of cell type assignments to be updated}
21 | 
22 | \item{counts}{Counts matrix (or dgCMatrix), cells * genes.}
23 | 
24 | \item{neg}{Vector of mean negprobe counts per cell}
25 | 
26 | \item{cohort}{Vector of cells' cohort memberships. Output of a spatial clustering algorithm makes for good cohorts.}
27 | 
28 | \item{altdata}{Matrix of cells' alternative data values}
29 | 
30 | \item{xy}{2-column matrix of cells' xy positions.}
31 | 
32 | \item{tissue}{Vector giving cells' tissue IDs. Used to separate tissue with overlapping xy coordinates.}
33 | 
34 | \item{nb_size}{The size parameter to assume for the NB distribution.}
35 | 
36 | \item{assay_type}{A string specifying which assay values to use.}
37 | }
38 | \description{
39 | Takes cell typing results, then updates them based on alternative data types, 
40 | e.g. spatial context, morphology, or protein expression. Existing cell typing results are 
41 | put into Insitutype's likelihood framework, which then can use alternative data
42 | as a prior to be updated by the expression data to get a new posterior probability 
43 | of cell type.
44 | Performs this operation by 
45 | \enumerate{
46 | \item deriving cell type profiles using InSituType:::Estep(), 
47 | \item assigning cells to "cohorts" (clusters) derived from their alternative data
48 | \item inputting the output of steps (1) and (2) into InSituType::insitutype() to 
49 |  re-calculate cell type. 
50 | }
51 | Paths for using alternative data in priority order (choose one; if multiple are input, only the most downstream option will be used):
52 | \enumerate{
53 | \item Input \code{xy} positions (and possibly \code{tissue}). Then cells will be clustered 
54 |  into cohorts based on the expression pattern of their 50 nearest neighboring cells.
55 | \item Input a matrix of alternative data (\code{altdata}) to be automatically clustered into cohorts. This supersedes 
56 |  the altdata matrix derived from the \code{xy} argument.
57 | \item Input your own \code{cohort} vector. This supersedes the above inputs. 
58 | }
59 | }
60 | 


--------------------------------------------------------------------------------
/man/tonsil_annotation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{tonsil_annotation}
 5 | \alias{tonsil_annotation}
 6 | \title{Matrix of anchor cells' annotation file
 7 |  A matrix including cell_ID and cellType for anchor cells}
 8 | \format{
 9 | A matrix of 11844 cells and 2 columns
10 | }
11 | \usage{
12 | tonsil_annotation
13 | }
14 | \description{
15 | A matrix including cell_ID and cellType for anchor cells.
16 | }
17 | \keyword{datasets}
18 | 


--------------------------------------------------------------------------------
/man/tonsil_protein.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{tonsil_protein}
 5 | \alias{tonsil_protein}
 6 | \title{Small example SMI protein data from a tonsil tissue}
 7 | \format{
 8 | A list with the following elements:
 9 |  \itemize{
10 |  \item counts A matrix of raw counts, with cells in rows and proteins in columns
11 |  \item negs A matrix of IgG counts, with cells in rows and IgGs in columns
12 |  \item xy_coord x and y positions
13 |  \item UMAP umap projection
14 |  }
15 | }
16 | \usage{
17 | tonsil_protein
18 | }
19 | \description{
20 | A 21844-cells excerpt from a 68-plex SMI study of a tonsil tissue.
21 | }
22 | \keyword{datasets}
23 | 


--------------------------------------------------------------------------------
/man/tonsil_reference_profile.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{tonsil_reference_profile}
 5 | \alias{tonsil_reference_profile}
 6 | \title{Reference profile examples from a tonsil tissue
 7 | }
 8 | \format{
 9 | A list with the following elements:
10 |  \itemize{
11 |  \item tonsil_reference_profile A matrix of raw counts, with cells in rows and proteins in columns
12 |  \item counts A matrix of IgG counts, with cells in rows and IgGs in columns
13 |  \item xy_coord x and y positions
14 |  \item UMAP umap projection
15 |  }
16 | }
17 | \usage{
18 | tonsil_reference_profile
19 | }
20 | \description{
21 | Reference profile examples from a tonsil tissue
22 | 
23 | }
24 | \keyword{datasets}
25 | 


--------------------------------------------------------------------------------
/man/updateProfilesFromAnchors.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/rescaleProfiles.R
 3 | \name{updateProfilesFromAnchors}
 4 | \alias{updateProfilesFromAnchors}
 5 | \title{Use anchor cells to update reference profiles, simply by taking the mean
 6 | profile of the anchors.}
 7 | \usage{
 8 | updateProfilesFromAnchors(
 9 |   counts,
10 |   neg,
11 |   bg = NULL,
12 |   assay_type = c("rna", "protein"),
13 |   anchors
14 | )
15 | }
16 | \arguments{
17 | \item{counts}{Counts matrix, cells * genes.}
18 | 
19 | \item{neg}{Vector of mean negprobe counts per cell. Can be provided}
20 | 
21 | \item{bg}{Expected background}
22 | 
23 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
24 | 
25 | \item{anchors}{Vector of anchor assignments}
26 | }
27 | \value{
28 | \enumerate{ 
29 | \item updated_profiles: A mean profiles matrix with the rows rescaled
30 | according to platform effects and individual elements updated further 
31 | \item updated_sds: A mean profiles matrix with the rows rescaled
32 | according to platform effects and individual elements updated further}
33 | }
34 | \description{
35 | Uses anchor cells to estimate platform effects / scaling factors to be
36 | applied to the genes/rows of the reference profile matrix. Then uses Bayesian
37 | math to update the individual elements on X.
38 | }
39 | 


--------------------------------------------------------------------------------
/man/updateReferenceProfiles.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/rescaleProfiles.R
  3 | \name{updateReferenceProfiles}
  4 | \alias{updateReferenceProfiles}
  5 | \title{Update reference profiles}
  6 | \usage{
  7 | updateReferenceProfiles(
  8 |   reference_profiles,
  9 |   reference_sds,
 10 |   counts,
 11 |   neg,
 12 |   assay_type = c("rna", "protein"),
 13 |   bg = NULL,
 14 |   nb_size = 10,
 15 |   anchors = NULL,
 16 |   n_anchor_cells = 2000,
 17 |   min_anchor_cosine = 0.3,
 18 |   min_anchor_llr = 0.01,
 19 |   insufficient_anchors_thresh = 20,
 20 |   refinement = FALSE,
 21 |   blacklist = NULL,
 22 |   rescale = FALSE,
 23 |   refit = TRUE
 24 | )
 25 | }
 26 | \arguments{
 27 | \item{reference_profiles}{Matrix of reference mean profiles,
 28 | genes * cell types.}
 29 | 
 30 | \item{reference_sds}{Matrix of standard deviation profiles, genes * cell types. Only for assay_type of protein.}
 31 | 
 32 | \item{counts}{Counts matrix, cells * genes.}
 33 | 
 34 | \item{neg}{Vector of mean negprobe counts per cell}
 35 | 
 36 | \item{assay_type}{Assay type of RNA, protein (default = "rna")}
 37 | 
 38 | \item{bg}{Expected background}
 39 | 
 40 | \item{nb_size}{The size parameter to assume for the NB distribution. Only for assay_type of RNA}
 41 | 
 42 | \item{anchors}{named vector giving "anchor" cell types with cell_id in names, 
 43 | for use in semi-supervised clustering. Vector elements will be mainly NA's 
 44 | (for non-anchored cells) and cell type names for cells to be held constant 
 45 | throughout iterations.}
 46 | 
 47 | \item{n_anchor_cells}{For semi-supervised learning. Maximum number of anchor
 48 | cells to use for each cell type.}
 49 | 
 50 | \item{min_anchor_cosine}{For semi-supervised learning. Cells must have at
 51 | least this much cosine similarity to a fixed profile to be used as an
 52 | anchor.}
 53 | 
 54 | \item{min_anchor_llr}{For semi-supervised learning. Cells must have
 55 | (log-likelihood ratio / totalcounts) above this threshold to be used as an
 56 | anchor}
 57 | 
 58 | \item{insufficient_anchors_thresh}{Cell types that end up with fewer than
 59 | this many anchors will be discarded.}
 60 | 
 61 | \item{refinement}{Logical, flag for further anchor refinement via UMAP projection (default = FALSE)}
 62 | 
 63 | \item{blacklist}{vector of genes to be excluded for cell typing (default = NULL)}
 64 | 
 65 | \item{rescale}{Logical, flag for platform effect correction (default = FALSE).}
 66 | 
 67 | \item{refit}{Logical, flag for fitting reference profiles to anchors, run after rescale if rescale = TRUE (default = TRUE)}
 68 | }
 69 | \value{
 70 | a list 
 71 | \describe{
 72 |     \item{updated_profiles}{a genes * cell types matrix for final updated reference profiles}
 73 |     \item{blacklist}{a vector of genes excluded from the final updated reference profiles}
 74 |     \item{anchors}{a named vector for final anchors used for reference profile update}
 75 |     \item{rescale_res}{a list of 5 elements, `rescaled_profiles`, `platformEff_statsDF`, `anchors`, `blacklist` and `lostgenes`, for platform effect correction outputs, return when rescale = TRUE}
 76 |     \item{refit_res}{a list of 2 elements, `refitted_profiles` and `anchors`, for anchor-based profile refitting outputs, return when refit = TRUE}
 77 | }
 78 | }
 79 | \description{
 80 | Update reference profiles using pre-specified anchor cells, or if no anchors
 81 | are specified, by first choosing anchor cells. Option to return reference 
 82 | profiles rescaled for platform effect and/or to return further refitted profiles 
 83 | based on the observed profiles of anchor cells.
 84 | }
 85 | \examples{
 86 | data("mini_nsclc")
 87 | data("ioprofiles")
 88 | counts <- mini_nsclc$counts
 89 | ## estimate per-cell bg as a fraction of total counts:
 90 | negmean.per.totcount <- mean(rowMeans(mini_nsclc$neg)) / mean(rowSums(counts))
 91 | per.cell.bg <- rowSums(counts) * negmean.per.totcount
 92 | astats <- get_anchor_stats(counts = mini_nsclc$counts, 
 93 |                            assay_type="RNA", 
 94 |                            neg = Matrix::rowMeans(mini_nsclc$neg),
 95 |                            profiles = ioprofiles,
 96 |                            sds=NULL)
 97 | 
 98 | # now choose anchors:
 99 | anchors <- choose_anchors_from_stats(counts = counts, 
100 |                                     neg = mini_nsclc$negmean, 
101 |                                     bg = per.cell.bg,
102 |                                     anchorstats = astats, 
103 |                                     # a very low value chosen for the mini
104 |                                     # dataset. Typically hundreds of cells
105 |                                     # would be better.
106 |                                     n_cells = 50, 
107 |                                     min_cosine = 0.4, 
108 |                                     min_scaled_llr = 0.03, 
109 |                                     insufficient_anchors_thresh = 5,
110 |                                     assay_type="RNA")
111 | 
112 | # The next step is to use the anchors to update the reference profiles:
113 | 
114 | updateReferenceProfiles(reference_profiles = ioprofiles,
115 |                         reference_sds = NULL,
116 |                         counts = mini_nsclc$counts, 
117 |                         neg = mini_nsclc$neg, 
118 |                         assay_type = "rna", 
119 |                         bg = per.cell.bg,
120 |                         anchors = anchors) 
121 | }
122 | 


--------------------------------------------------------------------------------
/man/update_logliks_with_cohort_freqs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/nbclust.R
 3 | \name{update_logliks_with_cohort_freqs}
 4 | \alias{update_logliks_with_cohort_freqs}
 5 | \title{Update logliks based on frequencies}
 6 | \usage{
 7 | update_logliks_with_cohort_freqs(
 8 |   logliks,
 9 |   cohort,
10 |   minfreq = 1e-06,
11 |   nbaselinecells = 50
12 | )
13 | }
14 | \arguments{
15 | \item{logliks}{Matrix of cells' (rows) loglikelihoods under clusters (columns)}
16 | 
17 | \item{cohort}{Vector of cells' cohort memberships}
18 | 
19 | \item{minfreq}{Minimum frequency to give any cell type in any cohort}
20 | 
21 | \item{nbaselinecells}{Number of cells from baseline distribution to add to the 
22 | cohort-specific frequencies, thereby shrinking each cohort's data towards the population}
23 | }
24 | \value{
25 | An adjusted logliks matrix
26 | }
27 | \description{
28 | Update logliks based on frequencies
29 | }
30 | 


--------------------------------------------------------------------------------
/reqs.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | 
  4 | 
  5 | #### Reqs for insitutype:
  6 | Insitutype performs unsupervised clustering, or semi-supervised clustering if
  7 | provided with reference profiles. It uses an Expectation-Maximization (EM) algorithm based on a negbinom 
  8 | distribution. Insitutype coordinates calls to nbclust(), which runs the EM algorithm.
  9 | 
 10 | ##### Inputs:
 11 | - an expression matrix (cells * genes)
 12 | - a vector of mean negprobe values
 13 | - for semi-supervised learning, a matrix of reference profiles
 14 | - additional arguments for finer control
 15 | 
 16 | ##### Outputs:
 17 | A list, with the following elements:
 18 | 
 19 | - clust: a vector giving cells' cluster assignments
 20 | - prob: a vector giving the confidence in each cell's cluster
 21 | - logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns.
 22 | - profiles: a matrix of cluster-specific expression profiles
 23 | - anchors: from semi-supervised clustering: a vector giving the identities and cell types of anchor cells
 24 | 
 25 | 
 26 | 
 27 | #### Reqs for insitutypeML:
 28 | InsitutypeML performs supervised cell typing using a Bayes classifier based on a negbinom distribution. 
 29 | 
 30 | ##### Inputs:
 31 | - an expression matrix (cells * genes)
 32 | - a vector of mean negprobe values
 33 | - a matrix of reference profiles
 34 | - additional arguments for finer control
 35 | 
 36 | ##### Outputs:
 37 | A list, with the following elements:
 38 | 
 39 | - clust: a vector giving cells' cluster assignments
 40 | - prob: a vector giving the confidence in each cell's cluster
 41 | - logliks: Matrix of cells' log-likelihoods under each cluster. Cells in rows, clusters in columns.
 42 | - profiles: a matrix of cluster-specific expression profiles
 43 | 
 44 | 
 45 | 
 46 | #### Reqs for updateReferenceProfiles
 47 | Update reference profiles from alternative platforms to better fit the spatial platform. 
 48 | Uses pre-specified anchor cells, or if no anchors are specified, by first choosing anchor cells.
 49 | 
 50 | ##### Inputs:
 51 | - reference profiles
 52 | - spatial data: counts matrix, negmean values
 53 | - additional arguments for finer control
 54 | 
 55 | ##### Outputs:
 56 | - An updated reference matrix
 57 | - A vector storing the anchor cells used
 58 | 
 59 | #### Reqs for refineClusters
 60 | A function for refining the output of insitutype and insitutypeML. 
 61 | Can delete clusters, merge/rename clusters, or sub-cluster clusters. 
 62 | 
 63 | ##### Inputs:
 64 | - Results from an insitutype/insitutypeML run
 65 | - If subclustering further, counts data
 66 | 
 67 | ##### Outputs:
 68 | A list in the format of insitutype results with updated cluster assignments. 
 69 | 
 70 | 
 71 | 
 72 | #### Reqs for chooseClusterNumber
 73 | A function to run insitutype across a range of cluster numbers and identify the best fit
 74 | 
 75 | ##### Inputs:
 76 | - The standard insitutype inputs
 77 | - A range of cluster numbers
 78 | 
 79 | ##### Outputs:
 80 | - A suggested cluster number, plus metrics for comparing cluster numbers.
 81 | 
 82 | 
 83 | 
 84 | 
 85 | #### Reqs for get_anchor_stats
 86 | Function to calculate the summary stats used by anchor cell selection. 
 87 | Results are meant to be fed to choose_anchors_from_stats().
 88 | 
 89 | ##### Inputs:
 90 | - The same expression data used by insitutype.
 91 | - Reference profiles
 92 | 
 93 | ##### Outputs:
 94 | - A matrix of cosine distances of cells * cell types
 95 | - A matrix of log likelihood ratio scores for cells * cell types
 96 | 
 97 | 
 98 | 
 99 | #### Reqs for choose_anchors_from_stats
100 | Chooses anchor cells given cosine distances and log likelihood ratio scores 
101 | output by get_anchor_stats. 
102 | 
103 | ##### Inputs:
104 | - A matrix of cosine distances of cells * cell types
105 | - A matrix of log likelihood ratio scores for cells * cell types
106 | 
107 | ##### Outputs:
108 | A vector of anchor assignments. 
109 | 
110 | 
111 | 
112 | #### Reqs for find_anchor_cells
113 | Complete anchor cell selection workflow. Calls get_anchor_stats and choose_anchors_from_stats.
114 | 
115 | ##### Inputs:
116 | - The same expression data used by insitutype.
117 | - Reference profiles
118 | 
119 | ##### Outputs:
120 | A vector of anchor assignments. 
121 | 
122 | 
123 | 
124 | #### Reqs for flightpath_layout
125 | A function to define the layout for a flightpath plot. Uses UMAP to place cluster centroids,
126 | then places cells based on their posterior probabilities of belonging to each centroid.
127 | 
128 | ##### Inputs:
129 | - A matrix of cell * cluster log-likelihoods (output by insitutype)
130 | - A matrix of cluster profiles
131 | 
132 | ##### Outputs:
133 | - xy placements for cluster centroids
134 | - xy placements for individual cells
135 | 
136 | 
137 | 
138 | #### Reqs for flightpath_plot
139 | Makes a ggplot object holding a flightpath plot. Uses UMAP to place cluster centroids,
140 | then places cells based on their posterior probabilities of belonging to each centroid.
141 | 
142 | ##### Inputs:
143 | - Path 1: input an insitutype/insitutypeML result, and it will call flightpath_layout()
144 | - Path 2: input a flightpath_layout result. 
145 | 
146 | ##### Outputs:
147 | A ggplot object
148 | 
149 | 
150 | #### Reqs for fastCohorting
151 | Quickly clusters data from alternative sources like immunofluorescence and spatial context. 
152 | 
153 | ##### Inputs:
154 | - A matrix holding alternative data (cells * variables)
155 | - Arguments for finer control
156 | 
157 | ##### Output:
158 | A vector giving each cell's cohort assignment. 
159 | 


--------------------------------------------------------------------------------
/specs.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #### Specs for insitutypeML:
 4 | - Returns a vector of cell type assignments  -- test: test_insitutype.R#L54
 5 | - Returns a vector of posterior probabilities / confidence scores   -- test: test_insitutype.R#L55
 6 | - Returns a matrix of cell * cell type log-likelihoods  -- test: test_insitutype.R#L56
 7 | - Returns a matrix of cell type profiles  -- test: test_insitutype.R#L57
 8 | 
 9 | 
10 | #### Specs for insitutype:
11 | - If run with fixed_profiles and 0 new clusters, produces valid outputs:
12 |   - Returns a vector of cell type assignments  -- test: test_insitutype.R#L82
13 | - Returns a vector of posterior probabilities / confidence scores   -- test: test_insitutype.R#L83
14 | - Returns a matrix of cell * cell type log-likelihoods  -- test: test_insitutype.R#L84
15 | - Returns a matrix of cell type profiles  -- test: test_insitutype.R#L85
16 | - If run with no fixed_profiles (fully unsupervised), produces valid outputs:
17 |   - Returns a vector of cell type assignments  -- test: test_insitutype.R#L112
18 | - Returns a vector of posterior probabilities / confidence scores   -- test: test_insitutype.R#L113
19 | - Returns a matrix of cell * cell type log-likelihoods  -- test: test_insitutype.R#L114
20 | - Returns a matrix of cell type profiles  -- test: test_insitutype.R#L115
21 | - If unsupervised clustering is run with initial clusters specified, produces valid outputs:
22 |   - Returns a vector of cell type assignments  -- test: test_insitutype.R#L144
23 | - Returns a vector of posterior probabilities / confidence scores   -- test: test_insitutype.R#L145
24 | - Returns a matrix of cell * cell type log-likelihoods  -- test: test_insitutype.R#L146
25 | - Returns a matrix of cell type profiles  -- test: test_insitutype.R#L147
26 | - The clusters returned have the same names as the initial clusters  -- test: test_insitutype.R#L148
27 | - If semi-supervised clustering is run with initial clusters specified, produces valid outputs:
28 |   - Returns a vector of cell type assignments  -- test: test_insitutype.R#L177
29 | - Returns a vector of posterior probabilities / confidence scores   -- test: test_insitutype.R#L178
30 | - Returns a matrix of cell * cell type log-likelihoods  -- test: test_insitutype.R#L179
31 | - Returns a matrix of cell type profiles  -- test: test_insitutype.R#L180
32 | 
33 | 
34 | #### Specs for updateReferenceProfiles:
35 | - Returns a matrix of new profiles  -- test: test_insitutype.R#L316
36 | - Returns a vector of anchor assignments  -- test: test_insitutype.R#L317
37 | 
38 | 
39 | #### Specs for refineClusters:
40 | - Merging operations happen correctly  -- test: test_insitutype.R#L196
41 | - Cell names are preserved  -- test: test_insitutype.R#L196
42 | - Makes no changes if none are requested  -- test: test_insitutype.R#L302
43 | - Merging operations happen correctly if merges and deletions are asked for  -- test: test_insitutype.R#L307
44 | - Merging operations happen correctly if merges are asked for  -- test: test_refinecells_cell_merging_logic.R#L14,18,22
45 | 
46 | 
47 | #### Specs for chooseClusterNumber:
48 | - Returns a single value for "best cluster number"  -- test: test_insitutype.R#L219
49 | - Reports the cluster numbers considered  -- test: test_insitutype.R#L220
50 | - Reports the log likelihood from each cluster number  -- test: test_insitutype.R#L221
51 | - Reports the AIC from each cluster number  -- test: test_insitutype.R#L222
52 | - Reports the BIC from each cluster number  -- test: test_insitutype.R#L223
53 | 
54 | 
55 | #### Specs for get_anchor_stats
56 | - Returns a matrix of cosine distances  -- test: test_insitutype.R#L236
57 | - Returns a matrix of log likelihood ratios  -- test: test_insitutype.R#L237
58 | 
59 | #### Specs for choose_anchors_from_stats
60 | - Assigns values consistent with the cell type names of the inputs  -- test: test_insitutype.R#L253
61 | - Assigns no more than the specified number of anchors per cell type  -- test: test_insitutype.R#L253
62 | - The anchors vector aligns to the rows of the counts matrix (cells)  -- test: test_insitutype.R#L254
63 | 
64 | #### Specs for find_anchor_cells
65 | - Assigns values consistent with the cell type names of the inputs  -- test: test_insitutype.R#L271
66 | - Assigns no more than the specified number of anchors per cell type  -- test: test_insitutype.R#L272
67 | - The anchors vector aligns to the rows of the counts matrix (cells)  -- test: test_insitutype.R#L273
68 | - Returns NULL if no cells meet anchor criteria  -- test: test_insitutype.R#L288
69 | 
70 | 
71 | #### Specs for flightpath_layout
72 | - Returns correctly formatted results:
73 |   - Cluster positions are in a 2-column matrix  -- test: test_flightpath.R#L34
74 | - Cell positions are in a 2-column matrix  -- test: test_flightpath.R#L35
75 | - There are no missing cluster positions  -- test: test_flightpath.R#L36
76 | - There are no missing cell positions  -- test: test_flightpath.R#L37
77 | 
78 | #### Specs for flightpath_plot
79 | - when passed a result from flightpath_layout, flightpath_plot returns a ggplot object  -- test: test_flightpath.R#L43
80 | - when passed an insitutype results, flightpath_plot returns a ggplot object  -- test: test_flightpath.R#L51
81 | - when asked to show meanConfidence, flightpath_plot returns a ggplot object  -- test: test_flightpath.R#L57
82 | 
83 | 
84 | #### Specs for fastCohorting
85 | - Returns a vector of cohort assignments  -- test: test_insitutype.R#L325
86 | - Returns the specified number of unique cohorts  -- test: test_insitutype.R#L325
87 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
 1 | 
 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 
 3 | ## enable compilation with C++11 (where available)
 4 | ##
 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider
 6 | ## availability of the package we do not yet enforce this here.  It is however
 7 | ## recommended for client packages to set it.
 8 | ##
 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP
10 | ## support within Armadillo prefers / requires it
11 | CXX_STD = CXX11
12 | 
13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD=1
14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)


--------------------------------------------------------------------------------
/src/Makevars.win:
--------------------------------------------------------------------------------
 1 | 
 2 | ## With R 3.1.0 or later, you can uncomment the following line to tell R to 
 3 | ## enable compilation with C++11 (where available)
 4 | ##
 5 | ## Also, OpenMP support in Armadillo prefers C++11 support. However, for wider
 6 | ## availability of the package we do not yet enforce this here.  It is however
 7 | ## recommended for client packages to set it.
 8 | ##
 9 | ## And with R 3.4.0, and RcppArmadillo 0.7.960.*, we turn C++11 on as OpenMP
10 | ## support within Armadillo prefers / requires it
11 | CXX_STD = CXX11
12 | 
13 | PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DARMA_64BIT_WORD=1
14 | PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
15 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <RcppArmadillo.h>
 5 | #include <Rcpp.h>
 6 | 
 7 | using namespace Rcpp;
 8 | 
 9 | #ifdef RCPP_USE_GLOBAL_ROSTREAM
10 | Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
11 | Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
12 | #endif
13 | 
14 | // lls_rna: generated .Call wrapper that converts five SEXP arguments, calls lls_rna(), and wraps the result
15 | Rcpp::NumericMatrix lls_rna(arma::sp_mat& mat, arma::vec& bgsub, arma::mat& x, arma::vec& bg, int& size_dnb); // forward declaration only; the definition is not in this generated file
16 | RcppExport SEXP _InSituType_lls_rna(SEXP matSEXP, SEXP bgsubSEXP, SEXP xSEXP, SEXP bgSEXP, SEXP size_dnbSEXP) {
17 | BEGIN_RCPP // Rcpp macro paired with END_RCPP below; NOTE(review): presumably translates C++ exceptions into R errors
18 |     Rcpp::RObject rcpp_result_gen;
19 |     Rcpp::RNGScope rcpp_rngScope_gen; // Rcpp RNG scope guard held for the duration of the call
20 |     Rcpp::traits::input_parameter< arma::sp_mat& >::type mat(matSEXP); // each SEXP is converted to the C++ type lls_rna() declares
21 |     Rcpp::traits::input_parameter< arma::vec& >::type bgsub(bgsubSEXP);
22 |     Rcpp::traits::input_parameter< arma::mat& >::type x(xSEXP);
23 |     Rcpp::traits::input_parameter< arma::vec& >::type bg(bgSEXP);
24 |     Rcpp::traits::input_parameter< int& >::type size_dnb(size_dnbSEXP);
25 |     rcpp_result_gen = Rcpp::wrap(lls_rna(mat, bgsub, x, bg, size_dnb)); // run the implementation and wrap its NumericMatrix for R
26 |     return rcpp_result_gen;
27 | END_RCPP
28 | }
29 | // lls_protein: generated .Call wrapper that converts four SEXP arguments, calls lls_protein(), and wraps the result
30 | Rcpp::NumericMatrix lls_protein(arma::mat& mat, arma::vec& bgsub, arma::mat& x, arma::mat& xsd); // forward declaration only; the definition is not in this generated file
31 | RcppExport SEXP _InSituType_lls_protein(SEXP matSEXP, SEXP bgsubSEXP, SEXP xSEXP, SEXP xsdSEXP) {
32 | BEGIN_RCPP // Rcpp macro paired with END_RCPP below; NOTE(review): presumably translates C++ exceptions into R errors
33 |     Rcpp::RObject rcpp_result_gen;
34 |     Rcpp::RNGScope rcpp_rngScope_gen; // Rcpp RNG scope guard held for the duration of the call
35 |     Rcpp::traits::input_parameter< arma::mat& >::type mat(matSEXP); // each SEXP is converted to the C++ type lls_protein() declares
36 |     Rcpp::traits::input_parameter< arma::vec& >::type bgsub(bgsubSEXP);
37 |     Rcpp::traits::input_parameter< arma::mat& >::type x(xSEXP);
38 |     Rcpp::traits::input_parameter< arma::mat& >::type xsd(xsdSEXP);
39 |     rcpp_result_gen = Rcpp::wrap(lls_protein(mat, bgsub, x, xsd)); // run the implementation and wrap its NumericMatrix for R
40 |     return rcpp_result_gen;
41 | END_RCPP
42 | }
43 | 
44 | static const R_CallMethodDef CallEntries[] = { // table of .Call routines: {name, function pointer, number of arguments}
45 |     {"_InSituType_lls_rna", (DL_FUNC) &_InSituType_lls_rna, 5},
46 |     {"_InSituType_lls_protein", (DL_FUNC) &_InSituType_lls_protein, 4},
47 |     {NULL, NULL, 0} // all-NULL entry marks the end of the table
48 | };
49 | 
50 | RcppExport void R_init_InSituType(DllInfo *dll) { // registers the CallEntries table for this shared library
51 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); // only the .Call slot is populated; the other tables are NULL
52 |     R_useDynamicSymbols(dll, FALSE); // FALSE: dynamic symbol lookup is turned off for this DLL
53 | }
54 | 


--------------------------------------------------------------------------------
/src/rcpparma_dnbinom_sparse.cpp:
--------------------------------------------------------------------------------
  1 | // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; indent-tabs-mode: nil; -*-
  2 | 
  3 | // we only include RcppArmadillo.h which pulls Rcpp.h in for us
  4 | #include <RcppArmadillo.h>
  5 | 
  6 | // via the depends attribute we tell Rcpp to create hooks for
  7 | // RcppArmadillo so that the build process will know what to do
  8 | //
  9 | // [[Rcpp::depends(RcppArmadillo)]]
 10 | // [[Rcpp::plugins(cpp11)]]
 11 | using namespace Rcpp; 
 12 | using namespace arma;
 13 | 
 14 | // Add a flag to enable OpenMP at compile time
 15 | // [[Rcpp::plugins(openmp)]]
 16 | 
 17 | // Protect against compilers without OpenMP
 18 | #ifdef _OPENMP
 19 | #include <omp.h>
 20 | 
 21 | // Cached upper bound on worker threads; -1 means "not computed yet".
 22 | static int NBthreads = -1;
 23 | // Thread count to request for a loop over n_profiles columns:
 24 | // n_profiles + 2, capped by the cached bound, and never below 1.
 25 | int get_lldist_threads(const int n_profiles) {
 26 |   if (NBthreads == -1) {
 27 |     // Start from 80% of the cores, then respect the OpenMP limits.
 28 |     NBthreads = floor(0.8*omp_get_num_procs());
 29 |     NBthreads = std::min(NBthreads, omp_get_thread_limit());
 30 |     NBthreads = std::min(NBthreads, omp_get_max_threads());
 31 |     NBthreads = std::max(NBthreads, 1); // floor(0.8 * 1 core) would be 0
 32 |   }
 33 |   return std::max(1, std::min(n_profiles + 2, NBthreads));
 34 | }
 35 | #endif
 36 | 
 37 | //' Negative binomial log-likelihoods of cells under reference profiles
 38 | //'
 39 | //' For cell i and profile k, sums over genes j the log density of mat(i, j)
 40 | //' whose mean is s * x(j, k) + bg(i), with s = bgsub(i) / (sum of column k of x).
 41 | //'
 42 | //' @param mat dgCMatrix expression counts (cells in rows, genes in columns)
 43 | //' @param bgsub numeric vector, one entry per cell; divided by each profile's column sum
 44 | //' @param x numeric matrix of reference profiles (genes x profiles)
 45 | //' @param bg numeric vector of background level, one entry per cell
 46 | //' @param size_dnb int Dispersion parameter
 47 | //'
 48 | //' @return matrix (cells x profiles) of summed log densities
 49 | //' @useDynLib InSituType, .registration = TRUE
 50 | //' @importFrom Rcpp evalCpp
 51 | //' @exportPattern "^[[:alpha:]]+" 
 52 | //' @export
 53 | // [[Rcpp::export]]
 54 | Rcpp::NumericMatrix
 55 |   lls_rna(arma::sp_mat& mat, arma::vec& bgsub, arma::mat& x, arma::vec& bg, int& size_dnb) {
 56 |     // res, s and bg are indexed without bounds checks below, so reject inputs
 57 |     // that would run past their ends before any thread is started.
 58 |     if (bgsub.n_elem > mat.n_rows || bg.n_elem < bgsub.n_elem || x.n_rows > mat.n_cols) {
 59 |       Rcpp::stop("lls_rna: bgsub, bg and x are not conformable with mat");
 60 |     }
 61 |     const unsigned int K = x.n_cols;
 62 |     Rcpp::NumericMatrix res(mat.n_rows, K);
 63 |     // Each thread owns whole columns of res, so writes never overlap.
 64 | #pragma omp parallel for num_threads(get_lldist_threads(K))
 65 |     for (unsigned int k = 0; k < K; k++) {
 66 |       const arma::vec s = bgsub / sum(x.col(k)); // per-cell scaling factor
 67 |       for (arma::uword j = 0; j < x.n_rows; ++j) {
 68 |         const double ref = x.at(j, k);
 69 |         for (arma::uword i = 0; i < s.n_elem; ++i) {
 70 |           const double yhat = s[i] * ref + bg[i];
 71 |           res(i, k) += R::dnbinom_mu(mat(i, j), size_dnb, yhat, 1);
 72 |         }
 73 |       }
 74 |     }
 75 |     return res;
 76 |   }
 77 | 
 78 | //' Gaussian log-likelihoods of cells under reference profiles
 79 | //'
 80 | //' For cell i and profile k, sums over proteins j the log normal density of
 81 | //' mat(i, j) with mean s * x(j, k) and sd s * xsd(j, k), where
 82 | //' s = bgsub(i) / (sum of column k of x).
 83 | //'
 84 | //' @param mat dense numeric expression matrix (cells x proteins)
 85 | //' @param bgsub numeric vector, one entry per cell; divided by each profile's column sum
 86 | //' @param x numeric matrix of reference mean profiles (proteins x profiles)
 87 | //' @param xsd numeric matrix of reference SD profiles, same layout as x
 88 | //' 
 89 | //' @return matrix (cells x profiles) of summed log densities
 90 | //' @useDynLib InSituType, .registration = TRUE
 91 | //' @importFrom Rcpp evalCpp
 92 | //' @exportPattern "^[[:alpha:]]+" 
 93 | //' @export
 94 | // [[Rcpp::export]]
 95 | Rcpp::NumericMatrix
 96 |   lls_protein(arma::mat& mat, arma::vec& bgsub, arma::mat& x, arma::mat& xsd) {
 97 |     // res, s and xsd are indexed without bounds checks below, so reject inputs
 98 |     // that would run past their ends before any thread is started.
 99 |     if (bgsub.n_elem > mat.n_rows || x.n_rows > mat.n_cols ||
100 |         xsd.n_rows < x.n_rows || xsd.n_cols < x.n_cols) {
101 |       Rcpp::stop("lls_protein: bgsub, x and xsd are not conformable with mat");
102 |     }
103 |     const unsigned int K = x.n_cols;
104 |     Rcpp::NumericMatrix res(mat.n_rows, K);
105 |     // Each thread owns whole columns of res, so writes never overlap.
106 | #pragma omp parallel for num_threads(get_lldist_threads(K))
107 |     for (unsigned int k = 0; k < K; k++) {
108 |       const arma::vec s = bgsub / sum(x.col(k)); // per-cell scaling factor
109 |       for (arma::uword j = 0; j < x.n_rows; ++j) {
110 |         const double ref_mean = x.at(j, k);
111 |         const double ref_sd = xsd.at(j, k);
112 |         for (arma::uword i = 0; i < s.n_elem; ++i) {
113 |           const double yhat = s[i] * ref_mean;
114 |           res(i, k) += R::dnorm(mat(i, j), yhat, s[i] * ref_sd, 1);
115 |         }
116 |       }
117 |     }
118 |     return res;
119 |   }
120 |  


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat) # test framework
2 | library(InSituType) # package under test
3 | test_check("InSituType") # run the testthat suite for the InSituType package
4 | 


--------------------------------------------------------------------------------
/tests/testthat/test-colorCellTypes.R:
--------------------------------------------------------------------------------
 1 | # Mock cell-type abundances: ten made-up types plus four types that have
 2 | # pre-assigned colors in iocolors.
 3 | data("iocolors")
 4 | 
 5 | set.seed(0)
 6 | cells <- sample(c(letters[1:10], names(iocolors)[1:4]), 100, replace = TRUE)
 7 | tab <- table(cells)
 8 | 
 9 | # colors from names only:
10 | cols_names <- colorCellTypes(names = names(tab), freqs = NULL, init_colors = NULL,
11 |                              max_sum_rgb = 600, palette = "brewers")
12 | 
13 | # colors from abundance info:
14 | cols_freqs <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL,
15 |                              max_sum_rgb = 600)
16 | 
17 | # colors when some cell types have pre-specified colors:
18 | cols_init <- colorCellTypes(names = NULL, freqs = tab, init_colors = iocolors,
19 |                             max_sum_rgb = 600)
20 | 
21 | test_that("pre-specified colors are used", {
22 |   sharedcells <- intersect(names(tab), names(iocolors))
23 |   # expect_equal() reports the differing elements; expect_true(all.equal())
24 |   # would only report that a character vector is not TRUE.
25 |   expect_equal(cols_init[sharedcells], iocolors[sharedcells])
26 | })
27 | 
28 | 
29 | # legal colors are returned in all cases, with names covering the cell names:
30 | test_that("results returned by colorCellTypes have the right formats", {
31 |   expect_error(plot(seq_along(tab), col = cols_names), NA) # "NA" means expecting no error
32 |   expect_error(plot(seq_along(tab), col = cols_freqs), NA) # "NA" means expecting no error
33 |   expect_error(plot(seq_along(tab), col = cols_init), NA) # "NA" means expecting no error
34 |   expect_true(all(names(tab) %in% names(cols_names)))
35 |   expect_true(all(names(tab) %in% names(cols_freqs)))
36 |   expect_true(all(names(tab) %in% names(cols_init)))
37 | })
38 | 
39 | # it still works if the pre-specified colors have no overlap with the cell names:
40 | test_that("correct results even if prespecified colors have no overlap", {
41 |   cols_bad_init <- colorCellTypes(names = NULL, freqs = tab,
42 |                                   init_colors = c(no = "red", nope = "blue"),
43 |                                   max_sum_rgb = 600)
44 |   expect_error(plot(seq_along(tab), col = cols_bad_init), NA) # "NA" means expecting no error
45 |   expect_true(all(names(tab) %in% names(cols_bad_init)))
46 | })
47 | 
48 | 
49 | # all 3 palettes work:
50 | test_that("all 3 palettes work", {
51 |   cols_tab20 <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL,
52 |                                max_sum_rgb = 600, palette = "tableau20")
53 |   cols_brew <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL,
54 |                               max_sum_rgb = 600, palette = "brewers")
55 |   cols_earth <- colorCellTypes(names = NULL, freqs = tab, init_colors = NULL,
56 |                                max_sum_rgb = 600, palette = "earthplus")
57 |   expect_error(plot(seq_along(tab), col = cols_tab20), NA) # "NA" means expecting no error
58 |   expect_error(plot(seq_along(tab), col = cols_brew), NA) # "NA" means expecting no error
59 |   expect_error(plot(seq_along(tab), col = cols_earth), NA) # "NA" means expecting no error
60 | })
61 | 


--------------------------------------------------------------------------------
/tests/testthat/test_RCppExports.R:
--------------------------------------------------------------------------------
 1 | # Check the log-likelihoods returned by lldist() against the same sums
 2 | # computed with the stats package density functions.
 3 | 
 4 | # RNA inputs: genes shared by the counts and the first reference profile
 5 | data("ioprofiles")
 6 | data("mini_nsclc")
 7 | bg <- Matrix::rowMeans(mini_nsclc$neg)
 8 | genes <- intersect(dimnames(mini_nsclc$counts)[[2]], dimnames(ioprofiles)[[1]])
 9 | mat <- mini_nsclc$counts[, genes]
10 | x <- ioprofiles[genes, 1, drop = FALSE]
11 | 
12 | test_that("Rcpp calculation is same as stats package for RNA data type", {
13 |   bgsub <- pmax(sweep(mat, 1, bg, "-"), 0)
14 |   s <- Matrix::rowSums(bgsub) / sum(x)
15 |   # cells whose background-subtracted total is not positive use raw totals
16 |   s[s <= 0] <- Matrix::rowSums(mat[s <= 0, , drop = FALSE]) / sum(x)
17 |   result <- lldist(mat = as(mat, "dgCMatrix"),
18 |                    assay_type = "rna",
19 |                    x = x,
20 |                    bg = bg,
21 |                    size = 10)
22 |   names(result) <- rownames(mat)
23 | 
24 |   # reference: per-cell sum of negative binomial log densities
25 |   yhat <- sweep(s %*% t(x), 1, bg, "+")
26 |   lls <- stats::dnbinom(x = as.matrix(mat), size = 10, mu = yhat, log = TRUE)
27 |   result_ref <- round(rowSums(lls), digits = 2)
28 | 
29 |   # expect_equal() reports the differences; expect_true(all.equal()) would
30 |   # only report that a character vector is not TRUE.
31 |   expect_equal(result[, 1], result_ref)
32 | })
33 | 
34 | 
35 | # Protein inputs: proteins shared by the counts and the first reference profile
36 | data("tonsil_protein")
37 | data("tonsil_reference_profile")
38 | bg <- Matrix::rowMeans(tonsil_protein$neg)
39 | proteins <- intersect(dimnames(tonsil_protein$counts)[[2]],
40 |                       dimnames(tonsil_reference_profile$mean.ref.profile)[[1]])
41 | mat <- tonsil_protein$counts[, proteins]
42 | x <- tonsil_reference_profile$mean.ref.profile[proteins, 1, drop = FALSE]
43 | xsd <- tonsil_reference_profile$SDs.ref.profile[proteins, 1, drop = FALSE]
44 | 
45 | 
46 | test_that("Rcpp calculation is same as stats package for protein data type", {
47 |   bgsub <- pmax(sweep(mat, 1, bg, "-"), 0)
48 |   s <- Matrix::rowSums(bgsub) / sum(x)
49 |   # cells whose background-subtracted total is not positive use raw totals
50 |   s[s <= 0] <- Matrix::rowSums(mat[s <= 0, , drop = FALSE]) / sum(x)
51 |   result <- lldist(mat = as.matrix(mat),
52 |                    assay_type = "Protein",
53 |                    x = x,
54 |                    xsd = xsd,
55 |                    bg = bg,
56 |                    size = 10)
57 |   names(result) <- rownames(mat)
58 | 
59 |   # reference: per-cell sum of Gaussian log densities
60 |   yhat <- s %*% t(x)
61 |   ysd <- s %*% t(xsd)
62 |   lls <- stats::dnorm(x = as.matrix(mat), sd = ysd, mean = yhat, log = TRUE)
63 |   result_ref <- round(rowSums(lls), digits = 2)
64 | 
65 |   expect_equal(result[, 1], result_ref)
66 | })
67 | 


--------------------------------------------------------------------------------
/tests/testthat/test_flightpath.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # load example datasets shipped with the package:
 3 | data("ioprofiles")
 4 | data("iocolors")
 5 | data("mini_nsclc")
 6 | 
 7 | 
 8 | # run unsupervised clustering with several random starts:
 9 | res <- insitutype(x = mini_nsclc$counts,
10 |                   neg = Matrix::rowMeans(mini_nsclc$neg),
11 |                   bg = NULL,
12 |                   init_clust = NULL, n_clusts = 6,
13 |                   anchors = NULL,
14 |                   nb_size = 10,
15 |                   n_starts = 2,
16 |                   align_genes = TRUE,
17 |                   sketchingdata = NULL,
18 |                   n_benchmark_cells = 100,
19 |                   n_phase1 = 50,
20 |                   n_phase2 = 100,
21 |                   n_phase3 = 200,
22 |                   n_chooseclusternumber = 100,
23 |                   pct_drop = 1/10000,
24 |                   min_prob_increase = 0.05,
25 |                   max_iters = 2,
26 |                   assay_type="RNA")
27 | 
28 | 
29 | # test flightpath_layout
30 | fp <- flightpath_layout(probs = NULL, logliks = res$logliks, profiles = res$profiles)
31 | 
32 | test_that("flightpath_layout returns correct format", {
33 |   # compare dim() directly: all(dim(x) == ...) is vacuously TRUE when dim(x) is NULL
34 |   expect_equal(dim(fp$clustpos), c(6L, 2L))
35 |   expect_equal(dim(fp$cellpos), c(nrow(res$logliks), 2L))
36 |   expect_false(anyNA(fp$clustpos))
37 |   expect_false(anyNA(fp$cellpos))
38 | })
39 | 
40 | 
41 | # test flightpath_plot from flightpath results
42 | p <- flightpath_plot(flightpath_result = fp)
43 | test_that("flightpath_plot returns a ggplot object", {
44 |   # exact class match instead of a substring match on the class names
45 |   expect_true(inherits(p, c("gg", "ggplot")))
46 | })
47 | 
48 | 
49 | # test flightpath_plot from insitutype results
50 | p <- flightpath_plot(insitutype_result = res)
51 | test_that("flightpath_plot returns a ggplot object", {
52 |   expect_true(inherits(p, c("gg", "ggplot")))
53 | })
54 | 
55 | # test flightpath_plot showing meanconfidence
56 | p <- flightpath_plot(insitutype_result = res, showclusterconfidence = TRUE)
57 | test_that("flightpath_plot returns a ggplot object when showclusterconfidence = TRUE", {
58 |   expect_true(inherits(p, c("gg", "ggplot")))
59 | })
60 | 


--------------------------------------------------------------------------------
/tests/testthat/test_getProfiles.R:
--------------------------------------------------------------------------------
 1 | # example datasets shipped with the package
 2 | data("ioprofiles")
 3 | data("iocolors")
 4 | data("mini_nsclc")
 5 | 
 6 | # random three-way cluster assignment, one label per row of the counts
 7 | initclust <- sample(c("a","b","c"), nrow(mini_nsclc$counts), replace = TRUE)
 8 | 
 9 | test_that("getRNAprofiles worked", {
10 |   profiles <- getRNAprofiles(x = mini_nsclc$counts, neg = 0, clust = initclust)
11 |   clusters_in <- sort(unique(initclust))
12 |   # one row per column of the counts, one column per input cluster
13 |   expect_identical(rownames(profiles), colnames(mini_nsclc$counts))
14 |   expect_identical(sort(colnames(profiles)), clusters_in)
15 | })
16 | 
17 | 
18 | test_that("getproteinparameters worked", {
19 |   params <- getProteinParameters(x = mini_nsclc$counts, clust = initclust)
20 |   clusters_in <- sort(unique(initclust))
21 |   # both the profiles and the sds follow the same row/column layout
22 |   expect_identical(rownames(params$profiles), colnames(mini_nsclc$counts))
23 |   expect_identical(rownames(params$sds), colnames(mini_nsclc$counts))
24 |   expect_identical(sort(colnames(params$profiles)), clusters_in)
25 |   expect_identical(sort(colnames(params$sds)), clusters_in)
26 | })
27 | 


--------------------------------------------------------------------------------
/tests/testthat/test_getSpatialContext.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # load the example dataset used below:
 3 | data("mini_nsclc")
 4 | 
 5 | 
 6 | test_that("getNeighborhood expression works under diverse settings", {
 7 |   counts <- mini_nsclc$counts
 8 |   xy <- cbind(mini_nsclc$x, mini_nsclc$y)
 9 | 
10 |   # with N = 50 the result has the same dimensions as the counts
11 |   by_count <- getSpatialContext(counts = counts, xy = xy, N = 50)
12 |   expect_equal(dim(by_count), dim(counts))
13 | 
14 |   # with rad = 0.1 and dim_reduce_to = 20 the result has 20 columns
15 |   by_radius <- getSpatialContext(counts = counts, xy = xy, rad = 0.1, dim_reduce_to = 20)
16 |   expect_equal(dim(by_radius), c(nrow(counts), 20))
17 | })
18 | 


--------------------------------------------------------------------------------
/tests/testthat/test_refinecells_cell_merging_logic.R:
--------------------------------------------------------------------------------
 1 | # example logliks: two cells scored against three original clusters
 2 | logliks <- rbind(cell1 = c(old_a = -3, old_b = -2, old_c = -1),
 3 |                  cell2 = c(old_a = -3, old_b = -1, old_c = -2))
 4 | 
 5 | # define merges: old_a and old_b both map to new1, old_c maps to itself
 6 | merges <- c(old_a = "new1", old_b = "new1", old_c = "old_c")
 7 | 
 8 | # run:
 9 | res <- refineClusters(merges = merges, logliks = logliks)
10 | 
11 | # confirm it works:
12 | test_that("new cluster names are right", {
13 |   expected_names <- c("new1", "old_c.new")
14 |   expect_equal(colnames(res$logliks), expected_names)
15 | })
16 | 
17 | test_that("new cluster assignments are right", {
18 |   expected_clust <- c(cell1 = "old_c.new", cell2 = "new1")
19 |   expect_equal(res$clust, expected_clust)
20 | })
21 | 
22 | test_that("probabilities are right", {
23 |   expected_new1 <- c(cell1 = -2, cell2 = -1)
24 |   # NOTE(review): tolerance = 2 is very loose; confirm it is intended
25 |   expect_equal(res$logliks[, 1], expected_new1, tolerance = 2)
26 | })
27 | 


--------------------------------------------------------------------------------
/tests/testthat/test_spatialUpdate.R:
--------------------------------------------------------------------------------
 1 | # example datasets shipped with the package
 2 | data("ioprofiles")
 3 | data("iocolors")
 4 | data("mini_nsclc")
 5 | 
 6 | # random three-way starting cell types, one label per row of the counts
 7 | initclust <- sample(c("a","b","c"), nrow(mini_nsclc$counts), replace = TRUE)
 8 | 
 9 | updatedclust <- spatialUpdate(
10 |   celltype = initclust,
11 |   counts = mini_nsclc$counts,
12 |   neg = Matrix::rowMeans(mini_nsclc$neg),
13 |   cohort = NULL, altdata = NULL,
14 |   xy = cbind(mini_nsclc$x, mini_nsclc$y),
15 |   tissue = NULL,
16 |   nb_size = 10, assay_type = "rna"
17 | )
18 | 
19 | test_that("spatialUpdate worked", {
20 |   # every one of these elements must be present in the returned list
21 |   expected_fields <- c( "clust","prob","profiles","sds","logliks","logliks_from_lost_celltypes")
22 |   expect_true(all(expected_fields %in% names(updatedclust)))
23 | })
24 | 


--------------------------------------------------------------------------------