├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── Tutorial_scaleSC.ipynb
├── docs
│   ├── 404.html
│   ├── api-docs
│   │   ├── harmonypy_gpu
│   │   │   └── index.html
│   │   ├── index.html
│   │   ├── kernels
│   │   │   └── index.html
│   │   ├── pp
│   │   │   └── index.html
│   │   ├── trim_merge_marker
│   │   │   └── index.html
│   │   └── util
│   │       └── index.html
│   ├── css
│   │   ├── base.css
│   │   ├── bootstrap.min.css
│   │   ├── bootstrap.min.css.map
│   │   ├── brands.min.css
│   │   ├── extra.css
│   │   ├── fontawesome.min.css
│   │   ├── solid.min.css
│   │   └── v4-font-face.min.css
│   ├── img
│   │   ├── favicon.ico
│   │   ├── grid.png
│   │   ├── pipeline.png
│   │   └── time_comp.png
│   ├── index.html
│   ├── js
│   │   ├── base.js
│   │   ├── bootstrap.bundle.min.js
│   │   ├── bootstrap.bundle.min.js.map
│   │   └── darkmode.js
│   ├── search
│   │   ├── lunr.js
│   │   ├── main.js
│   │   ├── search_index.json
│   │   └── worker.js
│   ├── sitemap.xml
│   ├── sitemap.xml.gz
│   └── webfonts
│       ├── fa-brands-400.ttf
│       ├── fa-brands-400.woff2
│       ├── fa-regular-400.ttf
│       ├── fa-regular-400.woff2
│       ├── fa-solid-900.ttf
│       ├── fa-solid-900.woff2
│       ├── fa-v4compatibility.ttf
│       └── fa-v4compatibility.woff2
├── img
│   ├── pipeline.png
│   ├── scalesc_overview.png
│   ├── scalesc_pipeline.png
│   └── time_comp.png
├── pyproject.toml
└── scalesc
    ├── __init__.py
    ├── harmonypy_gpu.py
    ├── kernels.py
    ├── pp.py
    ├── trim_merge_marker.py
    └── util.py

-------------------------------------------------------------------------------- /.gitattributes:
*.py linguist-language=Python
*.ipynb linguist-language=Python

-------------------------------------------------------------------------------- /.gitignore:
*/__pycache__/
build/
dist/
scalesc.egg-info/
html/
test*/
.ipynb_checkpoints/
Test.ipynb

-------------------------------------------------------------------------------- /LICENSE:
MIT License

Copyright (c) 2024 Haotian Zhang

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

-------------------------------------------------------------------------------- /README.md:
A GPU-accelerated tool for large-scale scRNA-seq pipelines.

Highlights • Why ScaleSC • Installation • Tutorial • API Reference

## Highlights

- Fast scRNA-seq pipeline including QC, normalization, batch-effect removal, and dimension reduction, in a ***similar syntax*** to `scanpy` and `rapids-singlecell`.
- Scales to datasets with more than ***10M cells*** on a ***single*** GPU (A100 80G).
- Chunks the data to avoid the ***`int32` limitation*** in `cupyx.scipy.sparse` (used by `rapids-singlecell`) that blocks computation for moderate-size datasets (~1.3M cells) without multi-GPU support.
- Reconciles the output at each step with ***`scanpy`*** to reproduce the ***same*** results as on the CPU end.
- Improves ***`harmonypy`*** so that datasets with more than ***10M cells*** and more than ***1000 samples*** can be run on a single GPU.
- Speeds up and optimizes the ***`NSForest`*** algorithm on GPU for ***better*** marker gene identification.
- ***Merges*** clusters according to the gene expression of markers detected by `NSForest`.

## Why ScaleSC

-------------------------------------------------------------------------------- /docs/404.html:
404 - Page not found

-------------------------------------------------------------------------------- /docs/api-docs/harmonypy_gpu/index.html:

# Module `harmonypy_gpu`
## Functions

- `get_usage(s)`
- `to_csr_cuda(x, dtype)`: Move to GPU as a `csr_matrix`.
- `to_csc_cuda(x, dtype)`: Move to GPU as a `csc_matrix`, speed up column slice.
- `get_dummies(x)`: Return a sparse dummy matrix.
- `run_harmony(data_mat: ndarray, meta_data: DataFrame, vars_use, init_seeds=None, theta=None, lamb=None, sigma=0.1, nclust=None, tau=0, block_size=0.05, max_iter_harmony=10, max_iter_kmeans=20, epsilon_cluster=1e-05, epsilon_harmony=0.0001, plot_convergence=False, verbose=True, reference_values=None, cluster_prior=None, n_init=1, random_state=0, dtype=numpy.float32)`: Run Harmony.
- `safe_entropy(x: array)`
- `moe_correct_ridge(Z_orig, Z_cos, Z_corr, R, W, K, Phi_Rk, Phi_moe, lamb)`

## Class `Harmony`

- `__init__(Z, init_seeds, n_init, Phi, Phi_moe, Pr_b, sigma, theta, max_iter_harmony, max_iter_kmeans, epsilon_kmeans, epsilon_harmony, K, block_size, lamb, verbose, random_state, dtype)`
- `allocate_buffers()`
- `check_convergence(i_type)`
- `cluster()`
- `compute_objective()`
- `harmonize(iter_harmony=10, verbose=True)`
- `init_cluster()`
- `kmeans_multirestart()`
- `result()`
- `update_R()`

*This file was automatically generated via lazydocs.*
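The listing above mirrors the upstream `harmonypy` entry point, so a call looks much like CPU Harmony. Below is a minimal, hedged sketch: `adata`, the `sample_id` column, and the way the corrected embedding is read back are illustrative assumptions, not part of the documented API.

```python
# Hedged sketch of calling the GPU Harmony port; `adata` and "sample_id" are
# placeholders, and how the corrected embedding is retrieved is an assumption.
import numpy as np
from scalesc import harmonypy_gpu

Z = np.asarray(adata.obsm["X_pca"], dtype=np.float32)  # cells x PCs, from a prior PCA step
meta = adata.obs[["sample_id"]]                        # batch covariate(s) to correct for

ho = harmonypy_gpu.run_harmony(
    data_mat=Z,
    meta_data=meta,
    vars_use=["sample_id"],   # column(s) in `meta` to correct for
    max_iter_harmony=10,
    random_state=0,
    dtype=np.float32,
)
# Upstream harmonypy returns the fitted Harmony object; assuming the same here,
# the corrected embedding presumably comes from `result()` (orientation may differ).
corrected = ho.result()
```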
-------------------------------------------------------------------------------- /docs/api-docs/index.html:

# API Overview

## Modules

- `harmonypy_gpu`
- `kernels`
- `pp`
- `trim_merge_marker`
- `util`

## Classes

- `harmonypy_gpu.Harmony`
- `pp.ScaleSC`: ScaleSC integrated pipeline in a scanpy-like style.
- `trim_merge_marker.UF`
- `trim_merge_marker.data2UF`
- `util.AnnDataBatchReader`: Chunked dataloader for extremely large single-cell datasets; returns one data chunk at a time for further processing.

## Functions

- `harmonypy_gpu.get_dummies`: Return a sparse dummy matrix.
- `harmonypy_gpu.get_usage`
- `harmonypy_gpu.moe_correct_ridge`
- `harmonypy_gpu.run_harmony`: Run Harmony.
- `harmonypy_gpu.safe_entropy`
- `harmonypy_gpu.to_csc_cuda`: Move to GPU as a csc_matrix, speed up column slice.
- `harmonypy_gpu.to_csr_cuda`: Move to GPU as a csr_matrix.
- `kernels.get_find_indices`
- `kernels.get_mean_var_major`
- `kernels.get_mean_var_minor`
- `trim_merge_marker.X_to_GPU`: Transfers matrices and arrays to the GPU.
- `trim_merge_marker.adata_cluster_merge`: Need a description.
- `trim_merge_marker.find_cluster_pairs_to_merge`
- `trim_merge_marker.find_markers`
- `trim_merge_marker.fraction_cells`: Given adata.X (n cells * m genes), ctype_col (a column name in adata.obs that stores the cell type annotation), and a glist (for example, [gene1, gene2, ..., genek]).
- `trim_merge_marker.marker_filter_sort`
- `trim_merge_marker.myNSForest`
- `trim_merge_marker.specificity_score`
- `trim_merge_marker.stds`: Variance of sparse matrix a.
- `trim_merge_marker.timer`
- `trim_merge_marker.wrapper` (several timing-decorator wrappers)
- `util.check_dtype`: Convert dtype to `float32` or `float64`.
- `util.check_nonnegative_integers`: Check if `X` is a nonnegative integer matrix.
- `util.correct_leiden`
- `util.csr_col_index`
- `util.csr_indptr_to_coo_rows`
- `util.csr_row_index`: Populate indices and data arrays from the given row index.
- `util.filter_cells`: Cell filtering according to min and max gene counts.
- `util.find_indices`
- `util.gc`: Release CPU and GPU RAM.
- `util.get_mean_var`: Calculate mean and variance of a given matrix based on customized kernels.
- `util.harmony`: Harmony, GPU version.
- `util.svd_flip`: Flip the signs of loadings according to sign(max(abs(loadings))).
- `util.write_to_disk`

*This file was automatically generated via lazydocs.*
-------------------------------------------------------------------------------- /docs/api-docs/kernels/index.html:

# Module `kernels`

## Functions

- `get_mean_var_major(dtype)`
- `get_mean_var_minor(dtype)`
- `get_find_indices()`

*This file was automatically generated via lazydocs.*
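The two `get_mean_var_*` factories back `util.get_mean_var` ("mean and variance of a given matrix based on customized kernels"). The math reduces to running sums and sums of squares; below is a CPU reference sketch of that computation. The split into "major" and "minor" kernels is assumed to correspond to traversal along the sparse matrix's major vs. minor axis and is not documented here.

```python
# CPU reference for the mean/variance computed by the custom GPU kernels:
# accumulate sum and sum of squares, then var = E[x^2] - E[x]^2.
import numpy as np
import scipy.sparse as sp

def mean_var_reference(x: sp.csr_matrix, axis: int = 0):
    """Per-column (axis=0) or per-row (axis=1) mean and variance of a sparse matrix."""
    n = x.shape[axis]                                       # number of observations reduced over
    s1 = np.asarray(x.sum(axis=axis)).ravel()               # sum of values
    s2 = np.asarray(x.multiply(x).sum(axis=axis)).ravel()   # sum of squared values
    mean = s1 / n
    var = s2 / n - mean**2
    return mean, var

x = sp.random(1_000, 200, density=0.05, format="csr", random_state=0)
gene_mean, gene_var = mean_var_reference(x, axis=0)         # per-gene stats if columns are genes
```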
-------------------------------------------------------------------------------- /docs/api-docs/trim_merge_marker/index.html:

# Module `trim_merge_marker`

## Functions

- `timer(func)`: Timing decorator; the repeated `wrapper(*args, **kwargs)` entries generated by lazydocs for this module are its inner wrappers.
- `X_to_GPU(X)`: Transfers matrices and arrays to the GPU. Args: `X`: matrix or array to transfer to the GPU.
- `marker_filter_sort(markers, cluster, df_sp, df_frac)`
- `find_markers(adata, subctype_col)`
- `stds(x, axis=None)`: Variance of sparse matrix a: var = mean(a^2) - mean(a)^2. Standard deviation of sparse matrix a: std = sqrt(var(a)).
- `find_cluster_pairs_to_merge(adata, x, colname, cluster, markers)`
- `adata_cluster_merge(adata, subctype_col)`: Need a description.
- `specificity_score(adata=None, ctype_col: str = None, glist: list = None)`
- `fraction_cells(adata=None, ctype_col: str = None, glist: list = None)`: Given `adata.X` (n cells * m genes), `ctype_col` (a column name in `adata.obs` that stores the cell type annotation), and a `glist` (for example, [gene1, gene2, ..., genek]). The definition of fraction of expression is: # cells with expression > 0 / # total cells. Assuming c different cell types in total, the adata is subset for each cell type and the fraction of expression of each gene is calculated; the returned fraction dataframe has k rows and c columns.
- `myNSForest(adata, cluster_header, cluster_list=None, medians_header=None, n_trees=100, n_jobs=-1, beta=0.5, n_top_genes=15, n_binary_genes=10, n_genes_eval=6, output_folder='.', save_results=False)`

## Class `UF`

- `__init__(n)`
- `current_kids_dict()`
- `final()`
- `find(x)`
- `union(x, y)`

## Class `data2UF`

- `__init__(celltypes: list, merge_pairs: list[tuple])`
- `union_pairs() → int`

*This file was automatically generated via lazydocs.*
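Two of the docstrings above pin down concrete formulas: `fraction_cells` (fraction of expressing cells per gene and cell type) and `stds` (standard deviation via the mean of squares). The sketch below restates both on plain scipy/pandas objects; the function names and arguments are illustrative only, not the module's actual signatures, which take `adata` and `ctype_col` as listed above.

```python
# Reference sketch of the documented formulas; illustrative names, not the real API.
import numpy as np
import pandas as pd
import scipy.sparse as sp

def fraction_cells_reference(X: sp.csr_matrix, cell_types: pd.Series,
                             genes: list, var_names: list) -> pd.DataFrame:
    """Fraction of expression per gene (rows) and cell type (columns): # cells > 0 / # total cells."""
    cols = [var_names.index(g) for g in genes]
    frac = {}
    for ct in pd.unique(cell_types):
        sub = X[(cell_types == ct).to_numpy()][:, cols]      # subset cells of this type, keep genes of interest
        frac[ct] = np.asarray((sub > 0).sum(axis=0)).ravel() / sub.shape[0]
    return pd.DataFrame(frac, index=genes)                   # k rows (genes) x c columns (cell types)

def stds_reference(x: sp.csr_matrix, axis: int = 0) -> np.ndarray:
    """std = sqrt(mean(x^2) - mean(x)^2), matching the `stds` docstring above."""
    mean = np.asarray(x.mean(axis=axis)).ravel()
    mean_sq = np.asarray(x.multiply(x).mean(axis=axis)).ravel()
    return np.sqrt(np.maximum(mean_sq - mean**2, 0))         # clamp tiny negatives from round-off
```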
-------------------------------------------------------------------------------- /docs/index.html:

A GPU-accelerated tool for large-scale scRNA-seq pipelines.

Highlights • Why ScaleSC • Installation • API Reference

## Highlights

- Fast scRNA-seq pipeline including QC, normalization, batch-effect removal, and dimension reduction, in a similar syntax to `scanpy` and `rapids-singlecell`.
- Scales to datasets with more than 10M cells on a single GPU.
- Chunks the data to avoid the `int32` limitation in `cupyx.scipy.sparse` (used by `rapids-singlecell`) that disables computation for even moderate-size datasets (~1M cells).
- Reconciles the output at each step with `scanpy` to reproduce the same results as on the CPU end.
- Improves `harmonypy` so that datasets with more than 10M cells and more than 1000 samples can be run on a single GPU (A100 80G).

## Why ScaleSC

|                          | scanpy | scalesc | rapids-singlecell |
|--------------------------|--------|---------|-------------------|
| GPU Support              | ❌     | ✅      | ✅                |
| `int32` Issue            | ❌     | ❌      | ✅                |
| Upper Limit of # Cells   | ♾️     | ~20M    | ~1M               |
| Upper Limit of # Samples | ♾️     | >1000   | <100              |

## Installation

> Note: ScaleSC requires a high-end GPU (> 24G VRAM) and a matching CUDA version to support GPU-accelerated computing.

Requirements:

Environment Setup:

1. Install RAPIDS through Conda:
   `conda create -n scalesc -c rapidsai -c conda-forge -c nvidia rapids=24.10 python=3.10 'cuda-version>=11.4,<=11.8'`
   Users have the flexibility to install it according to their system by using this online selector.
2. Activate the conda env:
   `conda activate scalesc`
3. Install rapids-singlecell using pip:
   `pip install rapids-singlecell`
4. Install scaleSC:
   `git clone https://github.com/interactivereport/scaleSC.git`
   `cd scaleSC`
   `pip install .`
5. Verify the installation:
   - `python -c "import scalesc; print(scalesc.__version__)"` == 0.1.0
   - `python -c "import cupy; print(cupy.__version__)"` >= 13.3.0
   - `python -c "import cuml; print(cuml.__version__)"` >= 24.10
   - `python -c "import cupy; print(cupy.cuda.is_available())"` = True
   - `python -c "import xgboost; print(xgboost.__version__)"` >= 2.1.1, optional, for marker annotation
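Once the checks above pass, the package should import cleanly. A minimal, hedged smoke test is shown below; the commented-out `ScaleSC` constructor call is a placeholder assumption, not the documented signature (see `Tutorial_scaleSC.ipynb` and the API reference for the real interface).

```python
# Post-install smoke test. `pp.ScaleSC` is documented as the integrated,
# scanpy-like pipeline class; its constructor arguments are not shown here,
# so the call below is left commented out as a placeholder.
import scalesc
print(scalesc.__version__)            # expected: 0.1.0

import cupy
assert cupy.cuda.is_available()       # the pipeline needs a working CUDA device

from scalesc.pp import ScaleSC        # integrated pipeline, scanpy-like style
# pipe = ScaleSC("path/to/your/dataset")   # placeholder: consult the tutorial for real arguments
```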
' + noResultsText + '
'); 52 | } 53 | } 54 | 55 | function doSearch () { 56 | var query = document.getElementById('mkdocs-search-query').value; 57 | if (query.length > min_search_length) { 58 | if (!window.Worker) { 59 | displayResults(search(query)); 60 | } else { 61 | searchWorker.postMessage({query: query}); 62 | } 63 | } else { 64 | // Clear results for short queries 65 | displayResults([]); 66 | } 67 | } 68 | 69 | function initSearch () { 70 | var search_input = document.getElementById('mkdocs-search-query'); 71 | if (search_input) { 72 | search_input.addEventListener("keyup", doSearch); 73 | } 74 | var term = getSearchTermFromLocation(); 75 | if (term) { 76 | search_input.value = term; 77 | doSearch(); 78 | } 79 | } 80 | 81 | function onWorkerMessage (e) { 82 | if (e.data.allowSearch) { 83 | initSearch(); 84 | } else if (e.data.results) { 85 | var results = e.data.results; 86 | displayResults(results); 87 | } else if (e.data.config) { 88 | min_search_length = e.data.config.min_search_length-1; 89 | } 90 | } 91 | 92 | if (!window.Worker) { 93 | console.log('Web Worker API not supported'); 94 | // load index in main thread 95 | $.getScript(joinUrl(base_url, "search/worker.js")).done(function () { 96 | console.log('Loaded worker'); 97 | init(); 98 | window.postMessage = function (msg) { 99 | onWorkerMessage({data: msg}); 100 | }; 101 | }).fail(function (jqxhr, settings, exception) { 102 | console.error('Could not load worker.js'); 103 | }); 104 | } else { 105 | // Wrap search in a web worker 106 | var searchWorker = new Worker(joinUrl(base_url, "search/worker.js")); 107 | searchWorker.postMessage({init: true}); 108 | searchWorker.onmessage = onWorkerMessage; 109 | } 110 | -------------------------------------------------------------------------------- /docs/search/worker.js: -------------------------------------------------------------------------------- 1 | var base_path = 'function' === typeof importScripts ? '.' 
: '/search/'; 2 | var allowSearch = false; 3 | var index; 4 | var documents = {}; 5 | var lang = ['en']; 6 | var data; 7 | 8 | function getScript(script, callback) { 9 | console.log('Loading script: ' + script); 10 | $.getScript(base_path + script).done(function () { 11 | callback(); 12 | }).fail(function (jqxhr, settings, exception) { 13 | console.log('Error: ' + exception); 14 | }); 15 | } 16 | 17 | function getScriptsInOrder(scripts, callback) { 18 | if (scripts.length === 0) { 19 | callback(); 20 | return; 21 | } 22 | getScript(scripts[0], function() { 23 | getScriptsInOrder(scripts.slice(1), callback); 24 | }); 25 | } 26 | 27 | function loadScripts(urls, callback) { 28 | if( 'function' === typeof importScripts ) { 29 | importScripts.apply(null, urls); 30 | callback(); 31 | } else { 32 | getScriptsInOrder(urls, callback); 33 | } 34 | } 35 | 36 | function onJSONLoaded () { 37 | data = JSON.parse(this.responseText); 38 | var scriptsToLoad = ['lunr.js']; 39 | if (data.config && data.config.lang && data.config.lang.length) { 40 | lang = data.config.lang; 41 | } 42 | if (lang.length > 1 || lang[0] !== "en") { 43 | scriptsToLoad.push('lunr.stemmer.support.js'); 44 | if (lang.length > 1) { 45 | scriptsToLoad.push('lunr.multi.js'); 46 | } 47 | if (lang.includes("ja") || lang.includes("jp")) { 48 | scriptsToLoad.push('tinyseg.js'); 49 | } 50 | for (var i=0; i < lang.length; i++) { 51 | if (lang[i] != 'en') { 52 | scriptsToLoad.push(['lunr', lang[i], 'js'].join('.')); 53 | } 54 | } 55 | } 56 | loadScripts(scriptsToLoad, onScriptsLoaded); 57 | } 58 | 59 | function onScriptsLoaded () { 60 | console.log('All search scripts loaded, building Lunr index...'); 61 | if (data.config && data.config.separator && data.config.separator.length) { 62 | lunr.tokenizer.separator = new RegExp(data.config.separator); 63 | } 64 | 65 | if (data.index) { 66 | index = lunr.Index.load(data.index); 67 | data.docs.forEach(function (doc) { 68 | documents[doc.location] = doc; 69 | }); 70 | console.log('Lunr pre-built index loaded, search ready'); 71 | } else { 72 | index = lunr(function () { 73 | if (lang.length === 1 && lang[0] !== "en" && lunr[lang[0]]) { 74 | this.use(lunr[lang[0]]); 75 | } else if (lang.length > 1) { 76 | this.use(lunr.multiLanguage.apply(null, lang)); // spread operator not supported in all browsers: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_operator#Browser_compatibility 77 | } 78 | this.field('title'); 79 | this.field('text'); 80 | this.ref('location'); 81 | 82 | for (var i=0; i < data.docs.length; i++) { 83 | var doc = data.docs[i]; 84 | this.add(doc); 85 | documents[doc.location] = doc; 86 | } 87 | }); 88 | console.log('Lunr index built, search ready'); 89 | } 90 | allowSearch = true; 91 | postMessage({config: data.config}); 92 | postMessage({allowSearch: allowSearch}); 93 | } 94 | 95 | function init () { 96 | var oReq = new XMLHttpRequest(); 97 | oReq.addEventListener("load", onJSONLoaded); 98 | var index_path = base_path + '/search_index.json'; 99 | if( 'function' === typeof importScripts ){ 100 | index_path = 'search_index.json'; 101 | } 102 | oReq.open("GET", index_path); 103 | oReq.send(); 104 | } 105 | 106 | function search (query) { 107 | if (!allowSearch) { 108 | console.error('Assets for search still loading'); 109 | return; 110 | } 111 | 112 | var resultDocuments = []; 113 | var results = index.search(query); 114 | for (var i=0; i < results.length; i++){ 115 | var result = results[i]; 116 | doc = documents[result.ref]; 117 | doc.summary = 
doc.text.substring(0, 200); 118 | resultDocuments.push(doc); 119 | } 120 | return resultDocuments; 121 | } 122 | 123 | if( 'function' === typeof importScripts ) { 124 | onmessage = function (e) { 125 | if (e.data.init) { 126 | init(); 127 | } else if (e.data.query) { 128 | postMessage({ results: search(e.data.query) }); 129 | } else { 130 | console.error("Worker - Unrecognized message: " + e); 131 | } 132 | }; 133 | } 134 | -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 |