├── README.md ├── SCTransform.py ├── SCTransform_Example.ipynb └── setup.py /README.md: -------------------------------------------------------------------------------- 1 | # SCTransformPy 2 | 3 | 4 | This is a Python port of the R package [SCTransform](https://github.com/ChristophH/sctransform). 5 | 6 | Currently, I only use log UMI counts as a single latent variable (the default in the R package). I'm planning on allowing the user to define custom regression models as is done in the R implementation. 7 | 8 | Implementation notes: 9 | - Poisson regression is done using the `statsmodels` package and parallelized with `multiprocessing`. 10 | - Improved Sheather & Jones bandwidth calculation is implemented by the `KDEpy` package. 11 | - Estimating `theta` using MLE was translated from the `theta.ml` function in R. 12 | - Pearson residuals are automatically clipped to be in the range `[0, sqrt(N/30)]` where `N` is the number of cells. This ensures that sparsity structure is preserved in the data. Practically, the results do not change much when allowing for dense, negative values. 13 | 14 | ## TODO 15 | - Provide a comparison between the Python and R implementations here to show that results are highly similar. 16 | - Clean up code and prepare for integration with `scanpy`. 
"""Python port of the R ``sctransform`` variance-stabilising transformation.

The public entry point is :func:`SCTransform`; everything else in this module
is a helper used by it.
"""
import os
from multiprocessing import Manager, Pool

import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse  # make sure ``sp.sparse`` is loaded on every SciPy version
import statsmodels.discrete.discrete_model
import statsmodels.nonparametric.kernel_regression
from anndata import AnnData
from KDEpy import FFTKDE
from scipy import stats
from scipy.special import polygamma, psi

_EPS = np.finfo(float).eps


def robust_scale_binned(y, x, breaks):
    """Return robust z-scores of ``y`` computed separately within bins of ``x``.

    ``x`` is binned with ``np.digitize(x, breaks)``.  Inside every occupied bin
    the values of ``y`` are centred on the bin median and divided by
    ``1.4826 * MAD`` (plus machine epsilon, so a zero MAD does not divide by
    zero).

    Parameters
    ----------
    y : numpy.ndarray
        Values to scale.
    x : numpy.ndarray
        Covariate used for binning; same length as ``y``.
    breaks : array-like
        Bin edges handed to ``np.digitize``.

    Returns
    -------
    numpy.ndarray
        Scaled values, same length as ``y``.
    """
    bins = np.digitize(x, breaks)
    res = np.zeros(bins.size)
    for b in np.unique(bins):
        in_bin = bins == b
        yb = y[in_bin]
        center = np.median(yb)
        mad = np.median(np.abs(yb - center))
        res[in_bin] = (yb - center) / (1.4826 * mad + _EPS)
    return res


def is_outlier(y, x, th=10):
    """Flag values of ``y`` that are extreme relative to neighbours along ``x``.

    Two overlapping binnings of ``x`` are built (the second shifted by half a
    bin width); ``y`` is robust-scaled in each and a point is reported as an
    outlier only if its absolute score exceeds ``th`` under *both* binnings.

    Parameters
    ----------
    y : numpy.ndarray
        Values to test.
    x : numpy.ndarray
        Covariate along which bins are formed.
    th : float, optional
        Threshold on the absolute robust z-score (default 10).

    Returns
    -------
    numpy.ndarray of bool
        ``True`` where ``y`` is an outlier.
    """
    kde = FFTKDE(kernel='gaussian', bw='ISJ').fit(x)
    # evaluate() is called before reading ``bw``, exactly as the original code
    # did -- presumably so the 'ISJ' rule is resolved to a number; TODO confirm.
    kde.evaluate()
    x_min, x_max = np.min(x), np.max(x)
    bin_width = (x_max - x_min) * kde.bw / 2
    eps = _EPS * 10

    breaks1 = np.arange(x_min, x_max + bin_width, bin_width)
    breaks2 = np.arange(x_min - eps - bin_width / 2, x_max + bin_width, bin_width)
    score1 = robust_scale_binned(y, x, breaks1)
    score2 = robust_scale_binned(y, x, breaks2)
    return np.abs(np.vstack((score1, score2))).min(0) > th


def _parallel_init(igenes_bin_regress, iumi_bin, ign, imm, ips):
    """Pool initializer: publish the per-bin regression inputs as globals.

    ``_parallel_wrapper`` reads these globals, so each worker only has to
    receive a column index per task.
    """
    global genes_bin_regress, umi_bin, gn, mm, ps
    genes_bin_regress = igenes_bin_regress
    umi_bin = iumi_bin
    gn = ign
    mm = imm
    ps = ips


def _parallel_wrapper(j):
    """Fit one gene (column ``j`` of the current bin) and store its parameters.

    A Poisson GLM of the counts on the design matrix ``mm`` is fitted, theta is
    estimated from the fitted means with :func:`theta_ml`, and
    ``[*coefficients, theta]`` is written to the shared dict ``ps`` under the
    gene name.
    """
    name = gn[genes_bin_regress[j]]
    # ``.toarray()`` instead of ``.A``: the ``.A`` shortcut is gone from
    # sparse matrices in recent SciPy releases.
    y = umi_bin[:, j].toarray().flatten()
    pr = statsmodels.discrete.discrete_model.Poisson(y, mm)
    res = pr.fit(disp=False)
    mu = res.predict()
    theta = theta_ml(y, mu)
    ps[name] = np.append(res.params, theta)


def gmean(X, axis=0, eps=1):
    """Geometric mean of a sparse matrix along ``axis`` with pseudocount ``eps``.

    Computes ``exp(mean(log(X + eps))) - eps``.  Entries that are not stored in
    the sparse structure count as zeros and therefore contribute ``log(eps)``
    to the mean (which is 0 for the default ``eps=1``).

    The input is never modified and is converted to floating point first, so
    integer count matrices do not have their logarithms truncated.

    Parameters
    ----------
    X : scipy.sparse matrix
        Input counts.
    axis : {0, 1}, optional
        Axis along which the mean is taken (default 0).
    eps : float, optional
        Pseudocount added before taking logs and removed afterwards.

    Returns
    -------
    numpy.ndarray
        1-D array of geometric means.
    """
    logged = X.astype(float)  # astype returns a copy
    logged.data = np.log(logged.data + eps)
    log_sum = np.asarray(logged.sum(axis)).ravel()

    n = X.shape[axis]
    n_implicit = n - np.asarray(X.getnnz(axis=axis)).ravel()
    with np.errstate(divide='ignore', invalid='ignore'):
        log_eps = np.log(eps)
        # Only add the implicit-zero term where there are implicit zeros, so
        # that ``0 * log(0)`` never turns a fully dense column into NaN.
        log_sum = log_sum + np.where(n_implicit > 0, n_implicit * log_eps, 0.0)
    return np.exp(log_sum / n) - eps


def theta_ml(y, mu):
    """Maximum-likelihood estimate of the negative-binomial ``theta``.

    Newton iterations on the score function, translated from R's
    ``theta.ml``: at most 9 updates, stopping early once the step is smaller
    than ``eps ** 0.25`` (machine epsilon).  All observations carry weight 1.

    Parameters
    ----------
    y : numpy.ndarray
        Observed counts.
    mu : numpy.ndarray
        Fitted means, same length as ``y``.

    Returns
    -------
    float
        Estimated theta, floored at 0.
    """
    n = y.size
    weights = np.ones(n)
    limit = 10
    eps = _EPS ** 0.25

    def score(th):
        # First derivative of the NB log-likelihood with respect to theta.
        return np.sum(weights * (psi(th + y) - psi(th) + np.log(th) + 1
                                 - np.log(th + mu) - (y + th) / (mu + th)))

    def info(th):
        # Negative second derivative (observed information).
        return np.sum(weights * (-polygamma(1, th + y) + polygamma(1, th)
                                 - 1 / th + 2 / (mu + th)
                                 - (y + th) / (mu + th) ** 2))

    # Moment-based starting value.
    t0 = n / np.sum(weights * (y / mu - 1) ** 2)
    it = 0
    de = 1
    while it + 1 < limit and abs(de) > eps:
        it += 1
        t0 = abs(t0)
        de = score(t0) / info(t0)
        t0 += de

    return max(t0, 0)


def _fit_step1_models(X_step1, genes_step1, gn, log_umi_step1, bin_size):
    """Fit the per-gene Poisson/theta models for the step-1 genes in parallel.

    Genes are processed in consecutive batches of ``bin_size``; each batch gets
    its own worker pool.  Returns a plain dict mapping gene name to
    ``[intercept, log_umi slope, theta]``.
    """
    bin_ind = np.ceil(np.arange(1, genes_step1.size + 1) / bin_size)
    max_bin = int(bin_ind.max())

    # Design matrix (intercept + log10 UMI); identical for every gene.
    mm = np.column_stack((np.ones(log_umi_step1.size), log_umi_step1))

    # os.cpu_count() may return None when the count cannot be determined.
    n_jobs = os.cpu_count() or 1

    # The context manager shuts the manager process down when we are done.
    with Manager() as manager:
        shared = manager.dict()
        for i in range(1, max_bin + 1):
            genes_bin_regress = genes_step1[bin_ind == i]
            umi_bin = X_step1[:, genes_bin_regress]

            pc_chunksize = umi_bin.shape[1] // n_jobs + 1
            pool = Pool(n_jobs, _parallel_init,
                        [genes_bin_regress, umi_bin, gn, mm, shared])
            try:
                pool.map(_parallel_wrapper, range(umi_bin.shape[1]),
                         chunksize=pc_chunksize)
            finally:
                pool.close()
                pool.join()
        # Copy out of the proxy before the manager goes away.
        return dict(shared.items())


def _store_sct_annotations(target, gn, full_model_pars, cell_attrs, model_pars,
                           genes_step1, genes_log_gmean):
    """Write model parameters and cell attributes into ``target.var``/``.obs``.

    Everything is assigned as a Series indexed by gene (or cell) name, so genes
    of ``target`` that were filtered out simply receive NaN.
    """
    for c in full_model_pars.columns:
        target.var[c + '_sct'] = full_model_pars[c]

    for c in cell_attrs.columns:
        target.obs[c + '_sct'] = cell_attrs[c]

    for c in model_pars.columns:
        target.var[c + '_step1_sct'] = model_pars[c]

    in_step1 = pd.Series(0, index=gn, dtype=int)
    in_step1[gn[genes_step1]] = 1
    target.var['genes_step1_sct'] = in_step1
    # Float Series: the geometric means are not integers.
    target.var['log10_gmean_sct'] = pd.Series(genes_log_gmean, index=gn)


def SCTransform(adata, min_cells=5, gmean_eps=1, n_genes=2000, n_cells=None,
                bin_size=500, bw_adjust=3, inplace=True):
    """
    This is a port of SCTransform from the Satija lab. See the R package for
    original documentation.

    Currently, only regression against the log UMI counts is supported.

    The only significant modification is that negative Pearson residuals are
    zero'd out to preserve the sparsity structure of the data.

    Parameters
    ----------
    adata : anndata.AnnData
        Object whose ``X`` holds the UMI counts (cells x genes).
    min_cells : int, optional
        Genes detected (non-zero) in fewer than this many cells are dropped.
    gmean_eps : float, optional
        Pseudocount used for the per-gene geometric mean.
    n_genes : int or None, optional
        Number of genes sampled for the step-1 regressions; sampling weights
        are inversely proportional to the density of the log geometric mean.
        ``None`` uses all genes.
    n_cells : int or None, optional
        Number of cells randomly subsampled for step 1; ``None`` uses all.
    bin_size : int, optional
        Number of genes handed to one worker pool at a time.
    bw_adjust : float, optional
        Multiplier applied to the ISJ bandwidth used by the kernel regression
        that smooths the step-1 parameters.
    inplace : bool, optional
        If true, ``adata`` is modified (its previous state is kept in
        ``adata.raw``) and nothing is returned; otherwise a new ``AnnData``
        restricted to the retained genes is returned.

    Returns
    -------
    anndata.AnnData or None
        The transformed object when ``inplace`` is false, else ``None``.
    """
    X = sp.sparse.csr_matrix(adata.X.copy())
    # Residuals are later written into X.data; with an integer matrix that
    # in-place write would silently truncate them.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(np.float64)
    X.eliminate_zeros()

    gn = np.array(list(adata.var_names))
    cn = np.array(list(adata.obs_names))

    # Number of cells in which each gene is detected (not the summed counts).
    genes_ix = np.where(X.getnnz(axis=0) >= min_cells)[0]
    X = X[:, genes_ix]
    gn = gn[genes_ix]
    genes = np.arange(X.shape[1])

    genes_log_gmean = np.log10(gmean(X, axis=0, eps=gmean_eps))

    # ---- choose the cells / genes used for the step-1 regressions ---------
    if n_cells is not None and n_cells < X.shape[0]:
        cells_step1 = np.sort(
            np.random.choice(X.shape[0], replace=False, size=n_cells))
        X_step1 = X[cells_step1, :]
        genes_step1 = np.where(X_step1.getnnz(axis=0) >= min_cells)[0]
        genes_log_gmean_step1 = np.log10(
            gmean(X_step1[:, genes_step1], axis=0, eps=gmean_eps))
    else:
        cells_step1 = np.arange(X.shape[0])
        X_step1 = X
        genes_step1 = genes
        genes_log_gmean_step1 = genes_log_gmean

    # ---- per-cell attributes ---------------------------------------------
    umi = np.asarray(X.sum(1)).ravel()
    gene = X.getnnz(axis=1).astype(float)
    umi_per_gene = umi / gene
    cell_attrs = pd.DataFrame(
        {
            'umi': umi,
            'log_umi': np.log10(umi),
            'gene': gene,
            'log_gene': np.log10(gene),
            'umi_per_gene': umi_per_gene,
            'log_umi_per_gene': np.log10(umi_per_gene),
        },
        index=cn,
    )
    data_step1 = cell_attrs.iloc[cells_step1]

    if n_genes is not None and n_genes < len(genes_step1):
        # Sample genes with probability inversely proportional to the density
        # of their log geometric mean, so rare expression levels are covered.
        log_gmean_dens = stats.gaussian_kde(genes_log_gmean_step1,
                                            bw_method='scott')
        xlo = np.linspace(genes_log_gmean_step1.min(),
                          genes_log_gmean_step1.max(), 512)
        ylo = log_gmean_dens.evaluate(xlo)
        sampling_prob = 1 / (np.interp(genes_log_gmean_step1, xlo, ylo) + _EPS)
        genes_step1 = np.sort(np.random.choice(
            genes_step1, size=n_genes,
            p=sampling_prob / sampling_prob.sum(), replace=False))
        genes_log_gmean_step1 = np.log10(
            gmean(X_step1[:, genes_step1], eps=gmean_eps))

    # ---- step 1: per-gene regressions -------------------------------------
    ps = _fit_step1_models(X_step1, genes_step1, gn,
                           data_step1['log_umi'].values.flatten(), bin_size)

    model_pars = pd.DataFrame(
        data=np.vstack([ps[name] for name in gn[genes_step1]]),
        columns=['Intercept', 'log_umi', 'theta'],
        index=gn[genes_step1])

    # NOTE(review): the original text between the theta floor and the outlier
    # filter was garbled in the source this was taken from.  The statements up
    # to ``outliers`` are reconstructed from what the code below requires (a
    # 'dispersion' column replacing 'theta', and a boolean ``outliers`` vector
    # built with ``is_outlier``) -- confirm against the upstream file.
    min_theta = 1e-7
    theta_step1 = np.maximum(model_pars['theta'].values, min_theta)
    dispersion_par = np.log10(1 + 10 ** genes_log_gmean_step1 / theta_step1)
    model_pars = model_pars.drop(columns='theta')
    model_pars['dispersion'] = dispersion_par

    outliers = np.vstack([
        is_outlier(model_pars[c].values, genes_log_gmean_step1)
        for c in model_pars.columns
    ]).sum(0) > 0

    filt = np.invert(outliers)
    model_pars = model_pars[filt]
    genes_step1 = genes_step1[filt]
    genes_log_gmean_step1 = genes_log_gmean_step1[filt]

    # ---- step 2: smooth the parameters over gene expression ----------------
    kde = FFTKDE(kernel='gaussian', bw='ISJ').fit(genes_log_gmean_step1)
    kde.evaluate()
    bw = kde.bw * bw_adjust

    # Predict only inside the range covered by the step-1 genes.
    x_points = np.clip(genes_log_gmean,
                       genes_log_gmean_step1.min(),
                       genes_log_gmean_step1.max())

    full_model_pars = pd.DataFrame(
        data=np.zeros((x_points.size, model_pars.shape[1])),
        index=gn, columns=model_pars.columns)
    for c in model_pars.columns:
        kr = statsmodels.nonparametric.kernel_regression.KernelReg(
            model_pars[c].values, genes_log_gmean_step1[:, None], ['c'],
            reg_type='ll', bw=[bw])
        full_model_pars[c] = kr.fit(data_predict=x_points)[0]

    # Convert the smoothed dispersion back to theta.
    theta = 10 ** genes_log_gmean / (10 ** full_model_pars['dispersion'].values - 1)
    full_model_pars['theta'] = theta
    del full_model_pars['dispersion']

    # ---- step 3: Pearson residuals on the stored entries -------------------
    rows, cols = X.nonzero()
    intercept = full_model_pars['Intercept'].values
    slope = full_model_pars['log_umi'].values
    mud = np.exp(intercept[cols] + slope[cols] * cell_attrs['log_umi'].values[rows])
    vard = mud + mud ** 2 / theta[cols]

    X.data[:] = (X.data - mud) / np.sqrt(vard)
    X.data[X.data < 0] = 0
    X.eliminate_zeros()

    clip = np.sqrt(X.shape[0] / 30)
    X.data[X.data > clip] = clip

    if inplace:
        adata.raw = adata.copy()

        # Scatter the filtered columns back to their original positions.
        rows, cols = X.nonzero()
        Xnew = sp.sparse.coo_matrix((X.data, (rows, genes_ix[cols])),
                                    shape=adata.shape).tocsr()
        adata.X = Xnew  # TODO: add log1p of corrected umi counts to layers

        _store_sct_annotations(adata, gn, full_model_pars, cell_attrs,
                               model_pars, genes_step1, genes_log_gmean)
        return None

    adata_new = AnnData(X=X)
    adata_new.var_names = pd.Index(gn)
    adata_new.obs_names = adata.obs_names
    adata_new.raw = adata.copy()

    _store_sct_annotations(adata_new, gn, full_model_pars, cell_attrs,
                           model_pars, genes_step1, genes_log_gmean)
    return adata_new
"source": [ 67 | "%%time\n", 68 | "adata_sct = SCTransform(adata,\n", 69 | " min_cells=5,\n", 70 | " gmean_eps=1,\n", 71 | " n_genes=2000,\n", 72 | " n_cells=None, #use all cells\n", 73 | " bin_size=500,\n", 74 | " bw_adjust=3,\n", 75 | " inplace=False)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "sudden-density", 81 | "metadata": {}, 82 | "source": [ 83 | "## Plot the fit parameters" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "id": "empirical-salmon", 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "image/png": "\n", 95 | "text/plain": [ 96 | "
" 97 | ] 98 | }, 99 | "metadata": { 100 | "needs_background": "light" 101 | }, 102 | "output_type": "display_data" 103 | } 104 | ], 105 | "source": [ 106 | "# Pull out the fit parameters (both step1 and extrapolated)\n", 107 | "intercept1 = adata_sct.var['Intercept_step1_sct'].values\n", 108 | "filt = np.invert(np.isnan(intercept1))\n", 109 | "\n", 110 | "gmean = 10**adata_sct.var['log10_gmean_sct'].values\n", 111 | "gmean1 = gmean[filt]\n", 112 | "intercept1 = intercept1[filt]\n", 113 | "\n", 114 | "intercept = adata_sct.var['Intercept_sct'].values\n", 115 | "logumi = adata_sct.var['log_umi_sct'].values\n", 116 | "logumi1 = adata_sct.var['log_umi_step1_sct'].values[filt]\n", 117 | "theta = adata_sct.var['theta_sct'].values\n", 118 | "theta1 = adata_sct.var['dispersion_step1_sct'].values[filt]\n", 119 | "theta1 = gmean1 / (10**theta1 - 1)\n", 120 | "\n", 121 | "# Do the plotting\n", 122 | "fig,axs = plt.subplots(nrows=3,ncols=1)\n", 123 | "fig.set_size_inches((5,12))\n", 124 | "ax=axs[0]\n", 125 | "ax.scatter(gmean1,intercept1,color='k'); \n", 126 | "ax.scatter(gmean,intercept,color='pink',s=5);\n", 127 | "ax.set_ylabel('Fit parameter for intercept')\n", 128 | "ax.set_xscale('log')\n", 129 | "\n", 130 | "ax=axs[1]\n", 131 | "ax.scatter(gmean1,logumi1,color='k'); \n", 132 | "ax.scatter(gmean,logumi,color='pink',s=5);\n", 133 | "ax.set_ylabel('Fit parameter for log(library sizes)')\n", 134 | "ax.set_xscale('log')\n", 135 | "\n", 136 | "ax=axs[2]\n", 137 | "ax.scatter(gmean1,np.log(theta1),color='k'); \n", 138 | "ax.scatter(gmean,np.log(theta),color='pink',s=5);\n", 139 | "ax.set_xlabel('Gene geometric means')\n", 140 | "ax.set_ylabel('log(theta)')\n", 141 | "ax.set_xscale('log')\n", 142 | "fig.tight_layout()" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 
"""Packaging script for the single-module ``SCTransform`` distribution."""
from setuptools import setup

__version__ = "0.0.1"

# Runtime dependencies of SCTransform.py.
_REQUIREMENTS = [
    "numpy",
    "scipy",
    "statsmodels",
    "KDEpy",
    "pandas",
    "anndata",
]

# Keyword arguments handed to setuptools, gathered in one place.
_METADATA = {
    "name": "SCTransform",
    "version": __version__,
    "author": "Alexander Tarashansky",
    "author_email": "tarashanst@gmail.com",
    "url": "",
    "description": "Python port of SCTransform from the Seurat package.",
    "install_requires": _REQUIREMENTS,
    "py_modules": ['SCTransform'],
}

setup(**_METADATA)