├── .gitignore ├── README.md ├── LICENSE ├── .ipynb_checkpoints └── run_scsim-checkpoint.py ├── run_scsim.py ├── run_multi_scsim.py └── scsim.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.pyo 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scsim 2 | Simulate single-cell RNA-SEQ data using the [Splatter](https://github.com/Oshlack/splatter) statistical framework, which is described [here](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1305-0) but implemented in python. In addition, simulates doublets and cells with shared gene-expression programs (I.e. activity programs). This was used to benchmark methods for gene expression program inference in single-cell rna-seq data as described [here](https://elifesciences.org/articles/43803) 3 | 4 | run_scsim.py has example code for running a simulation with a given set of parameters. It saves the results in the numpy compressed matrix format which can be loaded into a Pandas dataframe as follows: 5 | 6 | with np.load(filename) as f: 7 | result = pd.DataFrame(**f) 8 | 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Dylan Kotliar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/run_scsim-checkpoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os, sys 3 | from scsim import scsim 4 | import numpy as np 5 | import time 6 | 7 | 8 | 9 | def save_df(obj, filename): 10 | '''Save pandas dataframe in compressed format''' 11 | np.savez_compressed(filename, data=obj.values, index=obj.index.values, columns=obj.columns.values) 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser(description='Run scsim with specified input arguments') 16 | parser.add_argument('--outdir', type=str, default='scsim-%s-%s-%s-%s-%s-%s-%s-%s', 17 | help='Output directory base') 18 | parser.add_argument('--seed', type=int, help='simulation seed') 19 | parser.add_argument('--numsims', type=int, help='number of sims to run', 20 | default=20) 21 | parser.add_argument('--deloc', type=float, 22 | help='devalue', 23 | default=1.) 24 | parser.add_argument('--K', type=int, 25 | help='Number of identity programs', 26 | default=10) 27 | parser.add_argument('--nproggoups', type=int, 28 | help='Number of groups expressing activity program. Default is 1/3 of K rounded down', 29 | default=None) 30 | parser.add_argument('--ncells', type=int, 31 | help='Total number of cells', 32 | default=10000) 33 | parser.add_argument('--doubletfrac', type=float, 34 | help='Percentage of doublet cells', 35 | default=0.) 36 | a = parser.parse_args() 37 | return(a.outdir, a.seed, a.numsims, a.deloc, a.K, a.nproggoups, a.ncells, a.doubletfrac) 38 | 39 | 40 | 41 | 42 | 43 | def main(): 44 | (outdir, randseed, numsims, deval, K, nproggroups, ncells, doubletfrac) = parse_args() 45 | ngenes=10000 46 | nproggenes = 400 47 | ndoublets=int(doubletfrac*ncells) 48 | 49 | deloc=deval 50 | progdeloc=deval 51 | descale=1.0 52 | progcellfrac = .35 53 | deprob = .025 54 | 55 | if nproggroups is None: 56 | nproggroups = int(K/3) 57 | 58 | proggroups = list(range(1, nproggroups+1)) 59 | 60 | simulator = scsim(ngenes=ngenes, ncells=ncells, ngroups=K, libloc=7.64, libscale=0.78, 61 | mean_rate=7.68,mean_shape=0.34, expoutprob=0.00286, 62 | expoutloc=6.15, expoutscale=0.49, 63 | diffexpprob=deprob, diffexpdownprob=0., diffexploc=deloc, diffexpscale=descale, 64 | bcv_dispersion=0.448, bcv_dof=22.087, ndoublets=ndoublets, 65 | nproggenes=nproggenes, progdownprob=0., progdeloc=progdeloc, 66 | progdescale=descale, progcellfrac=progcellfrac, proggoups=proggroups, 67 | minprogusage=.1, maxprogusage=.7, seed=randseed) 68 | 69 | 70 | start_time = time.time() 71 | simulator.simulate() 72 | end_time = time.time() 73 | print('%.3f minutes elapsed for seed %d' % ((end_time-start_time)/60, randseed)) 74 | 75 | save_df(simulator.cellparams, os.path.join(outdir, 'cellparams')) 76 | save_df(simulator.geneparams, os.path.join(outdir, 'geneparams')) 77 | save_df(simulator.counts, os.path.join(outdir, 'counts')) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /run_scsim.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os, sys 3 | from scsim import scsim 4 | import numpy as np 5 | import time 6 | 7 | 8 | 9 | def save_df(obj, filename): 10 | '''Save pandas dataframe in compressed format''' 11 | np.savez_compressed(filename, data=obj.values, index=obj.index.values, columns=obj.columns.values) 12 | 13 | 14 | def load_df(filename): 15 | with np.load(filename, allow_pickle=True) as f: 16 | obj = pd.DataFrame(**f) 17 | return obj 18 | 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser(description='Run scsim with specified input arguments') 22 | parser.add_argument('--outdir', type=str, default='scsim-%s-%s-%s-%s-%s-%s-%s-%s', 23 | help='Output directory base') 24 | parser.add_argument('--seed', type=int, help='simulation seed') 25 | parser.add_argument('--numsims', type=int, help='number of sims to run', 26 | default=20) 27 | parser.add_argument('--deloc', type=float, 28 | help='devalue', 29 | default=1.) 30 | parser.add_argument('--K', type=int, 31 | help='Number of identity programs', 32 | default=10) 33 | parser.add_argument('--nproggoups', type=int, 34 | help='Number of groups expressing activity program. Default is 1/3 of K rounded down', 35 | default=None) 36 | parser.add_argument('--ncells', type=int, 37 | help='Total number of cells', 38 | default=10000) 39 | parser.add_argument('--doubletfrac', type=float, 40 | help='Percentage of doublet cells', 41 | default=0.) 42 | a = parser.parse_args() 43 | return(a.outdir, a.seed, a.numsims, a.deloc, a.K, a.nproggoups, a.ncells, a.doubletfrac) 44 | 45 | 46 | 47 | 48 | 49 | def main(): 50 | (outdir, randseed, numsims, deval, K, nproggroups, ncells, doubletfrac) = parse_args() 51 | ngenes=10000 52 | nproggenes = 400 53 | ndoublets=int(doubletfrac*ncells) 54 | 55 | deloc=deval 56 | progdeloc=deval 57 | descale=1.0 58 | progcellfrac = .35 59 | deprob = .025 60 | 61 | if nproggroups is None: 62 | nproggroups = int(K/3) 63 | 64 | proggroups = list(range(1, nproggroups+1)) 65 | 66 | simulator = scsim(ngenes=ngenes, ncells=ncells, ngroups=K, libloc=7.64, libscale=0.78, 67 | mean_rate=7.68,mean_shape=0.34, expoutprob=0.00286, 68 | expoutloc=6.15, expoutscale=0.49, 69 | diffexpprob=deprob, diffexpdownprob=0., diffexploc=deloc, diffexpscale=descale, 70 | bcv_dispersion=0.448, bcv_dof=22.087, ndoublets=ndoublets, 71 | nproggenes=nproggenes, progdownprob=0., progdeloc=progdeloc, 72 | progdescale=descale, progcellfrac=progcellfrac, proggoups=proggroups, 73 | minprogusage=.1, maxprogusage=.7, seed=randseed) 74 | 75 | 76 | start_time = time.time() 77 | simulator.simulate() 78 | end_time = time.time() 79 | print('%.3f minutes elapsed for seed %d' % ((end_time-start_time)/60, randseed)) 80 | 81 | save_df(simulator.cellparams, os.path.join(outdir, 'cellparams')) 82 | save_df(simulator.geneparams, os.path.join(outdir, 'geneparams')) 83 | save_df(simulator.counts, os.path.join(outdir, 'counts')) 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /run_multi_scsim.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os, sys 3 | from scsim import scsim 4 | import numpy as np 5 | import time 6 | 7 | 8 | 9 | def save_df(obj, filename): 10 | '''Save pandas dataframe in compressed format''' 11 | np.savez_compressed(filename, data=obj.values, index=obj.index.values, columns=obj.columns.values) 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser(description='Run scsim with specified input arguments') 16 | parser.add_argument('--outdir', type=str, default='scsim-%s-%s-%s-%s-%s-%s-%s-%s', 17 | help='Output directory base') 18 | parser.add_argument('--seed', type=int, 19 | help='seed for generating simulation seeds') 20 | parser.add_argument('--numsims', type=int, help='number of sims to run', 21 | default=20) 22 | parser.add_argument('--firstseed', type=int, help='first seed to run', 23 | default=0) 24 | parser.add_argument('--lastseed', type=int, 25 | help='last seed to run,default is use all', 26 | default=None) 27 | parser.add_argument('--deloc', type=float, 28 | help='devalue', 29 | default=1.) 30 | parser.add_argument('--K', type=int, 31 | help='Number of identity programs', 32 | default=10) 33 | parser.add_argument('--nproggoups', type=int, 34 | help='Number of groups expressing activity program. Default is 1/3 of K rounded down', 35 | default=None) 36 | parser.add_argument('--ncells', type=int, 37 | help='Total number of cells', 38 | default=10000) 39 | parser.add_argument('--doubletfrac', type=float, 40 | help='Percentage of doublet cells', 41 | default=0.) 42 | a = parser.parse_args() 43 | return(a.outdir, a.seed, a.numsims, a.firstseed, a.lastseed, a.deloc, a.K, a.nproggoups, a.ncells, a.doubletfrac) 44 | 45 | 46 | 47 | 48 | 49 | def main(): 50 | (outdir, randseed, numsims, firstseed, lastseed, deval, K, nproggroups, ncells, doubletfrac) = parse_args() 51 | ngenes=10000 52 | nproggenes = 400 53 | ndoublets=int(doubletfrac*ncells) 54 | 55 | deloc=deval 56 | progdeloc=deval 57 | descale=1.0 58 | progcellfrac = .35 59 | deprob = .025 60 | 61 | if nproggroups is None: 62 | nproggroups = int(K/3) 63 | 64 | proggroups = list(range(1, nproggroups+1)) 65 | 66 | 67 | np.random.seed(randseed) 68 | simseeds = np.random.randint(1, high=2**15, size=numsims) 69 | if lastseed is None: 70 | lastseed = numsims+1 71 | 72 | 73 | for seed in simseeds[firstseed:lastseed]: 74 | print(seed) 75 | outbase = outdir % (seed, deloc, descale, ndoublets, progcellfrac, len(proggroups), nproggenes, deprob) 76 | if not os.path.exists(outbase): 77 | os.mkdir(outbase) 78 | 79 | Koutbase = '%s/K%d' % (outbase, K) 80 | if not os.path.exists(Koutbase): 81 | os.mkdir(Koutbase) 82 | 83 | simulator = scsim(ngenes=ngenes, ncells=ncells, ngroups=K, libloc=7.64, libscale=0.78, 84 | mean_rate=7.68,mean_shape=0.34, expoutprob=0.00286, 85 | expoutloc=6.15, expoutscale=0.49, 86 | diffexpprob=deprob, diffexpdownprob=0., diffexploc=deloc, diffexpscale=descale, 87 | bcv_dispersion=0.448, bcv_dof=22.087, ndoublets=ndoublets, 88 | nproggenes=nproggenes, progdownprob=0., progdeloc=progdeloc, 89 | progdescale=descale, progcellfrac=progcellfrac, proggoups=proggroups, 90 | minprogusage=.1, maxprogusage=.7, seed=seed) 91 | 92 | 93 | start_time = time.time() 94 | simulator.simulate() 95 | end_time = time.time() 96 | print('%.3f minutes elapsed for seed %d' % ((end_time-start_time)/60, seed)) 97 | 98 | save_df(simulator.cellparams, '%s/cellparams' % Koutbase) 99 | save_df(simulator.geneparams, '%s/geneparams' % Koutbase) 100 | save_df(simulator.counts, '%s/counts' % Koutbase) 101 | 102 | 103 | if __name__ == '__main__': 104 | main() 105 | -------------------------------------------------------------------------------- /scsim.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | 5 | class scsim: 6 | def __init__(self, ngenes=10000, ncells=100, seed=757578, 7 | mean_rate=.3, mean_shape=.6, libloc=11, libscale=0.2, 8 | expoutprob=.05, expoutloc=4, expoutscale=0.5, ngroups=1, 9 | diffexpprob=.1, diffexpdownprob=.5, 10 | diffexploc=.1, diffexpscale=.4, bcv_dispersion=.1, 11 | bcv_dof=60, ndoublets=0, groupprob=None, 12 | nproggenes=None, progdownprob=None, progdeloc=None, 13 | progdescale=None, proggoups=None, progcellfrac=None, 14 | minprogusage=.2, maxprogusage=.8): 15 | 16 | self.ngenes = ngenes 17 | self.ncells = ncells 18 | self.seed = seed 19 | self.mean_rate = mean_rate 20 | self.mean_shape = mean_shape 21 | self.libloc = libloc 22 | self.libscale = libscale 23 | self.expoutprob = expoutprob 24 | self.expoutloc = expoutloc 25 | self.expoutscale = expoutscale 26 | self.ngroups = ngroups 27 | self.diffexpprob = diffexpprob 28 | self.diffexpdownprob = diffexpdownprob 29 | self.diffexploc = diffexploc 30 | self.diffexpscale = diffexpscale 31 | self.bcv_dispersion = bcv_dispersion 32 | self.bcv_dof = bcv_dof 33 | self.ndoublets = ndoublets 34 | self.init_ncells = ncells+ndoublets 35 | self.nproggenes=nproggenes 36 | self.progdownprob=progdownprob 37 | self.progdeloc=progdeloc 38 | self.progdescale=progdescale 39 | self.proggoups=proggoups 40 | self.progcellfrac = progcellfrac 41 | self.minprogusage = minprogusage 42 | self.maxprogusage = maxprogusage 43 | 44 | if groupprob is None: 45 | self.groupprob = [1/float(self.ngroups)]*self.ngroups 46 | elif (len(groupprob) == self.ngroups) & (np.abs(np.sum(groupprob) - 1) < (10**-6)): 47 | self.groupprob = groupprob 48 | else: 49 | sys.exit('Invalid groupprob input') 50 | 51 | 52 | def simulate(self): 53 | np.random.seed(self.seed) 54 | print('Simulating cells') 55 | self.cellparams = self.get_cell_params() 56 | print('Simulating gene params') 57 | self.geneparams = self.get_gene_params() 58 | 59 | if (self.nproggenes is not None) and (self.nproggenes > 0): 60 | print('Simulating program') 61 | self.simulate_program() 62 | 63 | print('Simulating DE') 64 | self.sim_group_DE() 65 | 66 | print('Simulating cell-gene means') 67 | self.cellgenemean = self.get_cell_gene_means() 68 | if self.ndoublets > 0: 69 | print('Simulating doublets') 70 | self.simulate_doublets() 71 | 72 | print('Adjusting means') 73 | self.adjust_means_bcv() 74 | print('Simulating counts') 75 | self.simulate_counts() 76 | 77 | def simulate_counts(self): 78 | '''Sample read counts for each gene x cell from Poisson distribution 79 | using the variance-trend adjusted updatedmean value''' 80 | self.counts = pd.DataFrame(np.random.poisson(lam=self.updatedmean), 81 | index=self.cellnames, columns=self.genenames) 82 | 83 | def adjust_means_bcv(self): 84 | '''Adjust cellgenemean to follow a mean-variance trend relationship''' 85 | self.bcv = self.bcv_dispersion + (1 / np.sqrt(self.cellgenemean)) 86 | chisamp = np.random.chisquare(self.bcv_dof, size=self.ngenes) 87 | self.bcv = self.bcv*np.sqrt(self.bcv_dof / chisamp) 88 | self.updatedmean = np.random.gamma(shape=1/(self.bcv**2), 89 | scale=self.cellgenemean*(self.bcv**2)) 90 | self.bcv = pd.DataFrame(self.bcv, index=self.cellnames, columns=self.genenames) 91 | self.updatedmean = pd.DataFrame(self.updatedmean, index=self.cellnames, 92 | columns=self.genenames) 93 | 94 | 95 | def simulate_doublets(self): 96 | ## Select doublet cells and determine the second cell to merge with 97 | d_ind = sorted(np.random.choice(self.ncells, self.ndoublets, 98 | replace=False)) 99 | d_ind = ['Cell%d' % (x+1) for x in d_ind] 100 | self.cellparams['is_doublet'] = False 101 | self.cellparams.loc[d_ind, 'is_doublet'] = True 102 | extraind = self.cellparams.index[-self.ndoublets:] 103 | group2 = self.cellparams.ix[extraind, 'group'].values 104 | self.cellparams['group2'] = -1 105 | self.cellparams.loc[d_ind, 'group2'] = group2 106 | 107 | ## update the cell-gene means for the doublets while preserving the 108 | ## same library size 109 | dmean = self.cellgenemean.loc[d_ind,:].values 110 | dmultiplier = .5 / dmean.sum(axis=1) 111 | dmean = np.multiply(dmean, dmultiplier[:, np.newaxis]) 112 | omean = self.cellgenemean.loc[extraind,:].values 113 | omultiplier = .5 / omean.sum(axis=1) 114 | omean = np.multiply(omean, omultiplier[:,np.newaxis]) 115 | newmean = dmean + omean 116 | libsize = self.cellparams.loc[d_ind, 'libsize'].values 117 | newmean = np.multiply(newmean, libsize[:,np.newaxis]) 118 | self.cellgenemean.loc[d_ind,:] = newmean 119 | ## remove extra doublet cells from the data structures 120 | self.cellgenemean.drop(extraind, axis=0, inplace=True) 121 | self.cellparams.drop(extraind, axis=0, inplace=True) 122 | self.cellnames = self.cellnames[0:self.ncells] 123 | 124 | 125 | def get_cell_gene_means(self): 126 | '''Calculate each gene's mean expression for each cell while adjusting 127 | for the library size''' 128 | 129 | 130 | group_genemean = self.geneparams.loc[:,[x for x in self.geneparams.columns if ('_genemean' in x) and ('group' in x)]].T.astype(float) 131 | group_genemean = group_genemean.div(group_genemean.sum(axis=1), axis=0) 132 | ind = self.cellparams['group'].apply(lambda x: 'group%d_genemean' % x) 133 | 134 | if self.nproggenes == 0: 135 | cellgenemean = group_genemean.loc[ind,:].astype(float) 136 | cellgenemean.index = self.cellparams.index 137 | else: 138 | noprogcells = self.cellparams['has_program']==False 139 | hasprogcells = self.cellparams['has_program']==True 140 | 141 | print(' - Getting mean for activity program carrying cells') 142 | progcellmean = group_genemean.loc[ind[hasprogcells], :] 143 | progcellmean.index = ind.index[hasprogcells] 144 | progcellmean = progcellmean.multiply(1-self.cellparams.loc[hasprogcells, 'program_usage'], axis=0) 145 | 146 | progmean = self.geneparams.loc[:,['prog_genemean']] 147 | progmean = progmean.div(progmean.sum(axis=0), axis=1) 148 | progusage = self.cellparams.loc[progcellmean.index, ['program_usage']] 149 | progusage.columns = ['prog_genemean'] 150 | progcellmean += progusage.dot(progmean.T) 151 | progcellmean = progcellmean.astype(float) 152 | 153 | print(' - Getting mean for non activity program carrying cells') 154 | noprogcellmean = group_genemean.loc[ind[noprogcells],:] 155 | noprogcellmean.index = ind.index[noprogcells] 156 | 157 | cellgenemean = pd.concat([noprogcellmean, progcellmean], axis=0) 158 | 159 | del(progcellmean, noprogcellmean) 160 | 161 | cellgenemean = cellgenemean.reindex(index=self.cellparams.index) 162 | 163 | print(' - Normalizing by cell libsize') 164 | normfac = (self.cellparams['libsize'] / cellgenemean.sum(axis=1)).values 165 | for col in cellgenemean.columns: 166 | cellgenemean[col] = cellgenemean[col].values*normfac 167 | #cellgenemean = cellgenemean.multiply(normfac, axis=0).astype(float) 168 | return(cellgenemean) 169 | 170 | 171 | def get_gene_params(self): 172 | '''Sample each genes mean expression from a gamma distribution as 173 | well as identifying outlier genes with expression drawn from a 174 | log-normal distribution''' 175 | basegenemean = np.random.gamma(shape=self.mean_shape, 176 | scale=1./self.mean_rate, 177 | size=self.ngenes) 178 | 179 | is_outlier = np.random.choice([True, False], size=self.ngenes, 180 | p=[self.expoutprob,1-self.expoutprob]) 181 | outlier_ratio = np.ones(shape=self.ngenes) 182 | outliers = np.random.lognormal(mean=self.expoutloc, 183 | sigma=self.expoutscale, 184 | size=is_outlier.sum()) 185 | outlier_ratio[is_outlier] = outliers 186 | gene_mean = basegenemean.copy() 187 | median = np.median(basegenemean) 188 | gene_mean[is_outlier] = outliers*median 189 | self.genenames = ['Gene%d' % i for i in range(1, self.ngenes+1)] 190 | geneparams = pd.DataFrame([basegenemean, is_outlier, outlier_ratio, gene_mean], 191 | index=['BaseGeneMean', 'is_outlier', 'outlier_ratio', 'gene_mean'], 192 | columns=self.genenames).T 193 | return(geneparams) 194 | 195 | 196 | def get_cell_params(self): 197 | '''Sample cell group identities and library sizes''' 198 | groupid = self.simulate_groups() 199 | libsize = np.random.lognormal(mean=self.libloc, sigma=self.libscale, 200 | size=self.init_ncells) 201 | self.cellnames = ['Cell%d' % i for i in range(1, self.init_ncells+1)] 202 | cellparams = pd.DataFrame([groupid, libsize], 203 | index=['group', 'libsize'], 204 | columns=self.cellnames).T 205 | cellparams['group'] = cellparams['group'].astype(int) 206 | return(cellparams) 207 | 208 | 209 | def simulate_program(self): 210 | ## Simulate the program gene expression 211 | self.geneparams['prog_gene'] = False 212 | proggenes = self.geneparams.index[-self.nproggenes:] 213 | self.geneparams.loc[proggenes, 'prog_gene'] = True 214 | DEratio = np.random.lognormal(mean=self.progdeloc, 215 | sigma=self.progdescale, 216 | size=self.nproggenes) 217 | DEratio[DEratio<1] = 1 / DEratio[DEratio<1] 218 | is_downregulated = np.random.choice([True, False], 219 | size=len(DEratio), 220 | p=[self.progdownprob, 221 | 1-self.progdownprob]) 222 | DEratio[is_downregulated] = 1. / DEratio[is_downregulated] 223 | all_DE_ratio = np.ones(self.ngenes) 224 | all_DE_ratio[-self.nproggenes:] = DEratio 225 | prog_mean = self.geneparams['gene_mean']*all_DE_ratio 226 | self.geneparams['prog_genemean'] = prog_mean 227 | 228 | ## Assign the program to cells 229 | self.cellparams['has_program'] = False 230 | if self.proggoups is None: 231 | ## The program is active in all cell types 232 | self.proggoups = np.arange(1, self.ngroups+1) 233 | 234 | self.cellparams.loc[:, 'program_usage'] = 0 235 | for g in self.proggoups: 236 | groupcells = self.cellparams.index[self.cellparams['group']==g] 237 | hasprog = np.random.choice([True, False], size=len(groupcells), 238 | p=[self.progcellfrac, 239 | 1-self.progcellfrac]) 240 | self.cellparams.loc[groupcells[hasprog], 'has_program'] = True 241 | usages = np.random.uniform(low=self.minprogusage, 242 | high=self.maxprogusage, 243 | size=len(groupcells[hasprog])) 244 | self.cellparams.loc[groupcells[hasprog], 'program_usage'] = usages 245 | 246 | 247 | 248 | 249 | def simulate_groups(self): 250 | '''Sample cell group identities from a categorical distriubtion''' 251 | groupid = np.random.choice(np.arange(1, self.ngroups+1), 252 | size=self.init_ncells, p=self.groupprob) 253 | self.groups = np.unique(groupid) 254 | return(groupid) 255 | 256 | 257 | def sim_group_DE(self): 258 | '''Sample differentially expressed genes and the DE factor for each 259 | cell-type group''' 260 | groups = self.cellparams['group'].unique() 261 | if self.nproggenes>0: 262 | proggene = self.geneparams['prog_gene'].values 263 | else: 264 | proggene = np.array([False]*self.geneparams.shape[0]) 265 | 266 | for group in self.groups: 267 | isDE = np.random.choice([True, False], size=self.ngenes, 268 | p=[self.diffexpprob,1-self.diffexpprob]) 269 | isDE[proggene] = False # Program genes shouldn't be differentially expressed between groups 270 | DEratio = np.random.lognormal(mean=self.diffexploc, 271 | sigma=self.diffexpscale, 272 | size=isDE.sum()) 273 | DEratio[DEratio<1] = 1 / DEratio[DEratio<1] 274 | is_downregulated = np.random.choice([True, False], 275 | size=len(DEratio), 276 | p=[self.diffexpdownprob,1-self.diffexpdownprob]) 277 | DEratio[is_downregulated] = 1. / DEratio[is_downregulated] 278 | all_DE_ratio = np.ones(self.ngenes) 279 | all_DE_ratio[isDE] = DEratio 280 | group_mean = self.geneparams['gene_mean']*all_DE_ratio 281 | 282 | deratiocol = 'group%d_DEratio' % group 283 | groupmeancol = 'group%d_genemean' % group 284 | self.geneparams[deratiocol] = all_DE_ratio 285 | self.geneparams[groupmeancol] = group_mean 286 | --------------------------------------------------------------------------------