├── .Rprofile ├── Fig1_S2 ├── 57epigenomes.median_expr.txt ├── Fig1B_S2B.R ├── Fig1C.R ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Xpresso.py ├── choose_reference_genes.pl ├── choose_reference_genes_forhg19.pl ├── extract_promoters.pl ├── geneName2Ensembl.pl ├── geneName2EnsemblMouse.pl ├── generate_training_input.pl ├── print_losses.py ├── process_RNAseq.R ├── process_RNAseq_mouse.R ├── runme.sh ├── setup_training_files.py └── tpe_1K_10epochs_optimized_0to20K.hyperopt ├── Fig2 ├── 1to1_orthologs_expression.txt ├── 57epigenomes.median_expr.txt ├── Fig2A.R ├── Fig2BC.R ├── Fig2D.R ├── Fig2E.pdf ├── Fig2EFG.R ├── Fig2F.pdf ├── Fig2G.pdf ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Xpresso.py ├── all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_mouse.txt ├── ensembl2geneName_v90_mm10.txt ├── human2mouse_one2one_orthologs.txt ├── human2mouse_orthologs.txt ├── mouse.median_expr.txt ├── ortholog_results ├── pM10Kb_1KTest ├── pM10Kb_1KTest_Mouse ├── pM10Kb_1KTest_one2oneOrthologs ├── runme.sh ├── setup_training_files.py ├── subsample.py ├── subsampled_10fold ├── subsampling_10fold.R └── tpe_1K_10epochs_optimized_0to20K.hyperopt ├── Fig3_S3 ├── 57epigenomes.RPKM.pc.gz ├── Boyer_et_al_PCG_repressed.txt ├── EnsemblID2GeneName.txt ├── Fig3ABCDEF_S3ABC.R ├── Fig3ABC_S3ABC.pdf ├── Fig3DEF_S3C.pdf ├── Fig3GH.R ├── Fig3GH.pdf ├── Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz ├── Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Ouyang_mESC_RPKM_ensemblID.txt ├── Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Schofield_K562_half_lives.txt ├── Summary_Counts.default_predictions.txt.gz ├── TableS1_human.txt ├── TableS1_mouse.txt ├── Whyte_et_al_superenhancers.txt ├── 
all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_GM12878.txt ├── all_crossvalidated_predictions_K562.txt ├── all_crossvalidated_predictions_mESC.txt ├── all_crossvalidated_predictions_mouse.txt ├── cross_valid ├── cross_valid_GM12878 ├── cross_valid_K562 ├── cross_valid_mESC ├── cross_valid_mouse ├── diHMM ├── ensembl2geneName_v90_mm10.txt ├── integrate_cv_results.R ├── mouse.median_expr.txt ├── mouseESC_GSE76288_miRNA_counts_Denzler.txt ├── pM10Kb_1KTest_GM12878expr_cv ├── pM10Kb_1KTest_K562expr_cv ├── pM10Kb_1KTest_mESCexpr_cv ├── pM10Kb_Mouse_cv ├── pM10Kb_cv ├── runme.sh └── setup_training_files.py ├── Fig4_S4 ├── 57epigenomes.RPKM.pc.gz ├── Fig4ABCD.R ├── Fig4ABCD.pdf ├── Fig4E.pdf ├── Fig4E_S4.R ├── FigS4A.pdf ├── FigS4B.R ├── FigS4B.pdf ├── FigS4B_2.pdf ├── FigS4C.R ├── FigS4C_human.pdf ├── FigS4C_mouse.pdf ├── GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal ├── GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal ├── JASPAR_CORE_2016_vertebrates.meme ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_GM12878.txt ├── all_crossvalidated_predictions_K562.txt ├── all_crossvalidated_predictions_mESC.txt ├── all_crossvalidated_predictions_mouse.txt ├── baseline_models.R ├── coefplot.r ├── gencode.v27lift37.basic.annotation.gtf.gz ├── hg19_promoters_cage_corrected_withChr.bed ├── hg19_promoters_cage_corrected_withChr_andOthers.bed ├── hg19_promoters_cage_corrected_withChr_andOthers_minus.bed ├── hg19_promoters_cage_corrected_withChr_andOthers_plus.bed ├── hg38ToHg19.over.chain ├── hg38_promoters_cage_corrected.bed ├── hg38_promoters_cage_corrected_withChr.bed ├── model_comparison.txt ├── model_comparison_Fig3.txt ├── promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz ├── promoters_pM1.5Kb.FIMO_scanned.txt.gz ├── promoters_pM1.5Kb.fa.gz ├── 
promoters_pM1.5Kb.firstOrderMarkov_background ├── promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz ├── promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz ├── promoters_pM1.5Kb.mouse.fa.gz ├── promoters_pM1.5Kb.mouse.firstOrderMarkov_background ├── runme.sh └── supplement_ids.pl ├── Fig5_S5 ├── hg19.chrom.sizes ├── human_trainepoch.11-0.426.h5 ├── mouse_trainepoch.05-0.278.h5 ├── predict_seqs.py ├── region.1Mb.bed ├── region.1Mb.intervals.100ntStep.Minus.bedGraph ├── region.1Mb.intervals.100ntStep.Minus.bw ├── region.1Mb.intervals.100ntStep.Plus.bedGraph ├── region.1Mb.intervals.100ntStep.Plus.bw ├── region.1Mb.intervals.100ntStep.bed ├── region.1Mb.intervals.100ntStep.input.txt.gz ├── region.1Mb.intervals.100ntStep.mouse.bed ├── region.1Mb.intervals.100ntStep.mouse.input.txt.gz ├── region.1Mb.intervals.100ntStep.mouse.minus.bedGraph ├── region.1Mb.intervals.100ntStep.mouse.minus.bw ├── region.1Mb.intervals.100ntStep.mouse.plus.bedGraph ├── region.1Mb.intervals.100ntStep.mouse.plus.bw ├── region.1Mb.mouse.bed ├── runme.sh └── tpe_1K_10epochs_optimized_0to20K.hyperopt ├── Fig6_S6_S7_S8 ├── Fig6B.pdf ├── Fig6C_S7_S8.R ├── FigS6.R ├── FigS7.pdf ├── FigS8.pdf ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_mouse.txt ├── best_positions.R ├── cv_human ├── cv_mouse ├── deep_explain_cv.py ├── extract_kmer.pl ├── human_all_1mer_400K │ ├── dreme.html │ ├── dreme.txt │ └── dreme.xml ├── humangradinput.pdf ├── humanintgrad.pdf ├── motif_analysis ├── mouse_all_1mer_400K │ ├── dreme.html │ ├── dreme.txt │ └── dreme.xml ├── mousegradinput.pdf ├── mouseintgrad.pdf ├── pM10Kb_Mouse_cv ├── pM10Kb_cv └── runme.sh ├── FigS1 ├── 57epigenomes.RPKM.pc.gz ├── EG.name.txt ├── FigS1.R ├── FigS1.pdf └── runme.sh ├── LICENSE.txt ├── README.md ├── Xpresso.ipynb ├── allfxns.pm └── xpresso_logo.png /.Rprofile: 
-------------------------------------------------------------------------------- 1 | args = commandArgs(trailingOnly = T) 2 | 3 | head <- function(x, y = 5) { base::print(utils::head(x, y)) } 4 | say <- function(...) { base::print(paste(...)) } 5 | 6 | tryCatch({options(width = as.integer(Sys.getenv("COLUMNS")))}, error = function(err) {options(width=236)}) 7 | 8 | .Last <- function(){ 9 | if (!any(commandArgs() == '--no-readline') && interactive()) { 10 | require(utils) 11 | try(savehistory(".Rhistory")) 12 | } 13 | } 14 | 15 | error.bar <- function(x, y, upper, lower=upper, length=0.1,...){ 16 | if(length(x) != length(y) | length(y) !=length(lower) | length(lower) != length(upper)) 17 | stop("vectors must be same length") 18 | arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...) 19 | } 20 | 21 | writefile = function(obj, x, ...){ 22 | write.table(obj, file=x, quote=F, row.names=F, sep='\t', ...) 23 | } 24 | 25 | fastread = function(file, ...){ 26 | data.table::fread(file,data.table=F,sep="\t", ...) 
27 | } 28 | -------------------------------------------------------------------------------- /Fig1_S2/57epigenomes.median_expr.txt: -------------------------------------------------------------------------------- 1 | ../datasets/57epigenomes.median_expr.txt -------------------------------------------------------------------------------- /Fig1_S2/Fig1B_S2B.R: -------------------------------------------------------------------------------- 1 | library(latticeExtra) 2 | 3 | getresults = function(thisfile){ 4 | sites = read.table(text = system(paste("python print_losses.py", thisfile), intern=T), sep='\t') 5 | colnames(sites)=c("leftpos","rightpos","loss","params") 6 | print(sites[which(sites$loss == min(sites$loss)),]) 7 | say(nrow(sites), "trials") 8 | sites 9 | } 10 | 11 | c = getresults(args[1]) 12 | e = getresults(args[2]) 13 | 14 | pdf("Fig1B.pdf",width=5,height=4) 15 | plot(1:nrow(c), sapply(1:nrow(c), function(x) min(c[1:x, "loss"])), lwd=2, bty='n', col='red', type="l", 16 | xlim = c(0, 1000), ylim = c(0.4, 0.7), xlab="Number of iterations", ylab="Validation mean squared error, best model found") 17 | abline(h=0.479, lwd=2, lty=2, col='black') 18 | lines(1:nrow(e), sapply(1:nrow(e), function(x) min(e[1:x, "loss"])), lwd=2, col='purple') 19 | legend("topright", bg="white", bty="n", legend = c("Tree of Parzen estimators", "Simulated annealing","Best manually discovered, -1.5Kb to 1.5Kb"), 20 | text.col = c("red", "purple","black"), cex=0.8) 21 | dev.off() 22 | 23 | plotboundaries = function(a){ 24 | b=aggregate(a$loss, by=list(leftpos=a$leftpos, rightpos=a$rightpos), min) 25 | totsize = 10000 26 | pdf("FigS2B.pdf",width=5,height=6) 27 | layout(matrix(c(1,1,1,2), 1, 4)) 28 | par(mar = c(1, 1, 5, 1)) 29 | b$mycol = as.character("red") 30 | N=min(nrow(b),100) 31 | b=b[order(b$x, decreasing=T),] 32 | b=b[(nrow(b)-N+1):nrow(b),] 33 | b=rbind(c(8500,11500,0.479,"blue"), b) 34 | b$leftpos=as.integer(b$leftpos) - 10000 35 | b$rightpos=as.integer(b$rightpos) - 10000 36 | 
plot(1:(N+1),xlim=c(-totsize,totsize), type="n", cex.lab = 2, bty="n", yaxt='n', xaxt='n') 37 | axis(3, at=seq(-totsize,totsize,totsize/5)) 38 | mtext("Position relative to TSS", side=3, line=3) 39 | for(x in 1:(N+1)) lines(c(b$leftpos[x],b$rightpos[x]), c(x,x), col=b$mycol[x], type="l", lty=1, lwd=2) 40 | abline(v=0, lwd=2, col='black') 41 | 42 | plot(1:(N+1),xlim=c(0.4,0.48), type="n", cex.lab = 2, bty="n", yaxt='n', xaxt='n') 43 | axis(3, at=seq(0.4,0.48,0.02)) 44 | mtext("Validation MSE", side=3, line=3) 45 | for(x in 1:(N+1)) lines(c(0,b$x[x]), c(x,x), col="grey", type="l", lty=1, lwd=2) 46 | abline(v=0.479, lwd=2, lty=2, col='blue') 47 | dev.off() 48 | } 49 | 50 | plotboundaries(c) -------------------------------------------------------------------------------- /Fig1_S2/Fig1C.R: -------------------------------------------------------------------------------- 1 | library(latticeExtra) 2 | 3 | crp.rg <- colorRampPalette(c("red","orange","green","cyan","blue","purple","magenta")) 4 | cols <- sample(crp.rg(10)) 5 | 6 | plotresults = function(dir){ 7 | files = paste(list.files(path=dir, pattern='.txt', full.names=T)) 8 | pdf("Fig1C.pdf",width=5,height=4) 9 | plot(0, lwd=1, bty='n', type="l", xlim = c(0, 25), ylim = c(0.4, 1), xlab="Epoch", ylab="Validation MSE", las = 1) 10 | abline(h=0.479, lwd=1, lty=2, col='black') 11 | lapply(1:length(files), FUN=function(i){ 12 | file = files[i] 13 | cmd = paste("grep val_loss", file, " | perl -ne 'chomp; ($mse) = ($_ =~ /val_loss: (\\d+.\\d+)/); print \"$mse \";'") 14 | sites = unlist(strsplit(system(cmd, intern=T), "\\s+")) 15 | sites = as.numeric(sites[2:length(sites)]) 16 | lines(1:length(sites), sites, lwd=2, col=cols[i]) 17 | }) 18 | dev.off() 19 | } 20 | 21 | c = plotresults(args[1]) -------------------------------------------------------------------------------- /Fig1_S2/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | 
../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig1_S2/Xpresso.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py, pickle 2 | import pandas as pd 3 | from optparse import OptionParser 4 | from scipy import stats 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | from keras.optimizers import Adam 8 | from keras.models import Model, load_model 9 | from keras.layers import * 10 | from keras.metrics import * 11 | from keras.utils import plot_model 12 | from keras import backend as K 13 | from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping 14 | from hyperopt import fmin, tpe, rand, anneal, hp, STATUS_OK, STATUS_FAIL, Trials, mix, partial, space_eval 15 | 16 | global X_trainhalflife, X_trainpromoter, y_train, geneName_train, X_validhalflife, X_validpromoter, y_valid, geneName_valid, X_testhalflife, X_testpromoter, y_test, geneName_test, params 17 | 18 | def main(): 19 | usage = 'usage: %prog [options] ' 20 | parser = OptionParser(usage) 21 | parser.add_option('-c', dest='counts', default=0, type='int', help='Number of training counts to subsample [Default: %default]') 22 | parser.add_option('--bestmanual', dest='bestmanual', default=False, action='store_true', help='Try best manually identified model % [Default: %default]') 23 | parser.add_option('--fold', dest='cvfold', default='', type='string', help='Which of the 10 folds of cross-validation to use % [Default: %default]') 24 | parser.add_option('--trial', dest='trial', default='', type='string', help='Trial number % [Default: %default]') 25 | parser.add_option('--usemodel', dest='usemodel', default=None, type='string', help='Use pre-trained model % [Default: %default]') 26 | (options,args) = parser.parse_args() 27 | 28 | if len(args) != 3: 29 | print (args) 30 | parser.error('Must provide mode (tune, train, or test), hyperparameter 
database file, and database directory') 31 | else: 32 | mode = args[0] 33 | database = args[1] 34 | datadir = args[2] 35 | 36 | global X_trainhalflife, X_trainpromoter, y_train, geneName_train, X_validhalflife, X_validpromoter, y_valid, geneName_valid, X_testhalflife, X_testpromoter, y_test, geneName_test, params 37 | params['datadir'] = datadir 38 | if not options.usemodel: 39 | trainfile = h5py.File(os.path.join(datadir, options.cvfold+'train.h5'), 'r') #_mouse1to1 40 | X_trainhalflife, X_trainpromoter, y_train, geneName_train = trainfile['data'], trainfile['promoter'], trainfile['label'], trainfile['geneName'] 41 | validfile = h5py.File(os.path.join(datadir, options.cvfold+'valid.h5'), 'r') #_mouse1to1 42 | X_validhalflife, X_validpromoter, y_valid, geneName_valid = validfile['data'], validfile['promoter'], validfile['label'], validfile['geneName'] 43 | 44 | if mode == "tune": 45 | while True: # loop indefinitely and stop whenever you like 46 | run_trials(database) 47 | else: 48 | testfile = h5py.File(os.path.join(datadir, options.cvfold+'test.h5'), 'r') #_mouse1to1_human1to1 49 | X_testhalflife, X_testpromoter, y_test, geneName_test = testfile['data'], testfile['promoter'], testfile['label'], testfile['geneName'] 50 | if options.bestmanual: 51 | params = { 'datadir' : datadir, 'batchsize' : 2**6, 'leftpos' : 8500, 'rightpos' : 11500, 'activationFxn' : 'relu', 'numFiltersConv1' : 2**6, 'filterLenConv1' : 5, 'dilRate1' : 1, 52 | 'maxPool1' : 10, 'numconvlayers' : { 'numFiltersConv2' : 2**6, 'filterLenConv2' : 5, 'dilRate2' : 1, 'maxPool2' : 20, 'numconvlayers1' : { 'numconvlayers2' : 'two' } }, 53 | 'dense1' : 100, 'dropout1' : 0.5, 'numdenselayers' : { 'layers' : 'one' } } 54 | print("Using best human-identified parameters") 55 | else: 56 | trials = pickle.load(open(database, "rb")) 57 | best = trials.argmin 58 | params = space_eval(params, best) 59 | print("Found saved Trials!") 60 | print ("The best parameters are:") 61 | print (params) 62 | 
params['subsample'] = options.counts 63 | params['cvfold'] = options.cvfold 64 | params['trial'] = options.trial 65 | params['usemodel'] = options.usemodel 66 | params['tuneMode'] = 0 #enable mode that trains best model structure over up to 100 epochs, and evaluates final model on test set 67 | results = objective(params) 68 | print("Best Validation MSE = %.3f" % results['loss']) 69 | 70 | params = { 71 | 'tuneMode' : 1, 72 | 'batchsize' : 2**hp.quniform('batchsize', 5, 7, 1), 73 | 'leftpos' : hp.quniform('leftpos', 0, 10000, 500), 74 | 'rightpos' : hp.quniform('rightpos', 10000, 20000, 500), 75 | 'activationFxn' : 'relu', #hp.choice('activationFxn', ['relu', 'elu', 'selu', 'LeakyReLU', 'PReLU']) -- tried but none worked better than simply relu 76 | 'numFiltersConv1' : 2**hp.quniform('numFiltersConv1', 4, 7, 1), 77 | 'filterLenConv1' : hp.quniform('filterLenConv1', 1, 10, 1), 78 | 'dilRate1' : hp.quniform('dilRate1', 1, 4, 1), 79 | 'maxPool1' : hp.quniform('maxPool1', 5, 100, 5), 80 | 'numconvlayers' : hp.choice('numconvlayers', [ 81 | { 82 | 'numconvlayers1' : 'one' 83 | }, 84 | { 85 | 'numFiltersConv2' : 2**hp.quniform('numFiltersConv2', 4, 7, 1), 86 | 'filterLenConv2' : hp.quniform('filterLenConv2', 1, 10, 1), 87 | 'dilRate2' : hp.quniform('dilRate2', 1, 4, 1), 88 | 'maxPool2' : hp.quniform('maxPool2', 5, 100, 5), 89 | 'numconvlayers1' : hp.choice('numconvlayers1', [ 90 | { 91 | 'numconvlayers2' : 'two' 92 | }, 93 | { 94 | 'numFiltersConv3' : 2**hp.quniform('numFiltersConv3', 4, 7, 1), 95 | 'filterLenConv3' : hp.quniform('filterLenConv3', 1, 10, 1), 96 | 'dilRate3' : hp.quniform('dilRate3', 1, 4, 1), 97 | 'maxPool3' : hp.quniform('maxPool3', 5, 100, 5), 98 | 'numconvlayers2' : hp.choice('numconvlayers2', [ 99 | { 100 | 'numconvlayers3' : 'three' 101 | }, 102 | { 103 | 'numFiltersConv4' : 2**hp.quniform('numFiltersConv4', 4, 7, 1), 104 | 'filterLenConv4' : hp.quniform('filterLenConv4', 1, 10, 1), 105 | 'dilRate4' : hp.quniform('dilRate4', 1, 4, 1), 106 | 
'maxPool4' : hp.quniform('maxPool4', 5, 100, 5), 107 | 'numconvlayers3' : 'four' 108 | }]) 109 | }]) 110 | }]), 111 | 'dense1' : 2**hp.quniform('dense1', 1, 8, 1), 112 | 'dropout1' : hp.uniform('dropout1', 0, 1), 113 | 'numdenselayers' : hp.choice('numdenselayers', [ 114 | { 115 | 'layers' : 'one' 116 | }, 117 | { 118 | 'layers' : 'two' , 119 | 'dense2' : 2**hp.quniform('dense2', 1, 8, 1), 120 | 'dropout2' : hp.uniform('dropout2', 0, 1) 121 | } 122 | ]) 123 | } 124 | 125 | def run_trials(database): 126 | trials_step = 5 # how many additional trials to do after loading saved trials 127 | max_trials = 5 # initial max_trials. put something small to not have to wait 128 | 129 | try: # try to load an already saved trials object, and increase the max 130 | trials = pickle.load(open(database, "rb")) 131 | print("Found saved Trials! Loading...") 132 | max_trials = len(trials.trials) + trials_step 133 | print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step)) 134 | except: # create a new trials object and start searching 135 | trials = Trials() 136 | 137 | best = fmin(objective, params, max_evals = max_trials, trials = trials, 138 | algo = anneal.suggest) 139 | # algo = rand.suggest) 140 | # algo = tpe.suggest) 141 | # algo = partial(mix.suggest, p_suggest=[(0.2, rand.suggest),(0.6, tpe.suggest),(0.2, anneal.suggest)])) 142 | 143 | ##### sample random parameter sets and print 144 | # import hyperopt.pyll.stochastic 145 | # print (hyperopt.pyll.stochastic.sample(params)) 146 | 147 | print( "Best:", best) 148 | # save the trials object 149 | with open(database, "wb") as f: 150 | pickle.dump(trials, f) 151 | 152 | def objective(params): 153 | leftpos = int(params['leftpos']) 154 | rightpos = int(params['rightpos']) 155 | activationFxn = params['activationFxn'] 156 | if not params['usemodel']: 157 | global X_trainhalflife, y_train 158 | X_trainpromoterSubseq = X_trainpromoter[:,leftpos:rightpos,:] 159 | X_validpromoterSubseq = 
X_validpromoter[:,leftpos:rightpos,:] 160 | halflifedata = Input(shape=(X_trainhalflife.shape[1:]), name='halflife') 161 | input_promoter = Input(shape=X_trainpromoterSubseq.shape[1:], name='promoter') 162 | 163 | try: 164 | # if True: 165 | mse = 1 166 | if params['usemodel']: 167 | model = load_model(params['usemodel']) 168 | print('Loaded results from:', params['usemodel']) 169 | else: 170 | x = Conv1D(int(params['numFiltersConv1']), int(params['filterLenConv1']), dilation_rate=int(params['dilRate1']), padding='same', kernel_initializer='glorot_normal', input_shape=X_trainpromoterSubseq.shape[1:],activation=activationFxn)(input_promoter) 171 | x = MaxPooling1D(int(params['maxPool1']))(x) 172 | 173 | if params['numconvlayers']['numconvlayers1'] != 'one': 174 | maxPool2 = int(params['numconvlayers']['maxPool2']) 175 | x = Conv1D(int(params['numconvlayers']['numFiltersConv2']), int(params['numconvlayers']['filterLenConv2']), dilation_rate=int(params['numconvlayers']['dilRate2']), padding='same', kernel_initializer='glorot_normal',activation=activationFxn)(x) #[2, 3, 4, 5, 6, 7, 8, 9, 10] 176 | x = MaxPooling1D(maxPool2)(x) 177 | if params['numconvlayers']['numconvlayers1']['numconvlayers2'] != 'two': 178 | maxPool3 = int(params['numconvlayers']['numconvlayers1']['maxPool3']) 179 | x = Conv1D(int(params['numconvlayers']['numconvlayers1']['numFiltersConv3']), int(params['numconvlayers']['numconvlayers1']['filterLenConv3']), dilation_rate=int(params['numconvlayers']['numconvlayers1']['dilRate3']), padding='same', kernel_initializer='glorot_normal',activation=activationFxn)(x) #[2, 3, 4, 5] 180 | x = MaxPooling1D(maxPool3)(x) 181 | if params['numconvlayers']['numconvlayers1']['numconvlayers2']['numconvlayers3'] != 'three': 182 | maxPool4 = int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['maxPool4']) 183 | x = Conv1D(int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['numFiltersConv4']), 
int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['filterLenConv4']), dilation_rate=int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['dilRate4']), padding='same', kernel_initializer='glorot_normal',activation=activationFxn)(x) #[2, 3, 4, 5] 184 | x = MaxPooling1D(maxPool4)(x) 185 | 186 | x = Flatten()(x) 187 | x = Concatenate()([x, halflifedata]) 188 | x = Dense(int(params['dense1']))(x) 189 | x = Activation(activationFxn)(x) 190 | x = Dropout(params['dropout1'])(x) 191 | if params['numdenselayers']['layers'] == 'two': 192 | x = Dense(int(params['numdenselayers']['dense2']))(x) 193 | x = Activation(activationFxn)(x) 194 | x = Dropout(params['numdenselayers']['dropout2'])(x) 195 | main_output = Dense(1)(x) 196 | model = Model(inputs=[input_promoter, halflifedata], outputs=[main_output]) 197 | model.compile(Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),'mean_squared_error', metrics=['mean_squared_error']) 198 | 199 | if params['tuneMode']: 200 | result = model.fit([X_trainpromoterSubseq, X_trainhalflife], y_train, batch_size=int(params['batchsize']), shuffle="batch", epochs=10, 201 | validation_data=[[X_validpromoterSubseq, X_validhalflife], y_valid]) 202 | mse = min(result.history['val_mean_squared_error']) 203 | print("leftpos, rightpos, mse") 204 | print(leftpos, rightpos, mse) 205 | else: 206 | print(model.summary()) 207 | plot_model(model, to_file=os.path.join(params['datadir'], 'best_model.png')) #requires Pydot/Graphviz to generate graph of network 208 | X_testpromoterSubseq = X_testpromoter[:,leftpos:rightpos,:] 209 | if not params['usemodel']: 210 | if params['subsample'] > 0: 211 | X_trainpromoterSubseq = X_trainpromoterSubseq[0:params['subsample'],:,:] 212 | X_trainhalflife = X_trainhalflife[0:params['subsample'],:] 213 | y_train = y_train[0:params['subsample']] 214 | check_cb = ModelCheckpoint(os.path.join(params['datadir'], params['trial']+params['cvfold']+'trainepoch.{epoch:02d}-{val_loss:.4f}.h5'), 
monitor='val_loss', verbose=1, save_best_only=True, mode='min') 215 | earlystop_cb = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min') 216 | result = model.fit([X_trainpromoterSubseq, X_trainhalflife], y_train, batch_size=int(params['batchsize']), shuffle="batch", epochs=100, 217 | validation_data=[[X_validpromoterSubseq, X_validhalflife], y_valid], callbacks=[earlystop_cb, check_cb]) 218 | mse_history = result.history['val_mean_squared_error'] 219 | mse = min(mse_history) 220 | best_file = os.path.join(params['datadir'], params['trial']+params['cvfold']+'trainepoch.%02d-%.4f.h5' % (mse_history.index(mse), mse)) 221 | model = load_model(best_file) 222 | print('Loaded results from:', best_file) 223 | 224 | predictions_test = model.predict([X_testpromoterSubseq, X_testhalflife], batch_size=20).flatten() 225 | slope, intercept, r_value, p_value, std_err = stats.linregress(predictions_test, y_test) 226 | print('Test R^2 = %.3f' % r_value**2) 227 | df = pd.DataFrame(np.column_stack((geneName_test, predictions_test, y_test)), columns=['Gene','Pred','Actual']) 228 | df.to_csv(os.path.join(params['datadir'], params['trial']+params['cvfold']+'predictions.txt'), index=False, header=True, sep='\t') 229 | 230 | return {'loss': mse, 'status': STATUS_OK } 231 | 232 | except: 233 | return {'loss': 1, 'status': STATUS_FAIL } # loss = 1 indicates a poor-performing model; reason model might fail include: incompatible parameters or insufficient memory resources available 234 | 235 | if __name__ == '__main__': 236 | main() 237 | -------------------------------------------------------------------------------- /Fig1_S2/choose_reference_genes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | $file = shift; 4 | 5 | open IN, "zgrep -P '\tfive_prime_utr|three_prime_utr|CDS\t' $file | "; 6 | while(){ 7 | ($region, $start, $stop, $last) = (split /\t/)[2,3,4,-1]; 8 | ($parent, $id) = ($last =~ /gene_id 
"(ENS\w*GR?[\d|\.]+)"; gene_version "\d+"; transcript_id "(ENS\w*TR?[\d|\.]+)";/); 9 | $lengths{$id}{$region} += ($stop-$start); 10 | } 11 | close IN; 12 | 13 | open IN, "zgrep -P '\tCDS\t' $file | "; 14 | while(){ 15 | ($parent, $id) = ($_ =~ /gene_id "(ENS\w*GR?[\d|\.]+)"; gene_version "\d+"; transcript_id "(ENS\w*TR?[\d|\.]+)";/); 16 | $reptranscript{$parent} = $id if (! defined $reptranscript{$parent}); 17 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} > $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"five_prime_utr"} > 0); 18 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} >= $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"five_prime_utr"} > $lengths{$reptranscript{$parent}}{"five_prime_utr"}); 19 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} >= $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"five_prime_utr"} >= $lengths{$reptranscript{$parent}}{"five_prime_utr"} && $lengths{$id}{"three_prime_utr"} > $lengths{$reptranscript{$parent}}{"three_prime_utr"}); 20 | } 21 | close IN; 22 | 23 | %okids = map { $_ => 1 } values %reptranscript; 24 | foreach (keys %reptranscript){ 25 | $repid{$reptranscript{$_}} = $_; 26 | } 27 | 28 | open IN, "zcat $file |"; 29 | while(){ 30 | ($chr, $region, $start, $stop, $str, $last) = (split /\t/)[0,2,3,4,6,-1]; 31 | ($parent, $id) = ($last =~ /gene_id "(ENS\w*GR?[\d|\.]+)"; gene_version "\d+"; transcript_id "(ENS\w*TR?[\d|\.]+)";/); 32 | $repid = $repid{$id}; 33 | if ($okids{$id}){ 34 | @a = split /\t/, $_; 35 | $a[-1] = "$parent"; 36 | $_ = join ("\t", @a)."\n"; 37 | print "$_" if $chr =~ /^\d+|^X/; #keep non-chrY genes only 38 | } 39 | } 40 | close IN; -------------------------------------------------------------------------------- /Fig1_S2/choose_reference_genes_forhg19.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | $file = shift; 4 | 5 | open IN, "zgrep -P '\tUTR|CDS\t' $file | "; 6 | while(){ 7 | 
($region, $start, $stop, $last) = (split /\t/)[2,3,4,-1]; 8 | ($parent, $id) = ($last =~ /gene_id "(ENS\w*GR?[\d|\.]+.*)"; transcript_id "(ENS\w*TR?[\d|\.]+.*)"; gene_type/); 9 | $lengths{$id}{$region} += ($stop-$start); 10 | } 11 | close IN; 12 | 13 | open IN, "zgrep -P '\tCDS\t' $file | "; 14 | while(){ 15 | ($parent, $id) = ($_ =~ /gene_id "(ENS\w*GR?[\d|\.]+.*)"; transcript_id "(ENS\w*TR?[\d|\.]+.*)"; gene_type/); 16 | $reptranscript{$parent} = $id if (! defined $reptranscript{$parent}); 17 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} > $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"UTR"} > 0); 18 | } 19 | close IN; 20 | 21 | %okids = map { $_ => 1 } values %reptranscript; 22 | foreach (keys %reptranscript){ 23 | $repid{$reptranscript{$_}} = $_; 24 | } 25 | 26 | open IN, "zcat $file |"; 27 | while(){ 28 | ($chr, $region, $start, $stop, $str, $last) = (split /\t/)[0,2,3,4,6,-1]; 29 | ($parent, $id) = ($last =~ /gene_id "(ENS\w*GR?[\d|\.]+.*)"; transcript_id "(ENS\w*TR?[\d|\.]+.*)"; gene_type/); 30 | $repid = $repid{$id}; 31 | if ($okids{$id}){ 32 | @a = split /\t/, $_; 33 | $a[-1] = "$parent"; 34 | $_ = join ("\t", @a)."\n"; 35 | print "$_"; 36 | } 37 | } 38 | close IN; -------------------------------------------------------------------------------- /Fig1_S2/extract_promoters.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | $file = shift; 4 | $spec = shift; 5 | 6 | $dist = 10000; 7 | 8 | open IN, "zgrep -P '\texon\t' $file | "; 9 | while(){ 10 | ($chr, $region, $start, $stop, $str, $last) = (split /\t/)[0,2,3,4,6,-1]; 11 | ($parent) = ($last =~ /(ENS\w*G\d+)/); 12 | $has5pUTR{$id} = 1; 13 | if($str eq '+'){ 14 | $allregions{$parent} = $start if (! exists $allregions{$parent} || $start < $allregions{$parent}); 15 | } 16 | else{ 17 | $allregions{$parent} = $stop if (! 
exists $allregions{$parent} || $stop > $allregions{$parent}); 18 | } 19 | } 20 | close IN; 21 | 22 | open IN, "zgrep -P '\texon\t' $file | "; 23 | while(){ 24 | ($start, $stop, $str, $last) = (split /\t/)[3,4,6,-1]; 25 | ($parent) = ($last =~ /(ENS\w*G\d+)/); 26 | next if $seenids{$parent}; 27 | next if $str eq '+' && $allregions{$parent} != $start; 28 | next if $str eq '-' && $allregions{$parent} != $stop; 29 | @a = split /\t/, $_; 30 | $a[-1] = $parent; 31 | $a[2] = $parent; 32 | $a[3] = $allregions{$parent} - $dist; 33 | $a[4] = $allregions{$parent} + $dist; 34 | if ($spec eq "mouse"){ print join("\t", 'chr'.$a[0], $a[3], $a[4], $a[-1], 0, $str), "\n" if $a[3] > 0; } 35 | else { print join("\t", $a[0], $a[3], $a[4], $a[-1], 0, $str), "\n" if $a[3] > 0; } 36 | $seenids{$parent} = 1; 37 | } 38 | close IN; -------------------------------------------------------------------------------- /Fig1_S2/geneName2Ensembl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | open IN, "){ chomp; 5 | @a = split /\t/, $_; 6 | $id2parent{$a[1]} = $a[0] if $a[2] =~ /^\d+|^X/; #remove haplotypes when considering Ensembl ID 7 | } 8 | close IN; 9 | 10 | open IN, "){ chomp; 12 | @a = split /\t/, $_; 13 | @b = split /, /, $a[1]; 14 | @c = split /, /, $a[2]; 15 | $hgnc2parent{$a[0]} = $a[3]; 16 | foreach $i (@b){ $hgnc2parent{$i} = $a[3]; } 17 | foreach $i (@c){ $hgnc2parent{$i} = $a[3]; } 18 | } 19 | close IN; 20 | 21 | open IN, "hg38_cage_promoters_ensemblID.bed"; 23 | while(){ chomp; @a=split; $id = $a[3]; 24 | @ids = split /,/, $id; 25 | foreach $id (@ids){ 26 | ($promoter, $gene) = split /\@/, $id; 27 | $a[3] = $id2parent{$gene}; 28 | $a[3] = $hgnc2parent{$gene} if $a[3] eq ''; 29 | print OUT join("\t", @a),"\n" if $promoter eq 'p1' && $a[3] ne ''; 30 | } 31 | } 32 | close IN; 33 | close OUT; -------------------------------------------------------------------------------- /Fig1_S2/geneName2EnsemblMouse.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | open IN, "){ chomp; 5 | @a = split /\t/, $_; 6 | $id2parent{$a[1]} = $a[0] if $a[2] =~ /^\d+|^X/; #remove haplotypes when considering Ensembl ID 7 | } 8 | close IN; 9 | 10 | open IN, "cut -f 2,10 MGI_EntrezGene.rpt | "; 11 | while(){ chomp; 12 | @a = split /\t/, $_; 13 | $mgi2synonym{$a[0]} = $a[1]; 14 | } 15 | close IN; 16 | 17 | open IN, "cut -f 1,3 ensembl2entrezID_v90_mm10.txt | "; 18 | while(){ chomp; 19 | @a = split /\t/, $_; 20 | $entrez2ensembl{$a[1]} = $a[0] if $a[1] ne ''; 21 | } 22 | close IN; 23 | 24 | open IN, "cut -f 3,11 MGI_Gene_Model_Coord.rpt | "; 25 | while(){ chomp; 26 | @a = split /\t/, $_; 27 | $mgi2parent{$a[0]} = $a[1]; 28 | @b = split /\|/, $mgi2synonym{$a[0]}; 29 | foreach $i (@b){ $mgi2parent{$i} = $a[1]; } 30 | } 31 | close IN; 32 | 33 | open IN, "cut -f 3,11 MGI_Gene_Model_Coord.rpt | "; 34 | while(){ chomp; 35 | @a = split /\t/, $_; 36 | $mgi2parent{$a[0]} = $a[1]; 37 | } 38 | close IN; 39 | 40 | open IN, "Ouyang_mESC_RPKM_ensemblID.txt"; 42 | while(){ chomp; @a=split; $id = $a[0]; 43 | $id = $entrez2ensembl{$a[0]}; 44 | $id = $id2parent{$a[1]} if $id eq ''; 45 | print OUT join("\t", $id, $a[2]),"\n" if $id ne '' && !$seenid{$id}; 46 | print STDERR $_,"\n" if $id eq '' || $seenid{$id}; 47 | $seenid{$id} = 1; 48 | } 49 | close IN; 50 | close OUT; 51 | 52 | open IN, "mm10_cage_promoters_ensemblID.bed"; 54 | while(){ chomp; @a=split; $id = $a[3]; 55 | @ids = split /,/, $id; 56 | foreach $id (@ids){ 57 | ($promoter, $gene) = split /\@/, $id; 58 | $a[3] = $id2parent{$gene}; 59 | $a[3] = $mgi2parent{$gene} if $a[3] eq ''; 60 | print OUT join("\t", @a),"\n" if $promoter eq 'p1' && $a[3] =~ /ENS/ && !$seenid{$a[3]}; 61 | $seenid{$a[3]} = 1; 62 | } 63 | } 64 | close IN; 65 | close OUT; -------------------------------------------------------------------------------- /Fig1_S2/generate_training_input.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use allfxns; 4 | 5 | $exprMat = shift; 6 | $spec = ($exprMat =~ /mouse/)? 1 : 0; #species is mouse or human? 7 | 8 | sub readBed{ 9 | local $bed = shift; 10 | local %bed = (); 11 | open BEDENTRY, "<$bed" || die "Could not open bed file for $bed\n"; 12 | while ($line = ){ 13 | @a = split /\t/, $line; 14 | $bed{$a[3]} = $line; 15 | } 16 | close BEDENTRY; 17 | return \%bed; 18 | } 19 | 20 | sub readFasta{ 21 | local $fasta = shift; 22 | local %fasta = (); 23 | open DNA, "zcat $fasta | " || die "Could not open fasta file for $fasta\n"; 24 | while ($line = ){ chomp $line; 25 | if ($line =~ /^>\s?(\w+\d)\.?\d*/){ $header = $1; } 26 | else { $fasta{$header} .= $line; } 27 | } 28 | close DNA; 29 | return \%fasta; 30 | } 31 | 32 | if ($spec){ 33 | open IN, "zcat Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz | "; 34 | } 35 | else{ 36 | open IN, "zcat Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz | "; 37 | } 38 | while(){ 39 | ($region, $start, $stop, $last) = (split /\t/)[2,3,4,-1]; 40 | ($id) = ($last =~ /(ENS\w*GR?[\d|\.]+)/); 41 | $lengths{$id}{$region} += ($stop-$start); 42 | $cdsexoncount{$id}++ if $region eq 'CDS'; 43 | } 44 | close IN; 45 | 46 | if ($spec){ 47 | %promoterbed = %{ readBed("mm10_promoters.bed") }; 48 | %promoters = %{ readFasta("mm10_promoters.fa.gz") }; 49 | %fantombed = %{ readBed("mm10_cage_promoters_ensemblID.bed") }; 50 | %fantompromoters = %{ readFasta("mm10_cage_promoters_ensemblID.fa.gz") }; 51 | %utr5p = %{ readFasta("mm10_ensembl90_5utrs.fa.gz") }; 52 | %orfs = %{ readFasta("mm10_ensembl90_orfs.fa.gz") }; 53 | %utr3p = %{ readFasta("mm10_ensembl90_3utrs.fa.gz") }; 54 | } 55 | else{ 56 | %promoterbed = %{ readBed("hg38_promoters.bed") }; 57 | %fantombed = %{ readBed("hg38_cage_promoters_ensemblID.bed") }; 58 | %promoters = %{ readFasta("hg38_promoters.fa.gz") }; 59 | %fantompromoters = %{ readFasta("hg38_cage_promoters_ensemblID.fa.gz") 
}; 60 | %utr5p = %{ readFasta("hg38_ensembl90_5utrs.fa.gz") }; 61 | %orfs = %{ readFasta("hg38_ensembl90_orfs.fa.gz") }; 62 | %utr3p = %{ readFasta("hg38_ensembl90_3utrs.fa.gz") }; 63 | } 64 | 65 | if ($spec){ 66 | open BED, ">mm10_promoters_cage_corrected.bed"; 67 | } 68 | else{ 69 | open BED, ">hg38_promoters_cage_corrected.bed"; 70 | } 71 | print join("\t", "ENSID", "EXPRESSION", "UTR5LEN", "CDSLEN", "INTRONLEN", "UTR3LEN", "UTR5GC", "CDSGC", "UTR3GC", "ORFEXONDENSITY", "PROMOTER"), "\n"; 72 | open IN, "<$exprMat"; 73 | while(){ chomp; 74 | @a=split /\t/; 75 | $id = $a[0]; 76 | if (($promoters{$id} ne '' || $fantompromoters{$id} ne '') && $lengths{$id}{"CDS"} > 0){ 77 | $promoter = $promoters{$id}; 78 | $promoterbed = $promoterbed{$id}; 79 | if ($fantompromoters{$id} ne ''){ 80 | $promoter = $fantompromoters{$id}; 81 | $promoterbed = $fantombed{$id}; 82 | } 83 | print BED $promoterbed; 84 | print join("\t", $id, $a[1], int($lengths{$id}{"five_prime_utr"}), int($lengths{$id}{"CDS"}), 85 | int($lengths{$id}{"transcript"})-(int($lengths{$id}{"three_prime_utr"})+int($lengths{$id}{"CDS"})+int($lengths{$id}{"five_prime_utr"})), 86 | int($lengths{$id}{"three_prime_utr"}), gcContent($utr5p{$id}), gcContent($orfs{$id}), gcContent($utr3p{$id}), 87 | sprintf("%.2f", $cdsexoncount{$id}*1000/$lengths{$id}{"CDS"}), $promoter), "\n"; 88 | } 89 | else{ $count++; } 90 | } 91 | close BED; 92 | 93 | print STDERR "$count IDs missing/revised due to annotation version changes\n"; -------------------------------------------------------------------------------- /Fig1_S2/print_losses.py: -------------------------------------------------------------------------------- 1 | import pickle, sys 2 | 3 | file = sys.argv[1] 4 | trials = pickle.load(open(file, "rb")) 5 | alldicts = trials.trials 6 | loss = trials.losses() 7 | 8 | for i in range(len(loss)): 9 | 
print(str(int(alldicts[i]['misc']['vals']['leftpos'][0]))+'\t'+str(int(alldicts[i]['misc']['vals']['rightpos'][0]))+'\t'+str(loss[i])+'\t'+str(alldicts[i]['misc']['vals'])) 10 | -------------------------------------------------------------------------------- /Fig1_S2/process_RNAseq.R: -------------------------------------------------------------------------------- 1 | a=read.delim(gzfile("57epigenomes.RPKM.pc.gz")) 2 | a$median=apply(a[,3:ncol(a)],1,median) 3 | write.table(a[,c("gene_id","median")],quote=F,row.names=F,col.names=F,sep="\t", file="57epigenomes.median_expr.txt") -------------------------------------------------------------------------------- /Fig1_S2/process_RNAseq_mouse.R: -------------------------------------------------------------------------------- 1 | a=read.delim(gzfile("mouse_FPKMs.tsv.gz"), F) 2 | colnames(a)[1]="gene_id" 3 | a[,1]=substring(a[,1],1,18) 4 | a$median=apply(a[,2:ncol(a)],1,median) 5 | write.table(a[,c("gene_id","median")],quote=F,row.names=F,col.names=F,sep="\t", file="mouse.median_expr.txt") -------------------------------------------------------------------------------- /Fig1_S2/runme.sh: -------------------------------------------------------------------------------- 1 | ########### MOST OF THESE STEPS HAVE PRECOMPUTED RESULTS 2 | 3 | #run the following in the base Xpresso folder to retrieve these precomputed results: 4 | wget -r -np -nH --reject "index.html*" --cut-dirs 5 https://krishna.gs.washington.edu/content/members/vagar/Xpresso/data/datasets/ 5 | 6 | ########### EXTRACT PROMOTERS FROM FANTOM5 CAGE PEAKS ############ 7 | 8 | # download human CAGE annotations 9 | # the original link (now broken, stored in "datasets/ was downloaded from here) 10 | # wget http://fantom.gsc.riken.jp/5/datahub/hg38/peaks/hg38.cage_peak_phase1and2combined.bb http://fantom.gsc.riken.jp/5/datahub/mm10/peaks/mm10.cage_peak_phase1and2combined.bb 11 | 12 | # the revised link can be found here: 13 | wget 
http://fantom.gsc.riken.jp/5/datahub/hg38/peaks/hg38.cage_peak.bb http://fantom.gsc.riken.jp/5/datahub/mm10/peaks/mm10.cage_peak.bb 14 | bigBedToBed hg38.cage_peak_phase1and2combined.bb hg38.cage_peak_phase1and2combined.bed 15 | bigBedToBed mm10.cage_peak_phase1and2combined.bb mm10.cage_peak_phase1and2combined.bed 16 | 17 | # extracts best peak for each gene, removes chrY/M genes 18 | grep -e 'p1@' hg38.cage_peak_phase1and2combined.bed | \ 19 | perl -ne 'chomp; @a=split /\t/; $a[0]=substr($a[0],3); $mid = int($a[-2]); $start=$mid-10000; $stop=$mid+10000; print join("\t",$a[0],$start,$stop,$a[3],0,$a[5])."\n" if $start > 0;' | \ 20 | grep -v -P "^Y|^M" >hg38_cage_promoters.bed 21 | grep -e 'p1@' mm10.cage_peak_phase1and2combined.bed | \ 22 | perl -ne 'chomp; @a=split /\t/; $mid = int($a[-2]); $start=$mid-10000; $stop=$mid+10000; print join("\t",$a[0],$start,$stop,$a[3],0,$a[5])."\n" if $start > 0;' | \ 23 | grep -v -P "^chrY|^chrM" >mm10_cage_promoters.bed 24 | 25 | #acquired 2 additional tables from BioMart and HGNC in addition to these for the mouse 26 | wget http://www.informatics.jax.org/downloads/reports/MGI_Gene_Model_Coord.rpt http://www.informatics.jax.org/downloads/reports/MGI_EntrezGene.rpt 27 | # converts IDs of protein-coding genes into Ensembl IDs for top promoter CAGE peak 28 | ./geneName2Ensembl.pl 29 | # acquire Ouyang_mESC_RPKM.txt from Supplementary table of Ouyang et al. 
30 | ./geneName2EnsemblMouse.pl 31 | 32 | # extract CAGE-revised promoter sequence from hg38 genome -- REQUIRES DOWNLOAD OF HG38 AND MM10 GENOMES 33 | bedtools getfasta -s -name -fi human_hs38_noAlt/whole_genome.fa -bed hg38_cage_promoters_ensemblID.bed -fo hg38_cage_promoters_ensemblID.fa 34 | gzip hg38_cage_promoters_ensemblID.fa 35 | bedtools getfasta -s -name -fi mus_musculus/mm10.fa -bed mm10_cage_promoters_ensemblID.bed -fo mm10_cage_promoters_ensemblID.fa 36 | gzip mm10_cage_promoters_ensemblID.fa 37 | 38 | 39 | ########### EXTRACT PROMOTERS FROM ENSEMBL ############ 40 | 41 | # download human/mouse gene annotations on hg38/mm10 42 | wget ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz 43 | wget ftp://ftp.ensembl.org/pub/release-90/gtf/mus_musculus/Mus_musculus.GRCm38.90.gtf.gz 44 | #Ensembl 90 on hg19 45 | wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_27/GRCh37_mapping/gencode.v27lift37.basic.annotation.gtf.gz 46 | 47 | # choose 1 representative transcript for each protein-coding gene, keep chrX or chr[1..22] genes only 48 | ./choose_reference_genes.pl Homo_sapiens.GRCh38.90.gtf.gz | gzip -c >Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz 49 | ./choose_reference_genes_forhg19.pl gencode.v27lift37.basic.annotation.gtf.gz | gzip -c >Homo_sapiens.hg19.90.chosenTranscript.gtf.gz 50 | ./choose_reference_genes.pl Mus_musculus.GRCm38.90.gtf.gz | gzip -c >Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz 51 | zgrep transcript Homo_sapiens.hg19.90.chosenTranscript.gtf.gz | gzip -c >Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz 52 | 53 | # generate input file for UCSC genome browser, extract 5' UTR, ORF, and 3' UTR sequences using these files using the Table Browser 54 | perl -ne '@a = split; $a[-1] = "gene_id \"$a[-1]\"; transcript_id \"$a[-1]\""; print "chr".join("\t", @a), "\n";' \ 55 | <(zcat Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz) | \ 56 | gzip -c >Homo_sapiens.GRCh38.90.chr.gtf.gz 57 | 
perl -ne '@a = split; $a[-1] = "gene_id \"$a[-1]\"; transcript_id \"$a[-1]\""; print "chr".join("\t", @a), "\n";' \ 58 | <(zcat Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz) | \ 59 | gzip -c >Mus_musculus.GRCm3.90.chr.gtf.gz 60 | 61 | # process into BED and extract +/- 10Kb region surrounding TSS 62 | ./extract_promoters.pl Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz >hg38_promoters.bed 63 | ./extract_promoters.pl Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz mouse >mm10_promoters.bed 64 | 65 | # extract Ensembl-annotated promoter sequence from hg38 genome 66 | bedtools getfasta -s -name -fi human_hs38_noAlt/whole_genome.fa -bed hg38_promoters.bed -fo hg38_promoters.fa 67 | gzip hg38_promoters.fa 68 | bedtools getfasta -s -name -fi mus_musculus/mm10.fa -bed mm10_promoters.bed -fo mm10_promoters.fa 69 | gzip mm10_promoters.fa 70 | 71 | # histone genes to filter out, not quantified correctly due to lack of poly(A) tail 72 | grep HIST ensembl2geneName_v90.txt | cut -f 1 >mask_histone_genes.txt 73 | grep Hist ensembl2geneName_v90_mm10.txt | cut -f 1 >mask_histone_genes_mm10.txt 74 | 75 | ########### COLLECT & PROCESS GENE EXPRESSION DATA ############ 76 | 77 | # download pre-processed RNA-seq data from 56 cell types (+1 universal reference) 78 | wget http://egg2.wustl.edu/roadmap/data/byDataType/rna/expression/57epigenomes.RPKM.pc.gz http://egg2.wustl.edu/roadmap/data/byDataType/rna/expression/EG.name.txt 79 | # extract median expression values 80 | Rscript process_RNAseq.R 81 | cut -f 1,56 <(zcat 57epigenomes.RPKM.pc.gz) | tail -n+2 >57epigenomes.K562.txt 82 | cut -f 1,50 <(zcat 57epigenomes.RPKM.pc.gz) | tail -n+2 >57epigenomes.GM12878.txt 83 | 84 | cut -f 11 files_geneQuant_Rep1.txt | tail -n+2 >urls.txt 85 | while read p; do qsub "wget $p"; done $X.FPKM.tsv.gz; } done 88 | paste *tsv | cut -f 1,$(echo `seq 2 2 1000` | perl -ne '@a=split / /, $_; print join(",",@a);') | sort | gzip -c >mouse_FPKMs.tsv.gz 89 | Rscript process_RNAseq_mouse.R #generates 
mouse.median_expr.txt 90 | 91 | ########### GENERATE TRAINING/VALIDATION/TEST SET AND OPTIMIZE ############ 92 | 93 | #generate training/validation/test sets 94 | perl generate_training_input.pl 57epigenomes.median_expr.txt | gzip -c >Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz 95 | perl generate_training_input.pl 57epigenomes.K562.txt | gzip -c >Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz 96 | perl generate_training_input.pl 57epigenomes.GM12878.txt | gzip -c >Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz 97 | perl generate_training_input.pl mouse.median_expr.txt | gzip -c >Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz 98 | perl generate_training_input.pl Ouyang_mouseESC_RPKM_ensemblID.txt | gzip -c >Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz 99 | 100 | python setup_training_files.py -t 1000 -v 1000 Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_1KTest 101 | python setup_training_files.py -t 1000 -v 1000 Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_1KTest_Mouse 102 | 103 | #run hyperparameter search for ~1000 iterations on a fast GPU (can take ~1-2 days to run) 104 | python Xpresso.py tune tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest/ 105 | 106 | Rscript Fig1B_S2A.R FILE1 FILE2 107 | 108 | for x in {1..10}; do { python Xpresso.py test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest/ >trial_$x.txt; } done & 109 | 110 | Rscript Fig1C.R FILE1 111 | -------------------------------------------------------------------------------- /Fig1_S2/setup_training_files.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py 2 | import numpy.random as npr 3 | import numpy as np 4 | from optparse import OptionParser 5 | import pandas as pd 6 | from sklearn import preprocessing 7 | from sklearn.model_selection import KFold 8 | 9 | def main(): 10 | usage = 'usage: %prog [options] ' 11 | parser = OptionParser(usage) 12 | parser.add_option('-t', 
dest='testCount', default=1000, type='int', help='Number of test examples: [Default: %default]') 13 | parser.add_option('-v', dest='validCount', default=1000, type='int', help='Number of validation examples: [Default: %default]') 14 | parser.add_option('--cv', dest='crossVal', default=False, action='store_true', help='Generate samples for 10-fold cross-validated predictions? [Default: %default]') 15 | parser.add_option('--orthologs', dest='orthologMode', default=False, action='store_true', help='Mouse file to prepare human and mouse 1-1 ortholog set (Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz recommended to accompany human data_file): [Default: %default]') 16 | parser.add_option('--over', dest='overwrite', default=False, action='store_true', help='Overwrite directory? [Default: %default]') 17 | (options,args) = parser.parse_args() 18 | 19 | if len(args) != 2: 20 | print(args) 21 | parser.error('Must provide data file and output directory') 22 | else: 23 | data_file = args[0] 24 | out_dir = args[1] 25 | compress_args = {'compression': 'gzip', 'compression_opts': 1} 26 | trainfile = os.path.join(out_dir, 'train.h5') 27 | validfile = os.path.join(out_dir, 'valid.h5') 28 | testfile = os.path.join(out_dir, 'test.h5') 29 | 30 | if options.orthologMode: 31 | trainfile = os.path.join(out_dir, 'train_human1to1.h5') 32 | validfile = os.path.join(out_dir, 'valid_human1to1.h5') 33 | testfile = os.path.join(out_dir, 'test_human1to1.h5') 34 | trainfile2 = os.path.join(out_dir, 'train_mouse1to1.h5') 35 | validfile2 = os.path.join(out_dir, 'valid_mouse1to1.h5') 36 | testfile2 = os.path.join(out_dir, 'test_mouse1to1.h5') 37 | 38 | if options.overwrite or not os.path.exists(out_dir): 39 | if not os.path.exists(out_dir): 40 | os.mkdir(out_dir) 41 | 42 | # load data 43 | promoters, halflifedata, labels, geneNames = preprocess(data_file, options.orthologMode) 44 | 45 | # check that the sum is valid 46 | assert(options.testCount + options.validCount <= promoters.shape[0]) 47 | 
test_count = options.testCount 48 | valid_count = options.validCount 49 | 50 | train_count = promoters.shape[0] - test_count - valid_count 51 | 52 | if options.crossVal: 53 | print('running 10-fold cross val w/ %d sequences ' % promoters.shape[0]) 54 | kf = KFold(n_splits=10, random_state=42, shuffle=False) 55 | fold = 0 56 | for train_index, test_index in kf.split(promoters): #keep aside 1000 examples of train indices for validation set 57 | fold += 1 58 | print('fold %d' % fold) 59 | h5f_train = h5py.File(os.path.join(out_dir, str(fold)+'train.h5'), 'w') 60 | h5f_valid = h5py.File(os.path.join(out_dir, str(fold)+'valid.h5'), 'w') 61 | h5f_test = h5py.File(os.path.join(out_dir, str(fold)+'test.h5'), 'w') 62 | valid_index = train_index[0:1000] 63 | train_index = train_index[1000:len(train_index)] 64 | h5f_train.create_dataset('data' , data=halflifedata[train_index,:], **compress_args) 65 | h5f_train.create_dataset('promoter', data=promoters[train_index,:], **compress_args) 66 | h5f_train.create_dataset('label' , data=labels[train_index], **compress_args) 67 | h5f_train.create_dataset('geneName' , data=np.array(geneNames)[train_index].tolist(), **compress_args) 68 | h5f_train.close() 69 | h5f_valid.create_dataset('data' , data=halflifedata[valid_index,:], **compress_args) 70 | h5f_valid.create_dataset('promoter', data=promoters[valid_index,:], **compress_args) 71 | h5f_valid.create_dataset('label' , data=labels[valid_index], **compress_args) 72 | h5f_valid.create_dataset('geneName' , data=np.array(geneNames)[valid_index].tolist(), **compress_args) 73 | h5f_valid.close() 74 | h5f_test.create_dataset('data' , data=halflifedata[test_index,:], **compress_args) 75 | h5f_test.create_dataset('promoter', data=promoters[test_index,:], **compress_args) 76 | h5f_test.create_dataset('label' , data=labels[test_index], **compress_args) 77 | h5f_test.create_dataset('geneName' , data=np.array(geneNames)[test_index].tolist(), **compress_args) 78 | h5f_test.close() 79 | else: 80 | 
print('%d training sequences ' % train_count) 81 | print('%d test sequences ' % test_count) 82 | print('%d validation sequences ' % valid_count) 83 | h5f_train = h5py.File(trainfile, 'w') 84 | h5f_valid = h5py.File(validfile, 'w') 85 | h5f_test = h5py.File(testfile, 'w') 86 | i = 0 87 | if train_count > 0: 88 | h5f_train.create_dataset('data' , data=halflifedata[i:i+train_count,:], **compress_args) 89 | h5f_train.create_dataset('promoter', data=promoters[i:i+train_count,:], **compress_args) 90 | h5f_train.create_dataset('label' , data=labels[i:i+train_count], **compress_args) 91 | h5f_train.create_dataset('geneName' , data=geneNames[i:i+train_count], **compress_args) 92 | h5f_train.close() 93 | i += train_count 94 | if valid_count > 0: 95 | h5f_valid.create_dataset('data' , data=halflifedata[i:i+valid_count,:], **compress_args) 96 | h5f_valid.create_dataset('promoter', data=promoters[i:i+valid_count,:], **compress_args) 97 | h5f_valid.create_dataset('label' , data=labels[i:i+valid_count], **compress_args) 98 | h5f_valid.create_dataset('geneName' , data=geneNames[i:i+valid_count], **compress_args) 99 | h5f_valid.close() 100 | i += valid_count 101 | if test_count > 0: 102 | h5f_test.create_dataset('data' , data=halflifedata[i:i+test_count,:], **compress_args) 103 | h5f_test.create_dataset('promoter', data=promoters[i:i+test_count,:], **compress_args) 104 | h5f_test.create_dataset('label' , data=labels[i:i+test_count], **compress_args) 105 | h5f_test.create_dataset('geneName' , data=geneNames[i:i+test_count], **compress_args) 106 | h5f_test.close() 107 | 108 | if options.orthologMode: 109 | print("Finding 1-1 orthologs...") 110 | h5f_train = h5py.File(trainfile2, 'w') 111 | h5f_valid = h5py.File(validfile2, 'w') 112 | h5f_test = h5py.File(testfile2, 'w') 113 | promoters2, halflifedata2, labels2, geneNames2 = preprocess(options.orthologMode, options.orthologMode) 114 | orthologs = pd.read_table("human2mouse_one2one_orthologs.txt", header=None) 115 | 116 | i = 0 117 | 
orthoids = orthologs[orthologs[0].isin(geneNames[i:i+train_count])][1] #transform human to mouse IDs 118 | idxs = np.where(np.isin(geneNames2,orthoids))[0].tolist() 119 | h5f_train.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 120 | h5f_train.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 121 | h5f_train.create_dataset('label' , data=labels2[idxs], **compress_args) 122 | h5f_train.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 123 | print('%d 1-1 mouse orthologs found for training set' % labels2[idxs].shape) 124 | h5f_train.close() 125 | i += train_count 126 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+valid_count])][1] 127 | idxs = np.where(np.isin(geneNames2,orthoids))[0].tolist() 128 | h5f_valid.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 129 | h5f_valid.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 130 | h5f_valid.create_dataset('label' , data=labels2[idxs], **compress_args) 131 | h5f_valid.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 132 | print('%d 1-1 mouse orthologs found validation set' % labels2[idxs].shape) 133 | h5f_valid.close() 134 | i += valid_count 135 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+test_count])][1] 136 | idxs = np.isin(geneNames2,orthoids) 137 | h5f_test.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 138 | h5f_test.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 139 | h5f_test.create_dataset('label' , data=labels2[idxs], **compress_args) 140 | h5f_test.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 141 | print('%d 1-1 mouse orthologs found for test set ' % labels2[idxs].shape) 142 | h5f_test.close() 143 | else: 144 | parser.error('Nothing done...Run with --over to overwrite') 145 | 146 | def one_hot(seq): 147 | seq_len = len(seq.item(0)) 148 | seqindex = {'A':0, 
'C':1, 'G':2, 'T':3, 'a':0, 'c':1, 'g':2, 't':3} 149 | seq_vec = np.zeros((len(seq),seq_len,4), dtype='bool') 150 | for i in range(len(seq)): 151 | thisseq = seq.item(i) 152 | for j in range(seq_len): 153 | try: 154 | seq_vec[i,j,seqindex[thisseq[j]]] = 1 155 | except: 156 | pass 157 | return seq_vec 158 | 159 | def preprocess(data_file, orthologMode): 160 | table = pd.read_table(data_file, index_col=0) 161 | maskedIDs = pd.read_table("mask_histone_genes_mm10.txt", header=None) #mask histone genes, chrY genes already filtered out 162 | maskedIDs2 = pd.read_table("mask_histone_genes.txt", header=None) #mask histone genes, chrY genes already filtered out 163 | table = table[~table.index.isin(maskedIDs[0])] #remove rows corresponding to chrY or histone sequences 164 | table = table[~table.index.isin(maskedIDs2[0])] #remove rows corresponding to chrY or histone sequences 165 | if orthologMode: 166 | orthologs = pd.read_table("1to1_orthologs_expression.txt", header=None) 167 | table = table[table.index.isin(orthologs[[0,1]].values.flatten())] #must match human or mouse 1-1 ortholog IDs 168 | table[table.columns[range(0,5)+[8]]] = np.log10(table[table.columns[range(0,5)+[8]]]+0.1) 169 | table = table.sample(table.shape[0], replace=False, random_state=1) 170 | table[table.columns[range(0,9)]] = preprocessing.scale(table[table.columns[range(0,9)]]) 171 | print("\nPre-processed data...one-hot encoding...") 172 | promoters = one_hot(table['PROMOTER'].as_matrix()) 173 | halflifedata = table[table.columns[range(1,9)]].as_matrix() 174 | labels = table['EXPRESSION'].as_matrix() 175 | geneNames = list(table.index) 176 | print("Processed data from %s" % data_file) 177 | return promoters, halflifedata, labels, geneNames 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /Fig2/57epigenomes.median_expr.txt: -------------------------------------------------------------------------------- 1 | 
../datasets/57epigenomes.median_expr.txt -------------------------------------------------------------------------------- /Fig2/Fig2A.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | setwd("subsampled_10fold/") 5 | 6 | files = list.files(path='.', pattern='trial', recursive=T) 7 | # files = files[grepl("^2000|4000|6000|8000|10000|14000|16000",files)] 8 | table <- as.data.frame(do.call("rbind", lapply(files, FUN=function(file){ 9 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 10 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 11 | tmp$sample = as.numeric(dirname(file)) 12 | tmp$rep = as.numeric(strsplit(basename(file), "_")[[1]][1]) 13 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 14 | names(tmp) = c("r2","MSE","sample","rep","trial") 15 | tmp 16 | }))) 17 | 18 | table = as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 19 | table = do.call("rbind", lapply(unique(table$sample), function(sub) { do.call("rbind", lapply(unique(table$rep), function(x) { tmp=table[table$rep==x & table$sample==sub,]; tmp[which( tmp$MSE == min(tmp$MSE) ),] } )) }) ) 20 | 21 | table = as.data.frame(aggregate(.~sample,table,function(x) c(mean=mean(x), sd=sd(x)))) 22 | table 23 | table[,2][,2]=table[,2][,2]/sqrt(10) #std err 24 | table[,3][,2]=table[,3][,2]/sqrt(10) #std err 25 | table 26 | 27 | pdf("Fig2A.pdf", height=3, width=5) 28 | obj1 = xyplot(MSE[,1] ~ sample, table, 29 | panel = function(x, y, ...){ 30 | panel.arrows(x, y, x, table[,3][,1]+1.96*table[,3][,2], length = 0, angle = 90) 31 | panel.arrows(x, y, x, table[,3][,1]-1.96*table[,3][,2], length = 0, angle = 90) 32 | panel.xyplot(x, y, ...) 
33 | }, type = "o" , ylim=c(0.4,0.5), lwd=2, scales = list(x = list(at = seq(2000,16000,2000) ))) #limits = c(-500,500) 34 | obj2 = xyplot(r2[,1] ~ sample, table, 35 | panel = function(x, y, ...){ 36 | panel.arrows(x, y, x, table[,2][,1]+1.96*table[,2][,2], length = 0, angle = 90) 37 | panel.arrows(x, y, x, table[,2][,1]-1.96*table[,2][,2], length = 0, angle = 90) 38 | panel.xyplot(x, y, ...) 39 | }, type = "o", lwd=2, ylim=c(0.5,0.6)) 40 | doubleYScale(obj1, obj2, add.ylab2 = TRUE) 41 | dev.off() 42 | -------------------------------------------------------------------------------- /Fig2/Fig2BC.R: -------------------------------------------------------------------------------- 1 | library(LSD) 2 | library(data.table) 3 | 4 | ############################# 5 | ##Species-specific analysis## 6 | ############################# 7 | 8 | pdf("Fig2B.pdf") 9 | a=read.delim("pM10Kb_1KTest/predictions.txt") 10 | actual=read.delim("57epigenomes.median_expr.txt",F) 11 | colnames(actual)=c("Gene","UnscaledExpr") 12 | actual$UnscaledExpr=log10(actual$UnscaledExpr+0.1) 13 | a=merge(a,actual,by=1) 14 | model=lm(UnscaledExpr~Actual,a) 15 | a$Pred=predict(model,newdata=data.frame(Actual=a$Pred)) 16 | a$Actual=predict(model) 17 | 18 | "Human r^2:" 19 | cor(a$Pred,a$Actual)^2 20 | heatscatter(a$Pred, a$Actual, bty='n', xlim=c(-1,3), ylim=c(-1,3), cex.axis=2, cex.lab=2, las=1) 21 | a=read.delim("pM10Kb_1KTest_Mouse/predictions.txt") 22 | actual=read.delim("mouse.median_expr.txt",F) 23 | colnames(actual)=c("Gene","UnscaledExpr") 24 | actual$UnscaledExpr=log10(actual$UnscaledExpr+0.1) 25 | a=merge(a,actual,by=1) 26 | model=lm(UnscaledExpr~Actual,a) 27 | a$Pred=predict(model,newdata=data.frame(Actual=a$Pred)) 28 | a$Actual=predict(model) 29 | dev.off() 30 | 31 | "Mouse r^2:" 32 | cor(a$Pred,a$Actual)^2 33 | pdf("Fig2C.pdf") 34 | heatscatter(a$Pred, a$Actual, bty='n', xlim=c(-1,3), ylim=c(-1,3), cex.axis=2, cex.lab=2, las=1) 35 | dev.off() 36 | 
-------------------------------------------------------------------------------- /Fig2/Fig2D.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | getbest <- function(dir){ 5 | files = list.files(path=dir, pattern='.txt', full.names=T) 6 | sbtable <- as.data.frame(do.call("rbind", lapply(files, FUN=function(file){ 7 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 8 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 9 | tmp$file = strsplit(file, "/")[[1]][2] 10 | names(tmp) = c("r2","MSE","samples") 11 | tmp 12 | }))) 13 | 14 | sbtable$samples = as.character(sbtable$samples) 15 | sbtable = data.table(sbtable) 16 | sbtable = sbtable[ , .SD[which.min(MSE)], by = samples] 17 | sbtable 18 | } 19 | 20 | a=list() 21 | a[[1]]=getbest("ortholog_results/train_human_test_human/") 22 | a[[2]]=getbest("ortholog_results/train_human_test_mouse/") 23 | a[[3]]=getbest("ortholog_results/train_mouse_test_human/") 24 | a[[4]]=getbest("ortholog_results/train_mouse_test_mouse/") 25 | 26 | a=as.data.frame(do.call("rbind", a)) 27 | a$samples=as.factor(a$samples) 28 | 29 | a 30 | pdf("Fig2D.pdf", height=4, width=5) 31 | obj1 = xyplot(MSE ~ samples, a, type = "p", pch=19, lwd=2, scales=list(x=list(rot=45))) 32 | obj2 = xyplot(r2 ~ samples, a, type = "p", pch=19, lwd=2, scales=list(x=list(rot=45))) 33 | doubleYScale(obj1, obj2, add.ylab2 = TRUE) 34 | dev.off() 35 | -------------------------------------------------------------------------------- /Fig2/Fig2E.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig2/Fig2E.pdf -------------------------------------------------------------------------------- /Fig2/Fig2EFG.R: -------------------------------------------------------------------------------- 1 | library(LSD) 2 | library(data.table) 
3 | library(ROCR) 4 | 5 | #################### 6 | ##One2One analysis## 7 | #################### 8 | a=read.delim("human2mouse_one2one_orthologs.txt",F) 9 | b=fread("zcat Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1-2",header=T,data.table=F,sep="\t") 10 | c=fread("zcat Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1-2",header=T,data.table=F,sep="\t") 11 | b$EXPRESSION = log10(b$EXPRESSION+0.1) 12 | c$EXPRESSION = log10(c$EXPRESSION+0.1) 13 | d=merge(a,c,by.x=2,by.y=1) 14 | d=merge(d,b,by.x=2,by.y=1) 15 | 16 | colnames(d)=c("hid","mid","type","mexpr","hexpr") 17 | head(d) 18 | 19 | say(nrow(d), "genes") 20 | say("Pearson corr =", cor(d$hexpr,d$mexpr)) 21 | 22 | pdf("Fig2E.pdf") 23 | par(mar=c(7,7,5,5), mgp = c(5, 1, 0)) 24 | plot.ecdf(b$EXPRESSION, xlim=c(-1,3), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="purple", 25 | ylab="Cumulative fraction", xlab="log10(Median expression level + 0.1)", col.01line = "white", lwd=2, cex.axis=2, cex.lab=2, bty="n", las=1) 26 | plot.ecdf(c$EXPRESSION, verticals= TRUE, do.points = FALSE, add = TRUE, col = "red", col.01line = "white", lwd=2, bty="n") 27 | plot.ecdf(d$hexpr, verticals= TRUE, do.points = FALSE, add = TRUE, col = "blue", col.01line = "white", lwd=2, bty="n") 28 | plot.ecdf(d$mexpr, verticals= TRUE, do.points = FALSE, add = TRUE, col = "cyan", col.01line = "white", lwd=2, bty="n") 29 | 30 | legend("bottomright", bg="white", bty="n", legend = 31 | c( paste("human (", length(b$EXPRESSION), ")") , paste("mouse (", length(c$EXPRESSION), ")"), 32 | paste("human, one-to-one orthologs (", length(d$hexpr), ")"), paste("mouse, one-to-one orthologs (", length(d$mexpr), ")")), 33 | text.col = c("purple","red","blue","cyan")) 34 | dev.off() 35 | 36 | pdf("Fig2F.pdf") 37 | heatscatter(d$hexpr, d$mexpr, xlab="Human", ylab="Mouse", bty='n', cex=0.3, xlim=c(-1,3), ylim=c(-1,3), cex.axis=2, cex.lab=2, las=1) 38 | dev.off() 39 | 40 | writefile(d,"1to1_orthologs_expression.txt", col.names=F) 41 | 42 | 
pdf("Fig2G.pdf") 43 | b=read.delim("all_crossvalidated_predictions.txt") 44 | c=read.delim("all_crossvalidated_predictions_mouse.txt") 45 | d=merge(d,b,by=1) 46 | colnames(d)[4:5]=c("mouse_expr","human_expr") 47 | colnames(d)[6:7]=c("human_pred","human_Actual") 48 | e=merge(d,c,by.x=2,by.y=1) 49 | colnames(e)[8:9]=c("mouse_pred","mouse_Actual") 50 | attach(e) 51 | head(e) 52 | 53 | e$diff = mouse_expr-human_expr 54 | f=e[abs(e$diff) > 1,] 55 | f$mouseOrHuman = ifelse(f$diff > 0, 1, 0) 56 | "human-specific" 57 | sum(f$diff < 0) 58 | "mouse-specific" 59 | sum(f$diff > 0) 60 | head(f) 61 | plot(performance( prediction( f$mouse_pred-f$human_pred, f$mouseOrHuman), "tpr", "fpr"), col="blue", las=1, cex.axis=2, cex.lab=2, bty='n') 62 | text(0.2, 1, labels = paste("AUC = ", round(performance( prediction(f$mouse_pred-f$human_pred, f$mouseOrHuman), "auc")@y.values[[1]],2), ' (n = ', nrow(f), ')', sep=''), offset = 1.5, col="black") 63 | abline(0,1,col="grey") 64 | dev.off() 65 | -------------------------------------------------------------------------------- /Fig2/Fig2F.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig2/Fig2F.pdf -------------------------------------------------------------------------------- /Fig2/Fig2G.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig2/Fig2G.pdf -------------------------------------------------------------------------------- /Fig2/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig2/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: 
-------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig2/Xpresso.py: -------------------------------------------------------------------------------- 1 | ../Fig1_S2/Xpresso.py -------------------------------------------------------------------------------- /Fig2/all_crossvalidated_predictions.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions.txt -------------------------------------------------------------------------------- /Fig2/all_crossvalidated_predictions_mouse.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_mouse.txt -------------------------------------------------------------------------------- /Fig2/ensembl2geneName_v90_mm10.txt: -------------------------------------------------------------------------------- 1 | ../datasets/ensembl2geneName_v90_mm10.txt -------------------------------------------------------------------------------- /Fig2/mouse.median_expr.txt: -------------------------------------------------------------------------------- 1 | ../datasets/mouse.median_expr.txt -------------------------------------------------------------------------------- /Fig2/ortholog_results: -------------------------------------------------------------------------------- 1 | ../datasets/ortholog_results/ -------------------------------------------------------------------------------- /Fig2/pM10Kb_1KTest: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest -------------------------------------------------------------------------------- /Fig2/pM10Kb_1KTest_Mouse: -------------------------------------------------------------------------------- 1 | 
../datasets/pM10Kb_1KTest_Mouse/ -------------------------------------------------------------------------------- /Fig2/pM10Kb_1KTest_one2oneOrthologs: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_one2oneOrthologs/ -------------------------------------------------------------------------------- /Fig2/runme.sh: -------------------------------------------------------------------------------- 1 | ### Ran from datasets/ directory on GPU 2 | mkdir subsampled_10fold 3 | for x in `seq 2000 2000 16000`; do { echo $x; python subsample.py $x pM10Kb_1KTest subsampled_10fold/$x; } done 4 | for z in `seq 2000 2000 16000`; do { for y in {0..9}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x train tpe_1K_10epochs_optimized_0to20K.hyperopt subsampled_10fold/$z/ >subsampled_10fold/$z/$y\_trial$x.txt; } done } done } done 5 | 6 | Rscript Fig2A.R 7 | Rscript Fig2BC.R 8 | 9 | #Acquired 1 to 1 ortholog predictions from Ensembl BioMart 10 | grep one2one human2mouse_orthologs.txt >human2mouse_one2one_orthologs.txt 11 | python setup_training_files.py -t 1000 -v 1000 --orthologs Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_1KTest_one2oneOrthologs 12 | 13 | mkdir ortholog_results 14 | mkdir ortholog_results/train_human_test_human 15 | mkdir ortholog_results/train_mouse_test_human 16 | mkdir ortholog_results/train_human_test_mouse 17 | mkdir ortholog_results/train_mouse_test_mouse 18 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_human_test_human/trial_$x.txt; } done & 19 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_human_test_mouse/trial_$x.txt; } done & 20 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt 
pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_mouse_test_human/trial_$x.txt; } done & 21 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_mouse_test_mouse/trial_$x.txt; } done & 22 | 23 | #Stored results from runs in ortholog_results/ 24 | Rscript Fig2D.R 25 | Rscript Fig2EFG.R 26 | -------------------------------------------------------------------------------- /Fig2/setup_training_files.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py 2 | import numpy.random as npr 3 | import numpy as np 4 | from optparse import OptionParser 5 | import pandas as pd 6 | from sklearn import preprocessing 7 | from sklearn.model_selection import KFold 8 | 9 | def main(): 10 | usage = 'usage: %prog [options] ' 11 | parser = OptionParser(usage) 12 | parser.add_option('-t', dest='testCount', default=1000, type='int', help='Number of test examples: [Default: %default]') 13 | parser.add_option('-v', dest='validCount', default=1000, type='int', help='Number of validation examples: [Default: %default]') 14 | parser.add_option('--cv', dest='crossVal', default=False, action='store_true', help='Generate samples for 10-fold cross-validated predictions? [Default: %default]') 15 | parser.add_option('--orthologs', dest='orthologMode', default=False, action='store_true', help='Mouse file to prepare human and mouse 1-1 ortholog set (Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz recommended to accompany human data_file): [Default: %default]') 16 | parser.add_option('--over', dest='overwrite', default=False, action='store_true', help='Overwrite directory? 
[Default: %default]') 17 | (options,args) = parser.parse_args() 18 | 19 | if len(args) != 2: 20 | print(args) 21 | parser.error('Must provide data file and output directory') 22 | else: 23 | data_file = args[0] 24 | out_dir = args[1] 25 | compress_args = {'compression': 'gzip', 'compression_opts': 1} 26 | trainfile = os.path.join(out_dir, 'train.h5') 27 | validfile = os.path.join(out_dir, 'valid.h5') 28 | testfile = os.path.join(out_dir, 'test.h5') 29 | 30 | if options.orthologMode: 31 | trainfile = os.path.join(out_dir, 'train_human1to1.h5') 32 | validfile = os.path.join(out_dir, 'valid_human1to1.h5') 33 | testfile = os.path.join(out_dir, 'test_human1to1.h5') 34 | trainfile2 = os.path.join(out_dir, 'train_mouse1to1.h5') 35 | validfile2 = os.path.join(out_dir, 'valid_mouse1to1.h5') 36 | testfile2 = os.path.join(out_dir, 'test_mouse1to1.h5') 37 | 38 | if options.overwrite or not os.path.exists(out_dir): 39 | if not os.path.exists(out_dir): 40 | os.mkdir(out_dir) 41 | 42 | # load data 43 | promoters, halflifedata, labels, geneNames = preprocess(data_file, options.orthologMode) 44 | 45 | # check that the sum is valid 46 | assert(options.testCount + options.validCount <= promoters.shape[0]) 47 | test_count = options.testCount 48 | valid_count = options.validCount 49 | 50 | train_count = promoters.shape[0] - test_count - valid_count 51 | 52 | if options.crossVal: 53 | print('running 10-fold cross val w/ %d sequences ' % promoters.shape[0]) 54 | kf = KFold(n_splits=10, random_state=42, shuffle=False) 55 | fold = 0 56 | for train_index, test_index in kf.split(promoters): #keep aside 1000 examples of train indices for validation set 57 | fold += 1 58 | print('fold %d' % fold) 59 | h5f_train = h5py.File(os.path.join(out_dir, str(fold)+'train.h5'), 'w') 60 | h5f_valid = h5py.File(os.path.join(out_dir, str(fold)+'valid.h5'), 'w') 61 | h5f_test = h5py.File(os.path.join(out_dir, str(fold)+'test.h5'), 'w') 62 | valid_index = train_index[0:1000] 63 | train_index = 
train_index[1000:len(train_index)] 64 | h5f_train.create_dataset('data' , data=halflifedata[train_index,:], **compress_args) 65 | h5f_train.create_dataset('promoter', data=promoters[train_index,:], **compress_args) 66 | h5f_train.create_dataset('label' , data=labels[train_index], **compress_args) 67 | h5f_train.create_dataset('geneName' , data=np.array(geneNames)[train_index].tolist(), **compress_args) 68 | h5f_train.close() 69 | h5f_valid.create_dataset('data' , data=halflifedata[valid_index,:], **compress_args) 70 | h5f_valid.create_dataset('promoter', data=promoters[valid_index,:], **compress_args) 71 | h5f_valid.create_dataset('label' , data=labels[valid_index], **compress_args) 72 | h5f_valid.create_dataset('geneName' , data=np.array(geneNames)[valid_index].tolist(), **compress_args) 73 | h5f_valid.close() 74 | h5f_test.create_dataset('data' , data=halflifedata[test_index,:], **compress_args) 75 | h5f_test.create_dataset('promoter', data=promoters[test_index,:], **compress_args) 76 | h5f_test.create_dataset('label' , data=labels[test_index], **compress_args) 77 | h5f_test.create_dataset('geneName' , data=np.array(geneNames)[test_index].tolist(), **compress_args) 78 | h5f_test.close() 79 | else: 80 | print('%d training sequences ' % train_count) 81 | print('%d test sequences ' % test_count) 82 | print('%d validation sequences ' % valid_count) 83 | h5f_train = h5py.File(trainfile, 'w') 84 | h5f_valid = h5py.File(validfile, 'w') 85 | h5f_test = h5py.File(testfile, 'w') 86 | i = 0 87 | if train_count > 0: 88 | h5f_train.create_dataset('data' , data=halflifedata[i:i+train_count,:], **compress_args) 89 | h5f_train.create_dataset('promoter', data=promoters[i:i+train_count,:], **compress_args) 90 | h5f_train.create_dataset('label' , data=labels[i:i+train_count], **compress_args) 91 | h5f_train.create_dataset('geneName' , data=geneNames[i:i+train_count], **compress_args) 92 | h5f_train.close() 93 | i += train_count 94 | if valid_count > 0: 95 | 
h5f_valid.create_dataset('data' , data=halflifedata[i:i+valid_count,:], **compress_args) 96 | h5f_valid.create_dataset('promoter', data=promoters[i:i+valid_count,:], **compress_args) 97 | h5f_valid.create_dataset('label' , data=labels[i:i+valid_count], **compress_args) 98 | h5f_valid.create_dataset('geneName' , data=geneNames[i:i+valid_count], **compress_args) 99 | h5f_valid.close() 100 | i += valid_count 101 | if test_count > 0: 102 | h5f_test.create_dataset('data' , data=halflifedata[i:i+test_count,:], **compress_args) 103 | h5f_test.create_dataset('promoter', data=promoters[i:i+test_count,:], **compress_args) 104 | h5f_test.create_dataset('label' , data=labels[i:i+test_count], **compress_args) 105 | h5f_test.create_dataset('geneName' , data=geneNames[i:i+test_count], **compress_args) 106 | h5f_test.close() 107 | 108 | if options.orthologMode: 109 | print("Finding 1-1 orthologs...") 110 | h5f_train = h5py.File(trainfile2, 'w') 111 | h5f_valid = h5py.File(validfile2, 'w') 112 | h5f_test = h5py.File(testfile2, 'w') 113 | promoters2, halflifedata2, labels2, geneNames2 = preprocess(options.orthologMode, options.orthologMode) 114 | orthologs = pd.read_table("human2mouse_one2one_orthologs.txt", header=None) 115 | 116 | i = 0 117 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+train_count])][1] #transform human to mouse IDs 118 | idxs = np.where(np.isin(geneNames2,orthoids))[0].tolist() 119 | h5f_train.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 120 | h5f_train.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 121 | h5f_train.create_dataset('label' , data=labels2[idxs], **compress_args) 122 | h5f_train.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 123 | print('%d 1-1 mouse orthologs found for training set' % labels2[idxs].shape) 124 | h5f_train.close() 125 | i += train_count 126 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+valid_count])][1] 127 | idxs = 
np.where(np.isin(geneNames2,orthoids))[0].tolist() 128 | h5f_valid.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 129 | h5f_valid.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 130 | h5f_valid.create_dataset('label' , data=labels2[idxs], **compress_args) 131 | h5f_valid.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 132 | print('%d 1-1 mouse orthologs found validation set' % labels2[idxs].shape) 133 | h5f_valid.close() 134 | i += valid_count 135 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+test_count])][1] 136 | idxs = np.isin(geneNames2,orthoids) 137 | h5f_test.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 138 | h5f_test.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 139 | h5f_test.create_dataset('label' , data=labels2[idxs], **compress_args) 140 | h5f_test.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 141 | print('%d 1-1 mouse orthologs found for test set ' % labels2[idxs].shape) 142 | h5f_test.close() 143 | else: 144 | parser.error('Nothing done...Run with --over to overwrite') 145 | 146 | def one_hot(seq): 147 | seq_len = len(seq.item(0)) 148 | seqindex = {'A':0, 'C':1, 'G':2, 'T':3, 'a':0, 'c':1, 'g':2, 't':3} 149 | seq_vec = np.zeros((len(seq),seq_len,4), dtype='bool') 150 | for i in range(len(seq)): 151 | thisseq = seq.item(i) 152 | for j in range(seq_len): 153 | try: 154 | seq_vec[i,j,seqindex[thisseq[j]]] = 1 155 | except: 156 | pass 157 | return seq_vec 158 | 159 | def preprocess(data_file, orthologMode): 160 | table = pd.read_table(data_file, index_col=0) 161 | maskedIDs = pd.read_table("mask_histone_genes_mm10.txt", header=None) #mask histone genes, chrY genes already filtered out 162 | maskedIDs2 = pd.read_table("mask_histone_genes.txt", header=None) #mask histone genes, chrY genes already filtered out 163 | table = table[~table.index.isin(maskedIDs[0])] #remove rows 
corresponding to chrY or histone sequences 164 | table = table[~table.index.isin(maskedIDs2[0])] #remove rows corresponding to chrY or histone sequences 165 | if orthologMode: 166 | orthologs = pd.read_table("1to1_orthologs_expression.txt", header=None) 167 | table = table[table.index.isin(orthologs[[0,1]].values.flatten())] #must match human or mouse 1-1 ortholog IDs 168 | table[table.columns[range(0,5)+[8]]] = np.log10(table[table.columns[range(0,5)+[8]]]+0.1) 169 | table = table.sample(table.shape[0], replace=False, random_state=1) 170 | table[table.columns[range(0,9)]] = preprocessing.scale(table[table.columns[range(0,9)]]) 171 | print("\nPre-processed data...one-hot encoding...") 172 | promoters = one_hot(table['PROMOTER'].as_matrix()) 173 | halflifedata = table[table.columns[range(1,9)]].as_matrix() 174 | labels = table['EXPRESSION'].as_matrix() 175 | geneNames = list(table.index) 176 | print("Processed data from %s" % data_file) 177 | return promoters, halflifedata, labels, geneNames 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /Fig2/subsample.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py 2 | from numpy.random import choice 3 | import numpy as np 4 | from optparse import OptionParser 5 | 6 | def main(): 7 | usage = 'usage: %prog [options] ' 8 | parser = OptionParser(usage) 9 | (options,args) = parser.parse_args() 10 | if len(args) != 3: 11 | print(args) 12 | sys.exit('Must provide number of subsamples, input directory, and output directory') 13 | counts, in_dir, out_dir = args 14 | counts = int(counts) 15 | if not os.path.exists(out_dir): 16 | os.mkdir(out_dir) 17 | file = h5py.File(os.path.join(in_dir, 'train.h5'), 'r') 18 | X_halflife, X_promoter, y, geneName = file['data'], file['promoter'], file['label'], file['geneName'] 19 | for i in range(10): 20 | print('sample %d' % i) 21 | 
os.symlink(os.path.join(os.path.realpath(in_dir), 'valid.h5'), os.path.join(out_dir, str(i)+'valid.h5')) 22 | os.symlink(os.path.join(os.path.realpath(in_dir), 'test.h5'), os.path.join(out_dir, str(i)+'test.h5')) 23 | h5f = h5py.File(os.path.join(out_dir, str(i)+'train.h5'), 'w') 24 | index = np.sort(choice(X_promoter.shape[0], size=counts, replace=False)) 25 | compress_args = {'compression': 'gzip', 'compression_opts': 1} 26 | h5f.create_dataset('data' , data=X_halflife[index,:], **compress_args) 27 | h5f.create_dataset('promoter', data=X_promoter[index,:,:], **compress_args) 28 | h5f.create_dataset('label' , data=y[index.tolist()], **compress_args) 29 | h5f.create_dataset('geneName', data=geneName[index.tolist()], **compress_args) 30 | h5f.close() 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /Fig2/subsampled_10fold: -------------------------------------------------------------------------------- 1 | ../datasets/subsampled_10fold/ -------------------------------------------------------------------------------- /Fig2/subsampling_10fold.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | setwd("subsampled_10fold/") 5 | 6 | files = list.files(path='.', pattern='trial', recursive=T) 7 | # files = files[grepl("^2000|4000|6000|8000|10000|14000|16000",files)] 8 | table <- as.data.frame(do.call("rbind", lapply(files, FUN=function(file){ 9 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 10 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 11 | tmp$sample = as.numeric(dirname(file)) 12 | tmp$rep = as.numeric(strsplit(basename(file), "_")[[1]][1]) 13 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 14 | names(tmp) = c("r2","MSE","sample","rep","trial") 15 | tmp 16 | }))) 17 | 18 | table = 
as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 19 | table = do.call("rbind", lapply(unique(table$sample), function(sub) { do.call("rbind", lapply(unique(table$rep), function(x) { tmp=table[table$rep==x & table$sample==sub,]; tmp[which( tmp$MSE == min(tmp$MSE) ),] } )) }) ) 20 | 21 | table = as.data.frame(aggregate(.~sample,table,function(x) c(mean=mean(x), sd=sd(x)))) 22 | table 23 | table[,2][,2]=table[,2][,2]/sqrt(10) #std err 24 | table[,3][,2]=table[,3][,2]/sqrt(10) #std err 25 | table 26 | 27 | pdf("subsample.pdf", height=3, width=5) 28 | obj1 = xyplot(MSE[,1] ~ sample, table, 29 | panel = function(x, y, ...){ 30 | panel.arrows(x, y, x, table[,3][,1]+1.96*table[,3][,2], length = 0, angle = 90) 31 | panel.arrows(x, y, x, table[,3][,1]-1.96*table[,3][,2], length = 0, angle = 90) 32 | panel.xyplot(x, y, ...) 33 | }, type = "o" , ylim=c(0.4,0.5), lwd=2, scales = list(x = list(at = seq(2000,16000,2000) ))) #limits = c(-500,500) 34 | obj2 = xyplot(r2[,1] ~ sample, table, 35 | panel = function(x, y, ...){ 36 | panel.arrows(x, y, x, table[,2][,1]+1.96*table[,2][,2], length = 0, angle = 90) 37 | panel.arrows(x, y, x, table[,2][,1]-1.96*table[,2][,2], length = 0, angle = 90) 38 | panel.xyplot(x, y, ...) 
39 | }, type = "o", lwd=2, ylim=c(0.5,0.6)) 40 | doubleYScale(obj1, obj2, add.ylab2 = TRUE) 41 | dev.off() 42 | -------------------------------------------------------------------------------- /Fig2/tpe_1K_10epochs_optimized_0to20K.hyperopt: -------------------------------------------------------------------------------- 1 | ../Fig1_S2/tpe_1K_10epochs_optimized_0to20K.hyperopt -------------------------------------------------------------------------------- /Fig3_S3/57epigenomes.RPKM.pc.gz: -------------------------------------------------------------------------------- 1 | ../datasets/57epigenomes.RPKM.pc.gz -------------------------------------------------------------------------------- /Fig3_S3/Boyer_et_al_PCG_repressed.txt: -------------------------------------------------------------------------------- 1 | 1200009O22Rik 2 | 1300014I06Rik 3 | 1810031K17Rik 4 | 2010001J22Rik 5 | 2310045A20Rik 6 | 2410080H04Rik 7 | 2610017I09Rik 8 | 2610024A01Rik 9 | 2810451A06Rik 10 | 2900005J15Rik 11 | 4631426J05Rik 12 | 4930447C04Rik 13 | 4930577M16Rik 14 | 4933407N01Rik 15 | 4933408F15 16 | 4933436C20Rik 17 | 5330439J01Rik 18 | 5730446D14Rik 19 | 5730467H21Rik 20 | 5730557B15Rik 21 | 5730596B20Rik 22 | 6030405A18 23 | 6330514A18Rik 24 | 6332401O19Rik 25 | 6530402A20 26 | 9030611O19Rik 27 | 9030612E09Rik 28 | 9030623N16Rik 29 | 9130213B05Rik 30 | 9430022A14 31 | 9430023B20Rik 32 | 9430076C15Rik 33 | 9530027K23Rik 34 | 9930014A18Rik 35 | A330008L17Rik 36 | A930041G11Rik 37 | AB041550 38 | AI314180 39 | AI851790 40 | AW125753 41 | Adamts1 42 | Adamts5 43 | Adcy8 44 | Adm 45 | Adra1a 46 | Adra1d 47 | Adra2b 48 | Adrb1 49 | Aldh1a2 50 | Alx3 51 | Ar 52 | Arg2 53 | Arhgap20 54 | Asb3 55 | Ascl1 56 | Atf3 57 | Atoh1 58 | Atoh8 59 | Avpr1a 60 | BC014699 61 | BC038286 62 | BC055811 63 | BC061194 64 | Bach2 65 | Barx1 66 | Barx2 67 | Bcan 68 | Bcl11a 69 | Bcl2l11 70 | Bhlhb3 71 | Bhlhb4 72 | Bmp4 73 | Bmp6 74 | Bmp7 75 | Car7 76 | Cart1 77 | Cav1 78 | Cbln2 79 | Cbx4 80 | Cbx8 81 | 
Ccbe1 82 | Ccnd2 83 | Ccr9 84 | Cd14 85 | Cd24a 86 | Cdh11 87 | Cdh13 88 | Cdh2 89 | Cdh22 90 | Cdh4 91 | Cdh8 92 | Cdk5r2 93 | Cdkn2c 94 | Cdx2 95 | Cebpa 96 | Chst2 97 | Chst8 98 | Chx10 99 | Clstn1 100 | Cnih3 101 | Cnr1 102 | Cntfr 103 | Cntnap1 104 | Col12a1 105 | Col19a1 106 | Col27a1 107 | Col2a1 108 | Col4a1 109 | Col4a2 110 | Colec12 111 | Comp 112 | Corin 113 | Cpne7 114 | Crabp1 115 | Creb1 116 | Crhr1 117 | Cxcl12 118 | Cxcr4 119 | Cxxc4 120 | Cyp1b1 121 | Cyp24a1 122 | Cyp26a1 123 | Cyp27b1 124 | D13Bwg1146e 125 | D230002A01Rik 126 | D230039L06Rik 127 | D330050I23Rik 128 | D9Ucla1 129 | Dach1 130 | Dbx1 131 | Dgat2 132 | Dio3 133 | Dkk2 134 | Dll1 135 | Dll4 136 | Dlx1 137 | Dlx2 138 | Dlx3 139 | Dmrt2 140 | Dmrt3 141 | Dmrta1 142 | Dmrta2 143 | Dpf3 144 | E130018O15Rik 145 | E130112H22Rik 146 | E130306M17Rik 147 | E130307J07Rik 148 | E130309B19Rik 149 | E330016A19Rik 150 | Ebf1 151 | Ebf2 152 | Ebf3 153 | Efhd1 154 | Efna5 155 | Efnb2 156 | Egr3 157 | En1 158 | En2 159 | Eomes 160 | Epas1 161 | Epha5 162 | Evx1 163 | Evx2 164 | Fbn1 165 | Fbn2 166 | Fev 167 | Fgf15 168 | Fgf3 169 | Fgf5 170 | Fgf8 171 | Fgf9 172 | Fgfr3 173 | Fli1 174 | Flrt2 175 | Flt1 176 | Flt3 177 | Flt4 178 | Foxb2 179 | Foxc1 180 | Foxc2 181 | Foxd1 182 | Foxd4 183 | Foxe1 184 | Foxf1a 185 | Foxf2 186 | Foxl2 187 | Foxq1 188 | Fras1 189 | Frzb 190 | Fzd1 191 | Fzd2 192 | Gab1 193 | Gabra1 194 | Gad2 195 | Gadd45g 196 | Galgt2 197 | Gata3 198 | Gata4 199 | Gata5 200 | Gata6 201 | Gbx2 202 | Gdf6 203 | Gdf7 204 | Gdnf 205 | Gfra1 206 | Gfra2 207 | Ghsr 208 | Gm644 209 | Gm996 210 | Gnal 211 | Gpr120 212 | Gpr124 213 | Grid1 214 | Grik2 215 | Grin2a 216 | Gsc 217 | Gscl 218 | Gsh1 219 | Gsh2 220 | H2-Q1 221 | H2-Q10 222 | H2-Q7 223 | H2-Q8 224 | Hand1 225 | Hapln4 226 | Helt 227 | Hes5 228 | Hes7 229 | Hey2 230 | Hlx1 231 | Hlxb9 232 | Hmga2 233 | Hmx1 234 | Hoxa1 235 | Hoxa10 236 | Hoxa11 237 | Hoxa2 238 | Hoxa3 239 | Hoxa4 240 | Hoxa5 241 | Hoxa6 242 | Hoxb1 243 | Hoxb13 244 | 
Hoxb2 245 | Hoxb3 246 | Hoxb4 247 | Hoxb6 248 | Hoxb7 249 | Hoxb8 250 | Hoxc10 251 | Hoxc12 252 | Hoxc4 253 | Hoxc5 254 | Hoxc6 255 | Hoxc9 256 | Hoxd10 257 | Hoxd11 258 | Hoxd13 259 | Hoxd9 260 | Hpcal4 261 | Hs3st1 262 | Hs3st3b1 263 | Hsf4 264 | Hspa1a 265 | Hspa1b 266 | Hspa1l 267 | Htr1a 268 | Htr1b 269 | Htr6 270 | Id3 271 | Igf2 272 | Igfbp5 273 | Il15ra 274 | Insm1 275 | Ipf1 276 | Irf2 277 | Irf5 278 | Irs4 279 | Irx1 280 | Irx2 281 | Irx3 282 | Irx5 283 | Isl2 284 | Jun 285 | Kcna1 286 | Kcna6 287 | Kcnc4 288 | Kcnk12 289 | Kirrel3 290 | LOC385769 291 | LOC386518 292 | LOC432662 293 | LOC432907 294 | LOC434573 295 | LOC436493 296 | Lbxcor1 297 | Lef1 298 | Lhx2 299 | Lhx5 300 | Lhx6 301 | Lmbrd1 302 | Lmx1a 303 | Lrat 304 | Lrba 305 | Lrfn5 306 | Lrp8 307 | Lrrn1 308 | Lrrtm1 309 | MGC68323 310 | MGI:1920501 311 | MGI:1930803 312 | MGI:2143217 313 | MGI:2183445 314 | MGI:2669849 315 | MGI:2684334 316 | Mab21l1 317 | Mab21l2 318 | Mafa 319 | Mafb 320 | Mamdc1 321 | Meis1 322 | Mfsd2 323 | Mrg1 324 | Msx1 325 | Msx2 326 | Myc 327 | Nebl 328 | Nef3 329 | Nefl 330 | Neto1 331 | Neurod2 332 | Neurog1 333 | Neurog2 334 | Neurog3 335 | Nfatc1 336 | Nfix 337 | Nhlh2 338 | Nkx2-2 339 | Nkx2-3 340 | Nkx2-4 341 | Nkx2-5 342 | Nkx2-9 343 | Nkx6-1 344 | Nol4 345 | Npas2 346 | Npnt 347 | Nptx1 348 | Nr2e1 349 | Nr2f2 350 | Nrn1 351 | Nrp1 352 | Nrp2 353 | Ntf3 354 | Ntn1 355 | Ntrk3 356 | Ocln 357 | Olig2 358 | Olig3 359 | Onecut1 360 | Onecut2 361 | Onecut3 362 | Osr1 363 | Osr2 364 | Otp 365 | Otx1 366 | Otx3 367 | Ovol1 368 | Paqr9 369 | Pax1 370 | Pax2 371 | Pax3 372 | Pax6 373 | Pax7 374 | Pax8 375 | Pax9 376 | Pcdh1 377 | Pcdh10 378 | Pcdh18 379 | Pcdh7 380 | Pcdh8 381 | Pcdhga10 382 | Pcdhga11 383 | Pcdhga12 384 | Pcdhga3 385 | Pcdhga8 386 | Pcdhga9 387 | Pcdhgb5 388 | Pcdhgb6 389 | Pcdhgb7 390 | Pcdhgc3 391 | Pcdhgc4 392 | Pcdhgc5 393 | Pdgfra 394 | Pfdn4 395 | Pftk1 396 | Phlda2 397 | Phox2b 398 | Pitx1 399 | Pitx3 400 | Pkp1 401 | Plxnc1 402 | Podxl 403 | 
Pou3f2 404 | Pou3f3 405 | Pou3f4 406 | Pou4f2 407 | Pou4f3 408 | Ppm1l 409 | Ppp1r14c 410 | Prdm8 411 | Prkag2 412 | Ptger4 413 | Ptprt 414 | Ptpru 415 | Rab20 416 | Rasgrf1 417 | Rax 418 | Reln 419 | Rem1 420 | Rgl3 421 | Rgs20 422 | Rnf150 423 | Rtn4rl1 424 | Rtn4rl2 425 | Ryr2 426 | Sca1 427 | Scarf2 428 | Sdccag33 429 | Sema5b 430 | Sema6a 431 | Sema6d 432 | Serpine2 433 | Sez6l 434 | Sfrp5 435 | Shc3 436 | Shh 437 | Shox2 438 | Sidt1 439 | Sim1 440 | Six1 441 | Six2 442 | Six3 443 | Six6 444 | Slc16a2 445 | Slc30a3 446 | Slc32a1 447 | Slit2 448 | Slitrk3 449 | Smarca2 450 | Sox18 451 | Sox21 452 | Sox7 453 | Sox9 454 | Spon1 455 | Srd5a2 456 | Sstr1 457 | Sstr4 458 | St8sia3 459 | Stxbp6 460 | Svep1 461 | Tacstd2 462 | Tal1 463 | Tbr1 464 | Tbx15 465 | Tbx18 466 | Tbx2 467 | Tbx20 468 | Tbx4 469 | Tbx5 470 | Tcf21 471 | Tcfap2b 472 | Tcfap2d 473 | Tcfcp2l3 474 | Tdrd6 475 | Thbd 476 | Tiam2 477 | Tlx1 478 | Tlx3 479 | Tmeff2 480 | Tox 481 | Trhde 482 | Trp73 483 | Twist2 484 | Ube2j1 485 | Ucn 486 | Unc5c 487 | Unc5d 488 | Uncx4.1 489 | Vax1 490 | Vax2 491 | Vgll2 492 | Vsx1 493 | Wbscr17 494 | Wdr8 495 | Wnt1 496 | Wnt16 497 | Wnt2b 498 | Wnt3 499 | Wnt5a 500 | Wt1 501 | Zar1 502 | Zc3hav1 503 | Zfhx1b 504 | Zfp312 505 | Zfp339 506 | Zfp467 507 | Zfp503 508 | Zfpm1 509 | Zfpm2 510 | Zic1 511 | Zic4 512 | Znrf4 513 | -------------------------------------------------------------------------------- /Fig3_S3/EnsemblID2GeneName.txt: -------------------------------------------------------------------------------- 1 | ../datasets/EnsemblID2GeneName.txt -------------------------------------------------------------------------------- /Fig3_S3/Fig3ABCDEF_S3ABC.R: -------------------------------------------------------------------------------- 1 | #### MOUSE ###### 2 | 3 | a=read.delim("all_crossvalidated_predictions_mESC.txt") 4 | b=read.delim("all_crossvalidated_predictions_mouse.txt") 5 | colnames(a)[2:3]=c("mESCPred","mESCActual") 6 | a=merge(a,b,by=1) 7 | 
c=read.delim("ensembl2geneName_v90_mm10.txt") 8 | colnames(c)[2]="geneName" 9 | a=merge(a,c,by=1,all.x=T) 10 | a[a=='']="NA" 11 | 12 | summary(lm(mESCActual~mESCPred, a)) 13 | summary(lm(mESCActual~Pred, a)) 14 | 15 | # Table1 Moorthy et al, 2017 16 | a$color='black' 17 | a$color[a$geneName=="Sall1"]='red' 18 | a$color[a$geneName=="Tet1"]='red' 19 | a$color[a$geneName=="Prkcg"]='red' 20 | a$color[a$geneName=="AU018091"]='red' 21 | a$color[a$geneName=="Med13l"]='red' 22 | a$color[a$geneName=="Macf1"]='red' 23 | a$color[a$geneName=="Ranbp17"]='red' 24 | a$color[a$geneName=="Cbfa2t2"]='red' 25 | a$color[a$geneName=="Esrrb"]='red' 26 | a$color[a$geneName=="Dppa5a"]='red' 27 | a$color[a$geneName=="Ooep"]='red' 28 | a$color[a$geneName=="Mcl1"]='red' 29 | a$color[a$geneName=="Etl4"]='red' 30 | 31 | # A few genes from Whyte et al, 2013; Dowen et al, 2014; Hnisz et al 2013 32 | a$color[a$geneName=="Pou5f1"]='red' #same as Oct4 33 | a$color[a$geneName=="Sox2"]='red' 34 | a$color[a$geneName=="Nanog"]='red' 35 | a$color[a$geneName=="Klf4"]='red' 36 | a$color[a$geneName=="Tbx3"]='red' 37 | a$color[a$geneName=="Sall4"]='red' 38 | a$color[a$geneName=="Lefty1"]='red' 39 | a$color[a$geneName=="Lefty2"]='red' 40 | a$color[a$geneName=="Utf1"]='red' 41 | a$color[a$geneName=="Phc1"]='red' 42 | a$color[a$geneName=="Nr5a2"]='red' 43 | a$color[a$geneName=="Lrrc2"]='red' 44 | a$color[a$geneName=="Dppa3"]='red' 45 | a$color[a$geneName=="Prdm14"]='red' 46 | 47 | a$PredAdj=predict(lm(mESCActual~mESCPred, a)) 48 | a$MedianPred=predict(lm(mESCActual~Pred, a)) 49 | a$resid = a$mESCActual-a$PredAdj 50 | 51 | nrow(a) 52 | 53 | pdf("Fig3DEF_S3C.pdf", width=8, height=8) 54 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 55 | 56 | #Fig3D 57 | smoothScatter(a$mESCPred, a$mESCActual, cex.axis=2, cex.lab=2, bty="n", xlab="Predicted Expression Level, mESC model", ylab="mESC expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) #, pch=1, col = a$color 58 | abline(0,1, col="red") 59 | 
text(a[a$color=="red","mESCPred"], a[a$color=="red","mESCActual"], labels = a[a$color=="red","geneName"], offset = 0.5, col="red") 60 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$mESCPred, a$mESCActual)^2,2)), offset = 0.5, col="black") 61 | 62 | #FigS3C 63 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-4,4), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black") 64 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 65 | legend("topleft", bg="white", bty="n", legend = c(paste("non-enhancer-driven genes, n = ", length(a$resid[a$color=='black']), sep=''), 66 | paste("enhancer-driven genes, n = ", length(a$resid[a$color=='red']), sep=''), 67 | paste("P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), text.col = c("black","red", "black")) 68 | 69 | #Fig3E 70 | silenced = read.delim("Boyer_et_al_PCG_repressed.txt",F) #from Boyer et al 71 | silenced = merge(silenced, c, by.x=1, by.y=2) 72 | nrow(silenced) 73 | active = read.delim("Whyte_et_al_superenhancers.txt",F) #from Whyte et al 74 | active = merge(active, c, by.x=1, by.y=2) 75 | nrow(active) 76 | a$color='black' 77 | a$color[a$Gene %in% silenced[,2] & !(a$Gene %in% active[,2])]='blue' 78 | a$color[a$Gene %in% active[,2] & !(a$Gene %in% silenced[,2])]='red' 79 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-2,2), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black", main="mESC") 80 | plot.ecdf(a$resid[a$color=='blue'], verticals= TRUE, do.points = FALSE, col="blue", add=T) 81 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 82 | legend("topleft", bg="white", bty="n", legend = c(paste("Other genes, n = ", length(a$resid[a$color=='black']), sep=''), 83 | paste("PCG-silenced genes, n = ", length(a$resid[a$color=='blue']), sep=''), 84 | paste("Super-enhancer-associated genes, n = ", length(a$resid[a$color=='red']), sep=''), 85 | 
paste("PCG-silenced vs Black P value: ", formatC(ks.test(a$resid[a$color=='blue'],a$resid[a$color=='black'],alternative="greater")$p.value, digits = 2, format = 'g'), sep=''), 86 | paste("Super-enhancer vs Black P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), 87 | text.col = c("black","blue","red","black")) 88 | 89 | #Fig3F 90 | halflife = read.delim("Herzog_mESC_half_life.txt") #from Herzog et al 91 | halflife = halflife[,c(4,7)] 92 | colnames(halflife)[2]='half_life' 93 | halflife$half_life=log2(halflife$half_life) 94 | halflife = merge(halflife, c, by.x=1, by.y=2) 95 | a1=merge(halflife, a, by.x=3, by.y=1) 96 | "mESC half lives measured for this many genes:" 97 | nrow(a1) 98 | a1$quintile <- cut(a1$half_life, breaks=quantile(a1$half_life, probs=seq(0,1, by=0.2), na.rm=TRUE), include.lowest=TRUE) 99 | boxplot(a1$resid~a1$quintile,outline=F, cex=1.5, cex.axis=2, cex.lab=2, cex.main=2, las=2, notch=T, col="red") 100 | cor.test(a1$half_life, a1$resid) 101 | cor.test(a1$half_life, a1$resid, method='spearman') 102 | 103 | mouse = a 104 | 105 | a=read.delim("all_crossvalidated_predictions_mESC.txt",stringsAsFactors=F) 106 | colnames(a)[2:3]=c("mESCPred","mESCActual") 107 | b=read.delim("Ouyang_mESC_RPKM_ensemblID.txt",F) 108 | b$V2=log10(b$V2+0.1) 109 | a=merge(a,b,by=1,all.x=T) 110 | a$mESCActual=a$V2 111 | a$V2=NULL 112 | b=read.delim("all_crossvalidated_predictions_mouse.txt",stringsAsFactors=F) 113 | colnames(b)[2:3]=c("Pred","Actual") 114 | c=read.delim("mouse.median_expr.txt",F) 115 | c$V2=log10(c$V2+0.1) 116 | b=merge(b,c,by=1) 117 | b$Actual=b$V2 118 | b$V2=NULL 119 | a=merge(a,b,by=1,all=T) 120 | c=read.delim("ensembl2geneName_v90_mm10.txt",F,stringsAsFactors=F) 121 | c=c[,c(1,2,4)] 122 | colnames(c)[2:3]=c("geneName","Description") 123 | a=merge(a,c,by=1,all.x=T) 124 | a[a=='']=NA 125 | writefile(cbind(a$Gene, a$geneName, a$Description, round(a$Pred,3), 
round(a$Actual,3), round(a$mESCPred,3), round(a$mESCActual,3)), "TableS1_mouse.txt", col.names=T) 126 | 127 | ### HUMAN ##### 128 | 129 | a=read.delim(gzfile("57epigenomes.RPKM.pc.gz")) 130 | a$E000=NULL 131 | a[,2:ncol(a)]=log10(a[,2:ncol(a)]+0.1) 132 | a$medianExpr=apply(a[,2:ncol(a)], 1, median) 133 | 134 | nrow(a) 135 | b=read.delim("all_crossvalidated_predictions.txt") 136 | a=merge(a,b,by=1) 137 | b=read.delim("all_crossvalidated_predictions_K562.txt") 138 | colnames(b)[2:3]=c("K562Pred","K562Actual") 139 | a=merge(a,b,by=1) 140 | b=read.delim("all_crossvalidated_predictions_GM12878.txt") 141 | colnames(b)[2:3]=c("GM12878Pred","GM12878Actual") 142 | a=merge(a,b,by=1) 143 | nrow(a) 144 | 145 | c=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal",F) #from van Aresbergen et al 146 | d=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal",F) 147 | c=rbind(c,d) 148 | c$V6=log10(c$V6+0.1) 149 | c=c[,c(1,6)] 150 | colnames(c)[2]='SuRE' 151 | a=merge(a,c,by=1,all.x=T) 152 | 153 | c=read.delim("EnsemblID2GeneName.txt",F) 154 | colnames(c)[2:3]=c("geneName","Description") 155 | a=merge(a,c,by=1) 156 | a[a=='']="NA" 157 | 158 | pdf("Fig3ABC_S3ABC.pdf", width=8, height=8) 159 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 160 | 161 | a$resid = a$GM12878Actual-a$GM12878Pred 162 | 163 | #FigS3B 164 | silenced = read.delim("diHMM/GM12878/H3K27me3_silenced.txt",F) #from Marco et al 165 | nrow(silenced) 166 | active = read.delim("diHMM/GM12878/superenhancer.txt",F) 167 | nrow(active) 168 | a$color='black' 169 | a$color[a$gene_id %in% silenced[,1] & !(a$gene_id %in% active[,1])]='blue' 170 | a$color[a$gene_id %in% active[,1] & !(a$gene_id %in% silenced[,1])]='red' 171 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-2,2), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black", main="GM12878") 172 | plot.ecdf(a$resid[a$color=='blue'], verticals= TRUE, do.points = FALSE, col="blue", add=T) 173 | 
plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 174 | legend("topleft", bg="white", bty="n", legend = c(paste("Other genes, n = ", length(a$resid[a$color=='black']), sep=''), 175 | paste("Silenced genes, n = ", length(a$resid[a$color=='blue']), sep=''), 176 | paste("Stretch-enhancer-associated genes, n = ", length(a$resid[a$color=='red']), sep=''), 177 | paste("Silenced vs Black P value: ", formatC(ks.test(a$resid[a$color=='blue'],a$resid[a$color=='black'],alternative="greater")$p.value, digits = 2, format = 'g'), sep=''), 178 | paste("Enhancer vs Black P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), 179 | text.col = c("black","blue","red","black")) 180 | 181 | #Fig3B 182 | a$resid = a$K562Actual-a$K562Pred 183 | silenced = read.delim("diHMM/K562/H3K27me3_silenced.txt",F) #from Marco et al 184 | nrow(silenced) 185 | active = read.delim("diHMM/K562/superenhancer.txt",F) 186 | nrow(active) 187 | a$color='black' 188 | a$color[a$gene_id %in% silenced[,1] & !(a$gene_id %in% active[,1])]='blue' 189 | a$color[a$gene_id %in% active[,1] & !(a$gene_id %in% silenced[,1])]='red' 190 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-2,2), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black", main="K562") 191 | plot.ecdf(a$resid[a$color=='blue'], verticals= TRUE, do.points = FALSE, col="blue", add=T) 192 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 193 | legend("topleft", bg="white", bty="n", legend = c(paste("Other genes, n = ", length(a$resid[a$color=='black']), sep=''), 194 | paste("silenced genes, n = ", length(a$resid[a$color=='blue']), sep=''), 195 | paste("stretch-enhancer-associated genes, n = ", length(a$resid[a$color=='red']), sep=''), 196 | paste("Silenced vs Black P value: ", formatC(ks.test(a$resid[a$color=='blue'],a$resid[a$color=='black'],alternative="greater")$p.value, digits = 
2, format = 'g'), sep=''), 197 | paste("Enhancer vs Black P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), 198 | text.col = c("black","blue","red","black")) 199 | 200 | #Fig3C 201 | halflife = read.delim("Schofield_K562_half_lives.txt") #from Schofield et al 202 | halflife = halflife[,c(1,6)] 203 | colnames(halflife)[2]='half_life' 204 | halflife$half_life=log2(halflife$half_life) 205 | halflife = merge(halflife, c, by.x=1, by.y=2) 206 | a1=merge(halflife, a, by.x=3, by.y=1) 207 | "K562 half lives measured for this many genes:" 208 | nrow(a1) 209 | a1$quintile <- cut(a1$half_life, breaks=quantile(a1$half_life, probs=seq(0,1, by=0.2), na.rm=TRUE), include.lowest=TRUE) 210 | boxplot(a1$resid~a1$quintile,outline=F, cex=1.5, cex.axis=2, cex.lab=2, cex.main=2, las=2, notch=T, col="red") 211 | cor.test(a1$half_life, a1$resid) 212 | cor.test(a1$half_life, a1$resid, method='spearman') 213 | 214 | a$color="black" 215 | #many groups 216 | a$color[grep("hemoglobin subunit", a$Description)]='red' 217 | #Xie et al 2017 218 | a$color[a$geneName=="PIM1"]='red' 219 | a$color[a$geneName=="SMYD3"]='red' 220 | a$color[a$geneName=="FADS1"]='red' 221 | a$color[a$geneName=="PRKAR2B"]='red' 222 | #Fulco et al 2016 223 | a$color[a$geneName=="GATA1"]='red' 224 | a$color[a$geneName=="MYC"]='red' 225 | 226 | #FigS3A 227 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-4,4), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black") 228 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 229 | legend("topleft", bg="white", bty="n", legend = c(paste("non-enhancer-driven genes, n = ", length(a$resid[a$color=='black']), sep=''), 230 | paste("enhancer-driven genes, n = ", length(a$resid[a$color=='red']), sep=''), 231 | paste("P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), 
sep='')), text.col = c("black","red", "black")) 232 | 233 | a$K562PredAdj=predict(lm(E123~K562Pred, a)) 234 | 235 | #Fig3A 236 | smoothScatter(a$K562PredAdj, a$E123, cex.axis=2, cex.lab=2, bty="n", xlab="Predicted K562 expression level", ylab="K562 expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) 237 | abline(0,1, col="red") 238 | text(a[a$color=="red","K562PredAdj"], a[a$color=="red","E123"], labels = a[a$color=="red","geneName"], offset = 0.5, col="red") 239 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$K562Pred, a$E123)^2,2)), offset = 0.5, col="black") 240 | 241 | writefile(cbind(a$gene_id, a$geneName, a$Description, round(a$Pred,3), round(a$K562Pred,3), round(a$GM12878Pred,3), round(a$SuRE,3), round(a[,2:58],3)), "TableS1_human.txt") -------------------------------------------------------------------------------- /Fig3_S3/Fig3ABC_S3ABC.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig3_S3/Fig3ABC_S3ABC.pdf -------------------------------------------------------------------------------- /Fig3_S3/Fig3DEF_S3C.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig3_S3/Fig3DEF_S3C.pdf -------------------------------------------------------------------------------- /Fig3_S3/Fig3GH.R: -------------------------------------------------------------------------------- 1 | library(ROCR) 2 | 3 | a=read.delim("all_crossvalidated_predictions_mESC.txt") 4 | colnames(a)[2:3]=c("mESCPred","mESCActual") 5 | c=read.delim("ensembl2geneName_v90_mm10.txt") 6 | colnames(c)[2]="geneName" 7 | a=merge(a,c,by=1,all.x=T) 8 | 9 | a$PredAdj=predict(lm(mESCActual~mESCPred, a)) 10 | a$resid = a$mESCActual-a$PredAdj 11 | nrow(a) 12 | a = a[a$mESCActual > min(a$mESCActual)+1,] 13 | nrow(a) 14 | miR = 
fastread("zcat Summary_Counts.default_predictions.txt.gz") 15 | nrow(a) 16 | 17 | values = sapply(unique(miR$"miRNA family"), function(fam){ 18 | miR2 = miR[miR$"Species ID"==10090 & miR$"miRNA family"==fam,c(2,3,16)] 19 | merged = merge(miR2, a, by.x=1,by.y=4, all.y=T) 20 | merged[is.na(merged)]=0 21 | 22 | if (sum(merged[,"Cumulative weighted context++ score"] != 0) > 10){ 23 | c(0,cor(merged$resid, as.numeric(merged[,"Cumulative weighted context++ score"]), method='spearman')) 24 | } 25 | else{ 26 | c(0,0) 27 | } 28 | }) 29 | values=t(values) 30 | 31 | pdf("Fig3GH.pdf", width=8, height=8) 32 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 33 | miRs=read.delim("mouseESC_GSE76288_miRNA_counts_Denzler.txt") 34 | miRs=miRs[,c("miRNA_seed","Embryonic_stem_cells._Average_RPM")] 35 | miRs$miRNA_seed=gsub("T","U",miRs$miRNA_seed) 36 | miRs=aggregate(miRs$Embryonic_stem_cells._Average_RPM,by=list(miRs$miRNA_seed), sum) 37 | miRs=miRs[order(miRs$x, decreasing=T),] 38 | miRs2=rbind(miRs[1:10,], c("Other", sum(miRs[11:nrow(miRs),"x"]))) 39 | pie(as.numeric(miRs2[,2]), labels = miRs2[,1],col=c('orange','blue','red','purple','darkolivegreen1','magenta','brown', 'cyan', 'yellow','grey','black'), main=paste("Top 10 miRNA families in mESCs"), clockwise = T, cex.main = 2, cex=1.8) 40 | 41 | miRs=merge(values, miRs, by.x=0, by.y=1, all.x=T) 42 | miRs$color="black" 43 | miRs$color[miRs$Row.names %in% miRs2[,1]]="red" 44 | colnames(miRs)[3]="spearman" 45 | p1 <- hist(miRs$spearman[miRs$color!="red"], 50, plot=F) 46 | p2 <- hist(miRs$spearman[miRs$color=="red"], 50, plot=F) 47 | plot( p1, col=rgb(0,0,1,1/2), xlim=c(-0.02,0.06)) 48 | plot( p2, col=rgb(1,0,0,1/2), xlim=c(-0.02,0.06), add=T) 49 | dev.off() -------------------------------------------------------------------------------- /Fig3_S3/Fig3GH.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig3_S3/Fig3GH.pdf -------------------------------------------------------------------------------- /Fig3_S3/Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz -------------------------------------------------------------------------------- /Fig3_S3/Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Ouyang_mESC_RPKM_ensemblID.txt: -------------------------------------------------------------------------------- 1 | ../datasets/Ouyang_mESC_RPKM_ensemblID.txt -------------------------------------------------------------------------------- /Fig3_S3/Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz -------------------------------------------------------------------------------- 
/Fig3_S3/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Summary_Counts.default_predictions.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Summary_Counts.default_predictions.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/TableS1_human.txt: -------------------------------------------------------------------------------- 1 | ../datasets/TableS1_human.txt -------------------------------------------------------------------------------- /Fig3_S3/TableS1_mouse.txt: -------------------------------------------------------------------------------- 1 | ../datasets/TableS1_mouse.txt -------------------------------------------------------------------------------- /Fig3_S3/Whyte_et_al_superenhancers.txt: -------------------------------------------------------------------------------- 1 | Usp7 2 | Mreg 3 | Btbd11 4 | Zbtb45 5 | Brd1 6 | Tmem2 7 | Tnfsfm13 8 | Fam134b 9 | Zfhx2 10 | Tet2 11 | Trim71 12 | Smad7 13 | Rbpj 14 | Gli2 15 | Ankrd35 16 | Kif13b 17 | Prdm14 18 | Ccnd3 19 | Tns3 20 | Zbtb34 21 | Rbfox2 22 | Gm9104 23 | H2-M5 24 | Ppp2r5c 25 | Zfp710 26 | Upp1 27 | Qk 28 | Pum1 29 | Prrc2b 30 | Nr5a2 31 | Zfp281 32 | Sgk1 33 | Sgk1 34 | Tgif1 35 | Tgif1 36 | Zfp638 37 | Tead1 38 | Rara 39 | Etl4 40 | Fnbp1 41 | Ahi1 42 | Gm6724 43 | Alpl 44 | Cd9 45 | Socs3 46 | Gadd45a 47 | Phc1 48 | Enc1 49 | Smarcad1 50 | F2rl1 51 | Ftl2-ps 52 | Gpx4 53 | Hsd17b3 54 | Igfbp2 55 | Inhbb 56 | Klf2 57 | Klf3 58 | Lamc2 59 | Ldhb 60 | Mcl1 61 | Mybl2 62 | Mycn 63 | Pim1 64 | Pipox 65 | Pura 66 | Slc6a6 67 | Tcf15 68 | Tsc22d1 69 | Utf1 70 | Zfp42 71 | Macf1 72 | Agtrap 73 | Klf5 74 | Capns1 75 | Cbfa2t2 76 | Cldn4 77 | Col18a1 78 | Ctbp2 79 | Lefty1 
80 | Enah 81 | Epha2 82 | Fgf4 83 | Gbx2 84 | Id1 85 | Ier2 86 | Klf4 87 | Klf9 88 | Sik1 89 | Mapt 90 | Nfkbia 91 | Uri1 92 | Ski 93 | Slc2a1 94 | Slc2a3 95 | Sox2 96 | Tbx3 97 | Ubtf 98 | Vdac1 99 | Spry2 100 | Spry4 101 | Esrrb 102 | Ppp2r5c 103 | Eif4a2 104 | Dmtn 105 | Pou5f1 106 | Dusp1 107 | Sema4b 108 | Dlc1 109 | Hs6st1 110 | Dmrt1 111 | Kat6b 112 | Mkrn1 113 | Abhd2 114 | Tmem131 115 | Rbpms 116 | Kras 117 | Tnip1 118 | Klf13 119 | Sall1 120 | Ppp1r1a 121 | Tdh 122 | Gpa33 123 | Ndfip1 124 | Ranbp17 125 | Mesdc2 126 | Tfcp2l1 127 | Jam2 128 | Spaca7 129 | Derl3 130 | Dppa5a 131 | Hsd17b14 132 | Fbxo36 133 | Ssr2 134 | Camk2n1 135 | Hmg20a 136 | Rpl14 137 | Gtf3c6 138 | Kctd16 139 | Bcas2 140 | Stoml1 141 | Glod5 142 | Polr3gl 143 | Manba 144 | Tet1 145 | Rpap3 146 | Nanog 147 | Sulf2 148 | Cenpv 149 | Lrrc2 150 | C2cd5 151 | Ddit4 152 | 1700012A03Rik 153 | Hspb8 154 | Uck2 155 | Msi2 156 | Elovl6 157 | Usp48 158 | Zfp704 159 | Opa1 160 | Ube2s 161 | Dst 162 | Gpr37l1 163 | Dppa3 164 | Mllt6 165 | Kazn 166 | Otx2 167 | Mbip 168 | Pitpnc1 169 | Irf2bpl 170 | Olfr90 171 | Ift52 172 | Med13l 173 | Cobl 174 | Itpk1 175 | Kank4 176 | Mtcl1 177 | Idh2 178 | Gpt2 179 | Rhof 180 | Trak1 181 | Nav2 182 | Chchd10 183 | 6430573F11Rik 184 | Lefty2 185 | Chd9 186 | Tmem220 187 | Amigo2 188 | Fam53a 189 | Reep3 190 | Pirt 191 | Dlgap3 192 | Ctif 193 | Platr26 194 | Sall4 195 | -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions.txt -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_GM12878.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_GM12878.txt 
-------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_K562.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_K562.txt -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_mESC.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_mESC.txt -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_mouse.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_mouse.txt -------------------------------------------------------------------------------- /Fig3_S3/cross_valid: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_GM12878: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_GM12878 -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_K562: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_K562 -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_mESC: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_mESC -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_mouse: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_mouse -------------------------------------------------------------------------------- 
/Fig3_S3/diHMM: -------------------------------------------------------------------------------- 1 | ../datasets/diHMM/ -------------------------------------------------------------------------------- /Fig3_S3/ensembl2geneName_v90_mm10.txt: -------------------------------------------------------------------------------- 1 | ../datasets/ensembl2geneName_v90_mm10.txt -------------------------------------------------------------------------------- /Fig3_S3/integrate_cv_results.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | cv_folder = args[1] 5 | predfolder = args[2] 6 | outfile = args[3] 7 | 8 | files = list.files(path=cv_folder, pattern='.txt', full.names=T) 9 | 10 | table <- do.call("rbind", lapply(files, FUN=function(file){ 11 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 12 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 13 | tmp$fold = as.numeric(strsplit(basename(file), "_")[[1]][1]) 14 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 15 | names(tmp) = c("r2","MSE","fold","trial") 16 | tmp 17 | })) 18 | 19 | table=as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 20 | head(table) 21 | 22 | do.call("rbind", lapply(unique(table$fold), function(x) { tmp=table[table$fold==x,]; tmp[which( tmp$MSE == min(tmp$MSE) ),] } ) ) 23 | table = do.call("rbind",lapply(unique(table$fold), function(x) { tmp=table[table$fold==x,]; tmp[which( tmp$MSE==min(tmp$MSE) ),c("fold","trial")] } )) 24 | 25 | if (nrow(table) == 10){ 26 | files = apply(table, 1, function(x) { paste(predfolder,x[2],x[1],"predictions.txt",sep='') } ) 27 | say(files) 28 | table = do.call("rbind", lapply(files, function(x) { read.delim(x) } ) ) 29 | write.table(table,file=outfile, quote=F, row.names=F, sep='\t') 30 | } 31 | #otherwise cant do, select which trial to use from table due to tie 32 | 
-------------------------------------------------------------------------------- /Fig3_S3/mouse.median_expr.txt: -------------------------------------------------------------------------------- 1 | ../datasets/mouse.median_expr.txt -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_1KTest_GM12878expr_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_GM12878expr_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_1KTest_K562expr_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_K562expr_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_1KTest_mESCexpr_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_mESCexpr_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_Mouse_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_Mouse_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_cv/ -------------------------------------------------------------------------------- /Fig3_S3/runme.sh: -------------------------------------------------------------------------------- 1 | # precomputed h5 files for human (pM10Kb_cv) and mouse (pM10Kb_Mouse_cv) are provided only to save space 2 | # but all can be generated as below: 3 | python setup_training_files.py --cv Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_cv 4 | python setup_training_files.py --cv Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz pM10Kb_1KTest_K562expr_cv 5 | python setup_training_files.py --cv 
Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz pM10Kb_1KTest_GM12878expr_cv 6 | python setup_training_files.py --cv Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_Mouse_cv 7 | python setup_training_files.py --cv Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz pM10Kb_1KTest_mESCexpr_cv 8 | 9 | # RUN ON GPU USING FOLDERS ABOVE, TAKES MANY HOURS TO RUN ON GPU 10 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_cv/ >pM10Kb_cv/fold$y\_trial$x.txt; } done } done & 11 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_K562expr_cv/ >pM10Kb_1KTest_K562expr_cv/fold$y\_trial$x.txt; } done } done & 12 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_GM12878expr_cv/ >pM10Kb_1KTest_GM12878expr_cv/fold$y\_trial$x.txt; } done } done & 13 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_Mouse_cv/ >pM10Kb_Mouse_cv/fold$y\_trial$x.txt; } done } done & 14 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_mESCexpr_cv/ >pM10Kb_1KTest_mESCexpr_cv/fold$y\_trial$x.txt; } done } done & 15 | 16 | #MERGED RESULTS INTO ALL-CROSSVALIDATED PREDICTIONS 17 | Rscript integrate_cv_results.R cross_valid pM10Kb_cv all_crossvalidated_predictions.txt 18 | Rscript integrate_cv_results.R cross_valid_K562 pM10Kb_1KTest_K562expr_cv all_crossvalidated_predictions_K562.txt 19 | Rscript integrate_cv_results.R cross_valid_GM12878 pM10Kb_1KTest_GM12878expr_cv all_crossvalidated_predictions_GM12878.txt 20 | Rscript integrate_cv_results.R cross_valid_mouse pM10Kb_Mouse_cv all_crossvalidated_predictions_mouse.txt 21 | 
Rscript integrate_cv_results.R cross_valid_mESC pM10Kb_1KTest_mESCexpr_cv all_crossvalidated_predictions_mESC.txt 22 | 23 | mkdir diHMM 24 | cd diHMM 25 | wget http://bcb.dfci.harvard.edu/~gcyuan/data/diHMM/diHMM_Annotations.zip 26 | unzip diHMM_Annotations.zip 27 | 28 | cd K562 29 | bedtools intersect -wo -a K562_nD30_nB30_domainLevelStatesColor.bed -b ../../Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz >K562_overlapping_genes.bed 30 | grep -P 'D7|D8|D9|D23' K562_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >H3K27me3_silenced.txt 31 | grep -P 'D10|D11|D12|D13' K562_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >superenhancer.txt 32 | cd ../GM12878/ 33 | bedtools intersect -wo -a GM12878_nD30_nB30_domainLevelStatesColor.bed -b ../../Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz >GM12878_overlapping_genes.bed 34 | grep -P 'D7|D8|D9|D23' GM12878_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >H3K27me3_silenced.txt 35 | grep -P 'D10|D11|D12|D13' GM12878_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >superenhancer.txt 36 | cd ../.. 
37 | 38 | Rscript Fig3ABCDEF_S3ABC.R 39 | 40 | wget http://www.targetscan.org/mmu_71/mmu_71_data_download/Summary_Counts.default_predictions.txt.zip 41 | unzip Summary_Counts.default_predictions.txt.zip 42 | gzip Summary_Counts.default_predictions.txt.gz 43 | Rscript Fig3GH.R 44 | -------------------------------------------------------------------------------- /Fig3_S3/setup_training_files.py: -------------------------------------------------------------------------------- 1 | ../Fig2/setup_training_files.py -------------------------------------------------------------------------------- /Fig4_S4/57epigenomes.RPKM.pc.gz: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/57epigenomes.RPKM.pc.gz -------------------------------------------------------------------------------- /Fig4_S4/Fig4ABCD.R: -------------------------------------------------------------------------------- 1 | library(ROCR) 2 | 3 | #### MOUSE ###### 4 | 5 | a=read.delim("all_crossvalidated_predictions_mESC.txt") 6 | b=read.delim("all_crossvalidated_predictions_mouse.txt") 7 | colnames(a)[2:3]=c("mESCPred","mESCActual") 8 | a=merge(a,b,by=1) 9 | a[a=='']="NA" 10 | mouse = a 11 | 12 | ### HUMAN ##### 13 | 14 | a=read.delim(gzfile("57epigenomes.RPKM.pc.gz")) 15 | a$E000=NULL 16 | a[,2:ncol(a)]=log10(a[,2:ncol(a)]+0.1) 17 | 18 | nrow(a) 19 | b=read.delim("all_crossvalidated_predictions.txt") 20 | a=merge(a,b,by=1) 21 | b=read.delim("all_crossvalidated_predictions_K562.txt") 22 | colnames(b)[2:3]=c("K562Pred","K562Actual") 23 | a=merge(a,b,by=1) 24 | b=read.delim("all_crossvalidated_predictions_GM12878.txt") 25 | colnames(b)[2:3]=c("GM12878Pred","GM12878Actual") 26 | a=merge(a,b,by=1) 27 | nrow(a) 28 | 29 | c=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal",F) #from van Aresbergen et al 30 | d=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal",F) 31 | c=rbind(c,d) 32 | 
c$V6=log10(c$V6+0.1) 33 | c=c[,c(1,6)] 34 | colnames(c)[2]='SuRE' 35 | a=merge(a,c,by=1,all.x=T) 36 | 37 | pdf("Fig4ABCD.pdf", width=8, height=8) 38 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 39 | 40 | a$SuREAdj=predict(lm(E123~SuRE, a)) 41 | a$K562PredAdj=predict(lm(E123~K562Pred, a)) 42 | a$PromoterActivity=predict(lm(E123~SuRE+K562Pred, a)) 43 | 44 | #Fig4A 45 | (cors = data.frame( K562=c(cor(a$K562Actual,a$Pred)^2, cor(a$K562Actual,a$K562Pred)^2), 46 | GM12878=c(cor(a$GM12878Actual,a$Pred)^2, cor(a$GM12878Actual,a$K562Pred)^2), 47 | mESC=c(cor(mouse$mESCActual,mouse$Pred)^2, cor(mouse$mESCActual,mouse$mESCPred)^2) )) 48 | barplot(as.matrix(cors), las=2, beside=TRUE, col=c("red","blue"), border=F, ylim=c(0, 0.6), ylab="r^2 to gene expression level" ) 49 | 50 | #Fig4D 51 | smoothScatter(a$SuREAdj, a$E123, cex.axis=2, cex.lab=2, bty="n", xlab="SuRE activity", ylab="K562 expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) 52 | abline(0,1, col="red") 53 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$E123, a$SuRE)^2,2)), offset = 0.5, col="black") 54 | 55 | smoothScatter(a$PromoterActivity, a$E123, cex.axis=2, cex.lab=2, bty="n", xlab="Predicted expression level, joint model", ylab="K562 expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) 56 | abline(0,1, col="red") 57 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$PromoterActivity, a$E123)^2,2)), offset = 0.5, col="black") 58 | 59 | #Fig4BC 60 | a$deltaActual = a$K562Actual-a$GM12878Actual 61 | a$deltaPred = a$K562Pred-a$GM12878Pred 62 | nrow(a) 63 | smoothScatter(a$GM12878Actual, a$K562Actual, cex.axis=2, cex.lab=2, bty="n", xlab="GM12878 expression level (log10)", ylab="K562 expression level (log10)", xlim=c(-1, 4), ylim=c(-1, 4), las=1, cex=.5) 64 | abline(1,1, col="red") 65 | abline(-1,1, col="red") 66 | text(0, 4, labels = paste("Upregulated in K562:", nrow(a[a$deltaActual > 1,])), offset = 0.5, col="black") 67 | text(3, -1, labels = paste("Upregulated in 
GM12878:", nrow(a[a$deltaActual < -1,])), offset = 0.5, col="black") 68 | 69 | b=a[abs(a$deltaActual) > 1,] 70 | b$deltaActual = ifelse(b$deltaActual > 0, 1, 0) 71 | plot(performance( prediction( b$deltaPred, b$deltaActual), "tpr", "fpr"), col="blue", las=1, cex.axis=2, cex.lab=2) 72 | text(0.2, 1, labels = paste("AUC = ", round(performance( prediction(b$deltaPred, b$deltaActual), "auc")@y.values[[1]],2), ' (n = ', nrow(b), ')', sep=''), offset = 1.5, col="black") 73 | abline(0,1,col="grey") 74 | dev.off() -------------------------------------------------------------------------------- /Fig4_S4/Fig4ABCD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/Fig4ABCD.pdf -------------------------------------------------------------------------------- /Fig4_S4/Fig4E.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/Fig4E.pdf -------------------------------------------------------------------------------- /Fig4_S4/Fig4E_S4.R: -------------------------------------------------------------------------------- 1 | options(warn=-1) 2 | 3 | a=read.delim("model_comparison.txt", sep=' ') 4 | a 5 | 6 | pdf("FigS4A.pdf", height=8, width=10) 7 | par(oma=c(1,20,1,1)) 8 | barplot(rbind(a$test_r_squared,a$test_r_squared_withHL),beside=T,horiz=T, 9 | names.arg=a$model,las=1,col=c("red","blue"), border=F, xlim=c(0,0.8)) 10 | legend("bottomright", bg="white", bty="n", legend = c("with half life", "without half life"), text.col = c("blue","red")) 11 | dev.off() 12 | 13 | a=read.delim("model_comparison_Fig3.txt", sep=' ') 14 | a 15 | pdf("Fig4E.pdf", height=8, width=10) 16 | par(oma=c(1,20,1,1)) 17 | barplot(a$r_squared,horiz=T, names.arg=a$model,las=1,col=c(rep("red",11),rep("blue",22)), border=F, xlim=c(0,0.8)) 18 | 
legend("bottomright", bg="white", bty="n", legend = c("mouse", "human"), text.col = c("red","blue")) 19 | dev.off() 20 | -------------------------------------------------------------------------------- /Fig4_S4/FigS4A.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4A.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4B.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(Biostrings) 3 | library(rhdf5) 4 | library(reshape2) 5 | source("coefplot.r") 6 | 7 | h5dir = args[1] 8 | testIDs = h5read(paste(h5dir, "test.h5", sep='/'),"geneName") 9 | trainIDs = h5read(paste(h5dir, "train.h5", sep='/'),"geneName") 10 | valIDs = h5read(paste(h5dir, "valid.h5", sep='/'),"geneName") 11 | file1 = "Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz" 12 | file2 = "promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz" 13 | kmerlen = 1 14 | 15 | if (grepl("Mouse",h5dir)) { 16 | file1 = "Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz" 17 | file2 = "promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz" 18 | } 19 | 20 | inp.tbl <- fread(paste("zcat", file1),header=T,data.table=F,sep="\t") 21 | rownames(inp.tbl) = inp.tbl[,1] 22 | inp.tbl[,1] = NULL 23 | 24 | inp.tbl$PROMOTER = substring(inp.tbl$PROMOTER,8500,11500) 25 | 26 | inp.tbl=cbind(inp.tbl, do.call(rbind, lapply(inp.tbl$PROMOTER, function(x){ 27 | y = oligonucleotideFrequency(DNAStringSet(x), kmerlen) 28 | y/sum(y) 29 | }))) 30 | inp.tbl$T=NULL #remove TT dinucleotide to ensure matrix is full rank 31 | 32 | inp.tbl$PROMOTER = NULL 33 | inp.tbl[,c(1:5, 9)] = log10(inp.tbl[,c(1:5, 9)]+0.1) 34 | inp.tbl=as.data.frame(scale(inp.tbl)) 35 | 36 | # save(inp.tbl, file="5merInputTable.RData") 37 | # # load("TriInputTable.RData") 38 | 39 | motif_hits <- fread(paste("zcat", 
file2),header=T,data.table=F,sep="\t") 40 | motif_hits <- dcast(motif_hits, motif_hits[,2] ~ motif_hits[,1], function(x) 1, fill = 0) 41 | inp.tbl=merge(inp.tbl, motif_hits, by.x=0, by.y=1, all.x=T) 42 | 43 | sum(is.na(inp.tbl)) 44 | inp.tbl[is.na(inp.tbl)] = 0 45 | rownames(inp.tbl) = inp.tbl[,1] 46 | inp.tbl[,1] = NULL 47 | 48 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs, ] 49 | valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 50 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 51 | 52 | mod1 = lm(EXPRESSION ~ ., data=train) 53 | mod2 = lm(EXPRESSION ~ UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY, data=train) 54 | mod3 = lm(EXPRESSION ~ .-(UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY), data=train) 55 | 56 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 57 | cor(test$EXPRESSION, predict(mod2, newdata = test))^2 58 | cor(test$EXPRESSION, predict(mod3, newdata = test))^2 59 | 60 | summary(mod2) 61 | pdf("FigS4B_human.pdf") #change to mouse if dir is mouse 62 | coefplot(mod2, parm = -1) 63 | -------------------------------------------------------------------------------- /Fig4_S4/FigS4B.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4B.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4B_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4B_2.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4C.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(Biostrings) 3 | library(rhdf5) 4 | library(reshape2) 5 | library(beeswarm) 6 | 7 | h5dir = args[1] 8 | 9 | file1 = 
"Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz" 10 | file2 = "promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz" 11 | kmerlen = 4 12 | 13 | if (grepl("Mouse",h5dir)) { 14 | file1 = "Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz" 15 | file2 = "promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz" 16 | } 17 | 18 | inp.tbl <- fread(paste("zcat", file1),header=T,data.table=F,sep="\t") 19 | rownames(inp.tbl) = inp.tbl[,1] 20 | inp.tbl[,1] = NULL 21 | 22 | inp.tbl$PROMOTER = substring(inp.tbl$PROMOTER,8500,11500) 23 | 24 | inp.tbl=cbind(inp.tbl, do.call(rbind, lapply(inp.tbl$PROMOTER, function(x){ 25 | y = oligonucleotideFrequency(DNAStringSet(x), kmerlen) 26 | y/sum(y) 27 | }))) 28 | inp.tbl$T=NULL #remove TT dinucleotide to ensure matrix is full rank 29 | 30 | inp.tbl$PROMOTER = NULL 31 | inp.tbl[,c(1:5, 9)] = log10(inp.tbl[,c(1:5, 9)]+0.1) 32 | inp.tbl=as.data.frame(scale(inp.tbl)) 33 | 34 | # save(inp.tbl, file="5merInputTable.RData") 35 | # # load("TriInputTable.RData") 36 | 37 | motif_hits <- fread(paste("zcat", file2),header=T,data.table=F,sep="\t") 38 | motif_hits <- dcast(motif_hits, motif_hits[,2] ~ motif_hits[,1], function(x) 1, fill = 0) 39 | inp.tbl=merge(inp.tbl, motif_hits, by.x=0, by.y=1, all.x=T) 40 | 41 | sum(is.na(inp.tbl)) 42 | inp.tbl[is.na(inp.tbl)] = 0 43 | rownames(inp.tbl) = inp.tbl[,1] 44 | inp.tbl[,1] = NULL 45 | 46 | z = do.call("rbind", lapply(1:10, function(i){ 47 | testIDs = h5read(paste(h5dir, '/', i, "test.h5", sep=''),"geneName") 48 | trainIDs = h5read(paste(h5dir, '/', i, "train.h5", sep=''),"geneName") 49 | valIDs = h5read(paste(h5dir, '/', i, "valid.h5", sep=''),"geneName") 50 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs | rownames(inp.tbl) %in% valIDs, ] 51 | # valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 52 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 53 | 54 | mod1 = lm(EXPRESSION ~ ., data=train) 55 | c(i, cor(test$EXPRESSION, predict(mod1, newdata = test))^2) 56 | })) 57 | colnames(z)=c("fold","baseliner2") 58 
| 59 | cv_folder = paste0(h5dir, 2) 60 | files = list.files(path=cv_folder, pattern='.txt.gz', full.names=T) 61 | table <- do.call("rbind", lapply(files, FUN=function(file){ 62 | cmd = paste("zcat ", file, "| tail -2 | perl -ne \'@a=split /= /; print $a[1];\'") 63 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 64 | tmp$fold = as.numeric(strsplit(basename(file), "_")[[1]][1]) 65 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 66 | names(tmp) = c("r2","MSE","fold","trial") 67 | tmp 68 | })) 69 | 70 | table=as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 71 | table = do.call("rbind",lapply(unique(table$fold), function(x) { tmp=table[table$fold==x,]; tmp[which( tmp$MSE==min(tmp$MSE) ),c("r2","fold")] } )) 72 | table = aggregate(table$r2, by=list(fold=table$fold), mean) 73 | colnames(table)[2]="Xpressor2" 74 | table 75 | 76 | cvr2 = merge(table,z,by=1) 77 | t.test(cvr2[,2],cvr2[,3],paired=T) 78 | cvr2 79 | pdf("FigS4C_human.pdf") #change to mouse if dir is mouse 80 | beeswarm(cvr2[,2:3],ylim=c(0,1), las=2, bty='n', pch=19) #0.4,0.8 81 | dev.off() 82 | -------------------------------------------------------------------------------- /Fig4_S4/FigS4C_human.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4C_human.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4C_mouse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4C_mouse.pdf -------------------------------------------------------------------------------- /Fig4_S4/JASPAR_CORE_2016_vertebrates.meme: -------------------------------------------------------------------------------- 1 | 
../datasets/JASPAR_CORE_2016_vertebrates.meme -------------------------------------------------------------------------------- /Fig4_S4/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_GM12878.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_GM12878.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_K562.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_K562.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_mESC.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_mESC.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_mouse.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_mouse.txt 
-------------------------------------------------------------------------------- /Fig4_S4/baseline_models.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(Biostrings) 3 | library(rhdf5) 4 | library(reshape2) 5 | 6 | h5dir = args[1] 7 | kmerlen = args[2] 8 | 9 | testIDs = h5read(paste(h5dir, "test.h5", sep='/'),"geneName") 10 | trainIDs = h5read(paste(h5dir, "train.h5", sep='/'),"geneName") 11 | valIDs = h5read(paste(h5dir, "valid.h5", sep='/'),"geneName") 12 | file1 = "Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz" 13 | file2 = "promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz" 14 | 15 | if (grepl("Mouse",h5dir)) { 16 | file1 = "Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz" 17 | file2 = "promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz" 18 | } 19 | 20 | inp.tbl <- fread(paste("zcat", file1),header=T,data.table=F,sep="\t") 21 | rownames(inp.tbl) = inp.tbl[,1] 22 | inp.tbl[,1] = NULL 23 | 24 | inp.tbl$PROMOTER = substring(inp.tbl$PROMOTER,8500,11500) 25 | 26 | inp.tbl=cbind(inp.tbl, do.call(rbind, lapply(inp.tbl$PROMOTER, function(x){ 27 | y = oligonucleotideFrequency(DNAStringSet(x), kmerlen) 28 | y/sum(y) 29 | }))) 30 | 31 | inp.tbl$PROMOTER = NULL 32 | inp.tbl[,c(1:5, 9)] = log10(inp.tbl[,c(1:5, 9)]+0.1) 33 | inp.tbl=as.data.frame(scale(inp.tbl)) 34 | 35 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs, ] 36 | valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 37 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 38 | 39 | #full model 40 | mod1 = lm(EXPRESSION ~ ., data=train) 41 | #half life only model 42 | mod2 = lm(EXPRESSION ~ UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY, data=train) 43 | #promoter only model 44 | mod3 = lm(EXPRESSION ~ .-(UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY), data=train) 45 | summary(mod3) 46 | 47 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 48 | cor(test$EXPRESSION, predict(mod2, newdata = 
test))^2 49 | cor(test$EXPRESSION, predict(mod3, newdata = test))^2 50 | 51 | plot(predict(mod1, newdata = test), test$EXPRESSION) 52 | test = test[test$EXPRESSION >= -1, ] 53 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 54 | plot(predict(mod1, newdata = test), test$EXPRESSION) 55 | 56 | motif_hits <- fread(paste("zcat", file2),header=T,data.table=F,sep="\t") 57 | motif_hits <- dcast(motif_hits, motif_hits[,2] ~ motif_hits[,1], function(x) 1, fill = 0) 58 | 59 | inp.tbl=merge(inp.tbl, motif_hits, by.x=0, by.y=1, all.x=T) 60 | sum(is.na(inp.tbl)) 61 | inp.tbl[is.na(inp.tbl)] = 0 62 | rownames(inp.tbl) = inp.tbl[,1] 63 | inp.tbl[,1] = NULL 64 | 65 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs, ] 66 | valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 67 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 68 | 69 | mod1 = lm(EXPRESSION ~ ., data=train) 70 | mod2 = lm(EXPRESSION ~ UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY, data=train) 71 | mod3 = lm(EXPRESSION ~ .-(UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY), data=train) 72 | 73 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 74 | cor(test$EXPRESSION, predict(mod2, newdata = test))^2 75 | cor(test$EXPRESSION, predict(mod3, newdata = test))^2 76 | -------------------------------------------------------------------------------- /Fig4_S4/coefplot.r: -------------------------------------------------------------------------------- 1 | # published on http://www.r-statistics.com/2010/07/visualization-of-regression-coefficients-in-r 2 | # originally written by "Achim Zeileis" 3 | # GPL-2 4 | 5 | coefplot <- function(object, df = NULL, level = 0.95, parm = NULL, 6 | labels = TRUE, xlab = "Coefficient confidence intervals", ylab = "", 7 | xlim = NULL, ylim = NULL, 8 | las = 1, lwd = 1, lty = c(1, 2), pch = 19, col = 1, 9 | length = 0, angle = 30, code = 3, ...) 
10 | { 11 | cf <- coef(object) 12 | se <- sqrt(diag(vcov(object))) 13 | if(is.null(parm)) parm <- seq_along(cf) 14 | if(is.numeric(parm) | is.logical(parm)) parm <- names(cf)[parm] 15 | if(is.character(parm)) parm <- which(names(cf) %in% parm) 16 | cf <- cf[parm] 17 | se <- se[parm] 18 | k <- length(cf) 19 | 20 | if(is.null(df)) { 21 | df <- if(identical(class(object), "lm")) df.residual(object) else 0 22 | } 23 | 24 | critval <- if(df > 0 & is.finite(df)) { 25 | qt((1 - level)/2, df = df) 26 | } else { 27 | qnorm((1 - level)/2) 28 | } 29 | ci1 <- cf + critval * se 30 | ci2 <- cf - critval * se 31 | 32 | lwd <- rep(lwd, length.out = 2) 33 | lty <- rep(lty, length.out = 2) 34 | pch <- rep(pch, length.out = k) 35 | col <- rep(col, length.out = k) 36 | 37 | if(is.null(xlim)) xlim <- range(c(0, min(ci1), max(ci2))) 38 | if(is.null(ylim)) ylim <- c(1 - 0.05 * k, 1.05 * k) 39 | 40 | if(isTRUE(labels)) labels <- names(cf) 41 | if(identical(labels, FALSE)) labels <- "" 42 | labels <- rep(labels, length.out = k) 43 | 44 | plot(0, 0, xlim = xlim, ylim = ylim, xlab = xlab, ylab = ylab, 45 | axes = FALSE, type = "n", las = las, ...) 
46 | arrows(ci1, 1:k, ci2, 1:k, lty = lty[1], lwd = lwd[1], col = col, 47 | length = length, angle = angle, code = code) 48 | points(cf, 1:k, pch = pch, col = col) 49 | abline(v = 0, lty = lty[2], lwd = lwd[2]) 50 | axis(1) 51 | axis(2, at = 1:k, labels = labels, las = las) 52 | box() 53 | } 54 | -------------------------------------------------------------------------------- /Fig4_S4/gencode.v27lift37.basic.annotation.gtf.gz: -------------------------------------------------------------------------------- 1 | ../datasets/gencode.v27lift37.basic.annotation.gtf.gz -------------------------------------------------------------------------------- /Fig4_S4/model_comparison.txt: -------------------------------------------------------------------------------- 1 | model test_r_squared test_r_squared_withHL 2 | "HL-only" 0 0.1702954 3 | "1mer" 0.1566485 0.2727732 4 | "2mer" 0.3316107 0.3807903 5 | "3mer" 0.4012669 0.4329388 6 | "4mer" 0.4450406 0.468374 7 | "5mer" 0.4465227 0.469725 8 | "5mer" 0.4465227 0.469725 9 | "JASPAR TFs+4mer" 0.4564599 0.4778438 10 | "Xpresso, Hyperparameter-tuned, mononucleotide-shuffled input" 0.224 0.279 11 | "Xpresso, Hyperparameter-tuned, dinucleotide-shuffled input" 0.286 0.333 12 | "Xpresso, Manually-discovered hyperparameters" 0.511 0.532 13 | "Xpresso, Hyperparameter-tuned" 0.504 0.590 14 | "HL-only, mouse" 0 0.3045577 15 | "1mer, mouse" 0.2517215 0.3691906 16 | "2mer, mouse" 0.4498092 0.5138408 17 | "3mer, mouse" 0.5070994 0.5509732 18 | "4mer, mouse" 0.5395627 0.5721985 19 | "5mer, mouse" 0.5703819 0.5925751 20 | "6mer, mouse" 0.5101936 0.5359272 21 | "JASPAR TFs, mouse" 0.4555412 0.5042129 22 | "JASPAR TFs+5mer, mouse" 0.5790679 0.593618 23 | "Xpresso, Hyperparameter-tuned, mononucleotide-shuffled input, mouse" 0.413 0.456 24 | "Xpresso, Hyperparameter-tuned, dinucleotide-shuffled input, mouse" 0.502 0.544 25 | "Xpresso, Manually-discovered hyperparameters, mouse" 0.636 0.667 26 | "Xpresso, Hyperparameter-tuned, mouse" 0.632 0.710 
-------------------------------------------------------------------------------- /Fig4_S4/model_comparison_Fig3.txt: -------------------------------------------------------------------------------- 1 | model r_squared species 2 | "Histone+ChIP-seq+PWM matches, ESC (Ouyang 2009)" 0.65 mouse 3 | "Chromatin, ESC (Cheng 2011)" 0.55 mouse 4 | "TF+DNase, ESC (Duren 2017)" 0.47 mouse 5 | "DNase, ESC (Duren 2017)" 0.35 mouse 6 | "Histone+TF+PWM matches, ESC (McLeay 2012)" 0.695 mouse 7 | "PWM matches with Histone support, ESC (McLeay 2012)" 0.52 mouse 8 | "PWM matches, ESC (McLeay 2012)" 0.28 mouse 9 | "Xpresso, mESC (this study)" 0.5884967 mouse 10 | "MPRA, neuron, CAGE correlation (Nguyen 2016)" 0.073 mouse 11 | "5mer and half life features, median expression levels (this study)" 0.5925751 mouse 12 | "Sequence and half life features, median expression levels (this study)" 0.710 mouse 13 | "Histone marks, CD4+ T cells (Karlić 2010)" 0.56 human 14 | "Sequence+Histone+TF+DNase, K562 (Zhou 2018)" 0.535 human 15 | "PWM matches with DNase/Histone support, GM12878 (Schmidt 2017)" 0.34 human 16 | "Histone+TF+PWM matches, GM12878 (McLeay 2012)" 0.41 human 17 | "PWM matches with Histone support, GM12878 (McLeay 2012)" 0.28 human 18 | "PWM matches, GM12878 (McLeay 2012)" 0.08 human 19 | "Xpresso, GM12878 (this study)" 0.4259194 human 20 | "Sequence+Histone+TF+DNase, K562 (Zhou 2018)" 0.569 human 21 | "PWM matches with DNase/Histone support, K562 (Schmidt 2017)" 0.47 human 22 | "MPRA, K562, CAGE correlation (van Arensbergen 2016)" 0.49 human 23 | "Chromatin marks, K562 (Cheng 2011)" 0.39 human 24 | "Xpresso, K562 (this study)" 0.5040678 human 25 | "PWM matches with DNase/Histone support, HepG2 (Schmidt 2017)" 0.46 human 26 | "MPRA, many (Cooper 2006)" 0.28 human 27 | "MPRA, many (Landolin 2010)" 0.185 human 28 | "TF ChIP, median among cell types (Cheng 2012)" 0.39 human 29 | "Histone marks, median among cell types (Dong 2012)" 0.62 human 30 | "Histone marks, median among cell types 
(Abdalla 2018)" 0.52 human 31 | "Sequence only, median among cell types (Bressiere 2018)" 0.336 human 32 | "Sequence only, median among cell types (Abdalla 2018)" 0.17 human 33 | "4mer and half life features, median expression levels (this study)" 0.468374 human 34 | "Sequence and half life features, median expression levels (this study)" 0.590 human 35 | -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.FIMO_scanned.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.FIMO_scanned.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.fa.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.fa.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.firstOrderMarkov_background: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.firstOrderMarkov_background -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz 
-------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.fa.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.fa.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.firstOrderMarkov_background: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.firstOrderMarkov_background -------------------------------------------------------------------------------- /Fig4_S4/runme.sh: -------------------------------------------------------------------------------- 1 | perl -ne 'print "chr$_";' hg38_promoters_cage_corrected.bed >hg38_promoters_cage_corrected_withChr.bed 2 | liftOver -bedPlus=6 hg38_promoters_cage_corrected_withChr.bed hg38ToHg19.over.chain hg19_promoters_cage_corrected_withChr.bed unmapped 3 | ./supplement_ids.pl >hg19_promoters_cage_corrected_withChr_andOthers.bed 4 | grep -P '\-$' hg19_promoters_cage_corrected_withChr_andOthers.bed >hg19_promoters_cage_corrected_withChr_andOthers_minus.bed 5 | grep -P '\+$' hg19_promoters_cage_corrected_withChr_andOthers.bed >hg19_promoters_cage_corrected_withChr_andOthers_plus.bed 6 | 7 | wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE78nnn/GSE78709/suppl/GSE78709_sure23.plasmid.norm.combined.45.55.plus.160504.bw 8 | wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE78nnn/GSE78709/suppl/GSE78709_sure23.plasmid.norm.combined.45.55.minus.160504.bw 9 | 10 | bigWigAverageOverBed -sampleAroundCenter=1000 GSE78709_sure23.plasmid.norm.combined.45.55.plus.160504.bw hg19_promoters_cage_corrected_withChr_andOthers_plus.bed GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal 11 | bigWigAverageOverBed -sampleAroundCenter=1000 GSE78709_sure23.plasmid.norm.combined.45.55.minus.160504.bw hg19_promoters_cage_corrected_withChr_andOthers_minus.bed 
GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal 12 | 13 | Rscript Fig4ABCD.R 14 | 15 | #generate baseline dinucleotide model, and prepare to extract features for PWM-based model 16 | zcat Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | tail -n+2 | perl -ne '@a=split; print ">$a[0]\n".substr($a[1],8500,3000)."\n";' >promoters_pM1.5Kb.fa 17 | zcat Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | tail -n+2 | perl -ne '@a=split; print ">$a[0]\n".substr($a[1],8500,3000)."\n";' >promoters_pM1.5Kb.mouse.fa 18 | fasta-get-markov promoters_pM1.5Kb.fa >promoters_pM1.5Kb.firstOrderMarkov_background 19 | fasta-get-markov promoters_pM1.5Kb.mouse.fa >promoters_pM1.5Kb.mouse.firstOrderMarkov_background 20 | 21 | fimo --bgfile promoters_pM1.5Kb.firstOrderMarkov_background --verbosity 1 --text --skip-matched-sequence JASPAR_CORE_2016_vertebrates.meme promoters_pM1.5Kb.fa | gzip -c >promoters_pM1.5Kb.FIMO_scanned.txt.gz 22 | fimo --bgfile promoters_pM1.5Kb.mouse.firstOrderMarkov_background --verbosity 1 --text --skip-matched-sequence JASPAR_CORE_2016_vertebrates.meme promoters_pM1.5Kb.mouse.fa | gzip -c >promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz 23 | 24 | zcat promoters_pM1.5Kb.FIMO_scanned.txt.gz | cut -f 1,2 | uniq | gzip -c >promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz 25 | zcat promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz | cut -f 1,2 | uniq | gzip -c >promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz 26 | 27 | #generate 1mer-6mer baseline models for human and mouse, respectively 28 | for x in {1..6}; do { Rscript baseline_models.R pM10Kb_1KTest $x; } done & 29 | for x in {1..6}; do { Rscript baseline_models.R pM10Kb_1KTest_Mouse $x; } done & 30 | 31 | # empirical results from Xpresso and baselines are stored in model_comparison.txt 32 | Rscript Fig4E_S4.R 33 | 34 | #generates for human, can change directory and code to generate for mouse 35 | Rscript FigS4B.R 36 | 37 | #generates for human, can change directory 
and code to generate for mouse 38 | Rscript FigS4C.R 39 | -------------------------------------------------------------------------------- /Fig4_S4/supplement_ids.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | open IN, "){ 5 | ($id) = ($_ =~ /(ENSG\d+)/); 6 | $seen{$id}=1; 7 | print $_; 8 | } 9 | close IN; 10 | 11 | open IN, "zgrep -P '\tgene\t' gencode.v27lift37.basic.annotation.gtf.gz | grep protein_coding | "; 12 | while(){ chomp; 13 | @a = split /\t/; 14 | ($id) = ($a[-1] =~ /(ENSG\d+)/); 15 | print join("\t", $a[0], $a[4]-1, $a[4]+1, $id, '0', $a[6]), "\n" if !$seen{$id} && $a[6] eq '-'; 16 | print join("\t", $a[0], $a[3]-1, $a[3]+1, $id, '0', $a[6]), "\n" if !$seen{$id} && $a[6] eq '+'; 17 | $seen{$id}=1; 18 | } 19 | close IN; -------------------------------------------------------------------------------- /Fig5_S5/hg19.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 249250621 2 | chr2 243199373 3 | chr3 198022430 4 | chr4 191154276 5 | chr5 180915260 6 | chr6 171115067 7 | chr7 159138663 8 | chrX 155270560 9 | chr8 146364022 10 | chr9 141213431 11 | chr10 135534747 12 | chr11 135006516 13 | chr12 133851895 14 | chr13 115169878 15 | chr14 107349540 16 | chr15 102531392 17 | chr16 90354753 18 | chr17 81195210 19 | chr18 78077248 20 | chr20 63025520 21 | chrY 59373566 22 | chr19 59128983 23 | chr22 51304566 24 | chr21 48129895 25 | chr6_ssto_hap7 4928567 26 | chr6_mcf_hap5 4833398 27 | chr6_cox_hap2 4795371 28 | chr6_mann_hap4 4683263 29 | chr6_apd_hap1 4622290 30 | chr6_qbl_hap6 4611984 31 | chr6_dbb_hap3 4610396 32 | chr17_ctg5_hap1 1680828 33 | chr4_ctg9_hap1 590426 34 | chr1_gl000192_random 547496 35 | chrUn_gl000225 211173 36 | chr4_gl000194_random 191469 37 | chr4_gl000193_random 189789 38 | chr9_gl000200_random 187035 39 | chrUn_gl000222 186861 40 | chrUn_gl000212 186858 41 | chr7_gl000195_random 182896 42 | chrUn_gl000223 180455 43 | 
chrUn_gl000224 179693 44 | chrUn_gl000219 179198 45 | chr17_gl000205_random 174588 46 | chrUn_gl000215 172545 47 | chrUn_gl000216 172294 48 | chrUn_gl000217 172149 49 | chr9_gl000199_random 169874 50 | chrUn_gl000211 166566 51 | chrUn_gl000213 164239 52 | chrUn_gl000220 161802 53 | chrUn_gl000218 161147 54 | chr19_gl000209_random 159169 55 | chrUn_gl000221 155397 56 | chrUn_gl000214 137718 57 | chrUn_gl000228 129120 58 | chrUn_gl000227 128374 59 | chr1_gl000191_random 106433 60 | chr19_gl000208_random 92689 61 | chr9_gl000198_random 90085 62 | chr17_gl000204_random 81310 63 | chrUn_gl000233 45941 64 | chrUn_gl000237 45867 65 | chrUn_gl000230 43691 66 | chrUn_gl000242 43523 67 | chrUn_gl000243 43341 68 | chrUn_gl000241 42152 69 | chrUn_gl000236 41934 70 | chrUn_gl000240 41933 71 | chr17_gl000206_random 41001 72 | chrUn_gl000232 40652 73 | chrUn_gl000234 40531 74 | chr11_gl000202_random 40103 75 | chrUn_gl000238 39939 76 | chrUn_gl000244 39929 77 | chrUn_gl000248 39786 78 | chr8_gl000196_random 38914 79 | chrUn_gl000249 38502 80 | chrUn_gl000246 38154 81 | chr17_gl000203_random 37498 82 | chr8_gl000197_random 37175 83 | chrUn_gl000245 36651 84 | chrUn_gl000247 36422 85 | chr9_gl000201_random 36148 86 | chrUn_gl000235 34474 87 | chrUn_gl000239 33824 88 | chr21_gl000210_random 27682 89 | chrUn_gl000231 27386 90 | chrUn_gl000229 19913 91 | chrM 16571 92 | chrUn_gl000226 15008 93 | chr18_gl000207_random 4262 94 | -------------------------------------------------------------------------------- /Fig5_S5/human_trainepoch.11-0.426.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/human_trainepoch.11-0.426.h5 -------------------------------------------------------------------------------- /Fig5_S5/mouse_trainepoch.05-0.278.h5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/mouse_trainepoch.05-0.278.h5 -------------------------------------------------------------------------------- /Fig5_S5/predict_seqs.py: -------------------------------------------------------------------------------- 1 | import sys, pickle 2 | import pandas as pd 3 | import numpy as np 4 | from optparse import OptionParser 5 | from keras.models import Model, load_model 6 | 7 | def main(): 8 | usage = 'usage: %prog [options] ' 9 | parser = OptionParser(usage) 10 | parser.add_option('--revCom', dest='revcom', default=False, action='store_true', help='Make predictions for minus strand instead of plus? % [Default: %default]') 11 | (options,args) = parser.parse_args() 12 | 13 | if len(args) != 4: 14 | print(args) 15 | parser.error('Must provide mode hyperparameter file and 2-column file to generate predictions for') 16 | else: 17 | param_file = args[0] 18 | trained_model = args[1] 19 | test_file = args[2] 20 | outfile = args[3] 21 | 22 | def revCom(x): 23 | for y in range(0,x.shape[0]): 24 | x[y] = np.fliplr(np.flipud(x[y])) 25 | return x 26 | 27 | trials = pickle.load(open(param_file, "rb")) 28 | best = trials.argmin 29 | model = load_model(trained_model, compile=False) 30 | 31 | table = pd.read_table(test_file, index_col=0, header=None) 32 | seqs = one_hot(table.as_matrix()) 33 | if options.revcom: 34 | seqs = revCom(seqs) 35 | if seqs.shape[1] != 10500: 36 | tsspos = 7000 37 | leftpos = tsspos - seqs.shape[1] / 2 38 | if seqs.shape[1] <= tsspos: 39 | tmpseqs = np.zeros((seqs.shape[0],10500,4), dtype='bool') 40 | tmpseqs[:,leftpos:(leftpos+seqs.shape[1]),:] = seqs 41 | seqs = tmpseqs 42 | else: 43 | print('Sequences are above the allowable size of 10500nt') 44 | sys.exit() 45 | halflifedata = np.zeros((seqs.shape[0],6), dtype='float16') 46 | print("Processed data from %s" % test_file) 47 | predictions_test = model.predict([seqs, halflifedata], batch_size=20).flatten() 
48 | df = pd.DataFrame(np.column_stack((table.index, predictions_test)), columns=['Info','Pred']) 49 | df.to_csv(outfile, index=False, header=True, sep='\t') 50 | 51 | def one_hot(seq): 52 | seq_len = len(seq.item(0)) 53 | seqindex = {'A':0, 'C':1, 'G':2, 'T':3, 'a':0, 'c':1, 'g':2, 't':3} 54 | seq_vec = np.zeros((len(seq),seq_len,4), dtype='bool') 55 | for i in range(len(seq)): 56 | thisseq = seq.item(i) 57 | for j in range(seq_len): 58 | try: 59 | seq_vec[i,j,seqindex[thisseq[j]]] = 1 60 | except: 61 | pass 62 | return seq_vec 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.bed: -------------------------------------------------------------------------------- 1 | 1 109500000 110300000 2 | -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.Minus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.Minus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.Plus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.Plus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.input.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/region.1Mb.intervals.100ntStep.input.txt.gz -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.mouse.input.txt.gz: 
-------------------------------------------------------------------------------- 1 | ../datasets/region.1Mb.intervals.100ntStep.mouse.input.txt.gz -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.mouse.minus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.mouse.minus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.mouse.plus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.mouse.plus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.mouse.bed: -------------------------------------------------------------------------------- 1 | chr3 107800000 108500000 2 | -------------------------------------------------------------------------------- /Fig5_S5/runme.sh: -------------------------------------------------------------------------------- 1 | #region.1Mb.bed is human locus; region.1Mb.mouse.bed is mouse locus 2 | 3 | BASEFILE="region.1Mb.intervals.100ntStep" #for Mouse locus use "region.1Mb.intervals.100ntStep.mouse" 4 | 5 | #generate 100nt step with 10.5Kb window size 6 | bedtools makewindows -b region.1Mb.bed -w 10500 -s 100 | perl -ne '@a=split/\t/; print $_ if $a[2]-$a[1] == 10500;' | sort | uniq >$BASEFILE.bed 7 | #extract sequences from fasta of human or mouse genome 8 | bedtools getfasta -tab -fi Homo_sapiens_assembly19.fasta -bed $BASEFILE.bed -fo $BASEFILE.input.txt 9 | 10 | #generate predictions 11 | python predict_seqs.py tpe_1K_10epochs_optimized_0to20K.hyperopt human_trainepoch.11-0.426.h5 $BASEFILE.input.txt $BASEFILE.Plus.txt 
# Fig6C_S7_S8.R -- plots observed/expected dinucleotide frequency profiles
# around the TSS for four expression bins (makes Fig6C/FigS7 for human,
# FigS8 for mouse).
#
# NOTE(review): `args`, `fastread()` and `say()` are not defined in this file
# -- presumably supplied by the repository's .Rprofile; confirm before running
# standalone.
file = args[1]      # gzipped input table with PROMOTER and EXPRESSION columns
outfile = args[2]   # output PDF path

b=list()        # per-bin dinucleotide positional counts (refilled per dinucleotide below)
mononuc=list()  # per-bin mononucleotide positional counts, keyed by base

################ cpg
a=fastread(paste("zcat ", file,sep=''))
# keep characters 6000..14000 of each promoter -- assumes the TSS sits near the
# middle of the stored sequence, giving roughly -4Kb..+4Kb (matches xin below);
# TODO confirm against the input-generation script
a$PROMOTER = substring(a$PROMOTER,6000,14000)
# log-transform expression with a 0.1 pseudocount, then min-max scale to [0,1]
a$EXPRESSION=log10(a$EXPRESSION+0.1)
a$EXPRESSION=(a$EXPRESSION-min(a$EXPRESSION))/(max(a$EXPRESSION)-min(a$EXPRESSION))
# four bins: ~unexpressed (<=0.001 after scaling), low, mid, high
a$bin <- cut(a$EXPRESSION, breaks=c(0,0.001,0.33,0.66,1), include.lowest = TRUE)

# b=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr("CG|cG|Gc|cg",y)) )))/nrow(a[a$bin==x,]) } )
# g=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr("G|g",y)) )))/nrow(a[a$bin==x,]) } )
# c=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr("C|c",y)) )))/nrow(a[a$bin==x,]) } )

# For each base, tabulate (case-insensitively, via the "x|X" regex) how often
# it occurs at each promoter position, averaged over the genes of each bin.
for (nuc1 in c('a','c','g','t')){
  say(nuc1)
  mononuc[[nuc1]]=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr(paste(nuc1,'|',toupper(nuc1),sep=''),y)) )))/nrow(a[a$bin==x,]) } )
}

pdf(outfile,width=10,height=8) #makes Fig6C/FigS7 for human for FigS8 for mouse
par(mfrow=c(4,4), oma = c(5,4,0,0) + 0.1, mar = c(0,0,1,1) + 0.1)
i=0   # panel counter; drives which panels get x/y axis labels
for (nuc1 in c('a','c','g','t')){
  for (nuc2 in c('a','c','g','t')){
    # case-insensitive regex for the dinucleotide, e.g. "cg|CG|cG|Cg"
    dinuc = paste(nuc1,nuc2,'|',toupper(nuc1),toupper(nuc2),'|',nuc1,toupper(nuc2),'|',toupper(nuc1),nuc2,sep='')
    dinuc2 = paste(toupper(nuc1),toupper(nuc2),sep='')   # uppercase panel title
    say(dinuc)
    # positional dinucleotide frequency per expression bin
    b=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr(dinuc,y)) )))/nrow(a[a$bin==x,]) } )

    x = 1:8000
    xin = -4000:3999   # x-axis: position relative to TSS
    idx = 1:8000

    # observed/expected = dinuc freq / (mononuc freq at pos * mononuc freq at pos+1),
    # loess-smoothed (span=0.01). Bins plotted highest-expression first:
    # [[4]]=cyan (high), [[3]]=blue, [[2]]=red, [[1]]=black (~unexpressed).
    plot(xin, predict(loess(b[[4]][idx]/(mononuc[[nuc1]][[4]][idx]*mononuc[[nuc2]][[4]][idx+1])~x, span=0.01), newdata=idx), col='cyan', type='l', ylim=c(0,1.6), main = dinuc2, axes = FALSE)
    axis(side = 1, labels = (i %/% 4 == 3))       # x tick labels only on bottom row
    axis(side = 2, labels = (i %% 4 == 0), las=1) # y tick labels only on left column
    lines(xin, predict(loess(b[[3]][idx]/(mononuc[[nuc1]][[3]][idx]*mononuc[[nuc2]][[3]][idx+1])~x, span=0.01), newdata=idx), col='blue', type='l')
    lines(xin, predict(loess(b[[2]][idx]/(mononuc[[nuc1]][[2]][idx]*mononuc[[nuc2]][[2]][idx+1])~x, span=0.01), newdata=idx), col='red', type='l')
    lines(xin, predict(loess(b[[1]][idx]/(mononuc[[nuc1]][[1]][idx]*mononuc[[nuc2]][[1]][idx+1])~x, span=0.01), newdata=idx), col='black', type='l')
    i=i+1
  }
}
title(xlab = "Position relative to TSS", ylab = "Observed/Expected", outer = TRUE, line = 3)
# legend("topleft", bg="white", bty="n", legend = levels(a$bin), text.col = c("black", "red","blue","cyan"), outer = TRUE)
dev.off()
# FigS6.R -- averages per-position attribution scores (from deep_explain_cv.py
# output) across 10 CV folds and plots loess-smoothed profiles around the TSS,
# one PDF per attribution method.
#
# NOTE(review): `args`, `fastread()` and `say()` are not defined in this file
# -- presumably supplied by the repository's .Rprofile; confirm before running
# standalone.
library(RColorBrewer)
library(zoo)

folder = args[1]    # directory holding <type>.<fold>.txt.gz attribution files
species = args[2]   # prefix for the output PDF name

for (type in c('gradinput','intgrad')){ #'saliency', 'elrp', 'deeplift'
  b=list()   # per-fold matrix: mean attribution per position (rows) per bin (cols)
  for(num in 1:10){ #10 folds of CV
    say(type, num)
    a=fastread(paste("zcat ", folder, type, '.', num, ".txt.gz",sep=''))
    # V2 is min-max scaled, then cut into 4 bins (~zero, low, mid, high);
    # columns 3:(ncol(a)-7) hold the per-position attribution values
    # -- NOTE(review): exact column layout assumed from deep_explain_cv.py's
    # output (label, positions, half-life features); verify the -7 offset.
    a$V2=(a$V2-min(a$V2))/(max(a$V2)-min(a$V2))
    a$bin <- cut(a$V2, breaks=c(0,0.001,0.33,0.66,1), include.lowest = TRUE)

    b[[num]]=sapply(levels(a$bin), function(x){ apply(a[a$bin==x,3:(ncol(a)-7)], 2, mean) } )
    # re-center every gene's attributions against the ~zero-expression bin's
    # mean profile (column 1 of b), then recompute the per-bin means
    a[,3:(ncol(a)-7)]=round(t(t(a[,3:(ncol(a)-7)]) - b[[num]][,1]),3) #broadcast subtraction of vector through matrix
    b[[num]]=sapply(levels(a$bin), function(x){ apply(a[a$bin==x,3:(ncol(a)-7)], 2, mean) } )
  }

  pdf(paste(species, type, ".pdf",sep=''),width=10,height=4) #ran this for both mouse and human 10-fold CV results for each technique
  c=apply(simplify2array(b), 1:2, mean)   # average the per-bin profiles over the 10 folds
  x = 1:10500
  xin = -7000:3499    # x-axis: position relative to TSS (-7Kb..+3.5Kb window)
  # bins plotted highest-expression first: col 4=cyan, 3=blue, 2=red;
  # the ~zero bin (col 1) is the zero baseline drawn by abline
  plot(xin, predict(loess(c[x,4]~x, span=0.01)), col='cyan', type='l')
  abline(0,0,col="black")
  lines(xin, predict(loess(c[x,3]~x, span=0.01)), col='blue', type='l')
  lines(xin, predict(loess(c[x,2]~x, span=0.01)), col='red', type='l')
  legend("topleft", bg="white", bty="n", legend = colnames(c)[2:4], text.col = c("red","blue","cyan"))
  dev.off()
}
# best_positions.R -- for one CV fold, finds promoter positions whose
# attribution score significantly exceeds the background distribution of
# low-expression genes, and writes them as merged 1nt intervals for motif
# analysis (input to extract_kmer.pl / DREME).
#
# NOTE(review): `args`, `fastread()` and `writefile()` are not defined in this
# file -- presumably supplied by the repository's .Rprofile; confirm before
# running standalone.
library(zoo)
library(GenomicRanges)
library(mixtools)

file = args[1]     # gzipped attribution table for this fold
fold = args[2]     # fold identifier, used in the output file name
species = args[3]  # 'human' or 'mouse'; selects the predictions table

if(species == 'human') expr=read.delim("all_crossvalidated_predictions.txt") else expr=read.delim("all_crossvalidated_predictions_mouse.txt")

kmerlen = 1   # reported interval width (1-mers)

a=fastread(paste("zcat ", file, sep=''))
ids=a[,1]                                 # gene IDs (first column)
preds = expr[match(ids, expr$Gene),"Pred"] # predicted expression for each gene
# Fit a 2-component Gaussian mixture to the predictions; `threshold` is the
# point where the posterior of the lower component drops to <=0.5, i.e. the
# boundary between the "low" and "high" expression populations.
mixmdl = normalmixEM(preds)
thresh2 = mixmdl$mu[2]   # NOTE(review): computed but never used below
post.df <- as.data.frame(cbind(x = mixmdl$x, mixmdl$posterior))
threshold = post.df[which(post.df$comp.1 == max(post.df$comp.1[post.df$comp.1 <= 0.5])),"x"]

# pdf("Fig6B.pdf",width=10,height=4) #general plot to make 6B histogram, but on full data rather than 1 fold
# plot(mixmdl,which=2)
# abline(v=threshold)
# dev.off()

# keep only the per-position attribution columns
# -- NOTE(review): FigS6.R trims the same files with ncol-7; verify the -6 here
a=a[,3:(ncol(a)-6)]

# low vs high
# per-position mean and one-sided 99.5% bound of the low-expression genes
zeromean=apply(a[preds < threshold,], 2, mean )
zerosd=apply(a[preds < threshold,], 2, function(x) qnorm(0.995)*sd(x) ) #99th% confidence interval of z-distribution

gr <- GRanges()   # NOTE(review): dead initialization; overwritten immediately below
# For every gene, collect the positions whose attribution exceeds the
# low-expression upper bound, as 1nt GRanges named by gene ID.
gr = suppressWarnings(do.call("c", sapply(1:nrow(a), function(seq) {
  idx = which(as.vector(unlist(lapply(1:ncol(a), function(x) { a[seq,x] > zeromean[x] + zerosd[x] })))) #| a[seq,x] < zeromean[x] - zerosd[x]
  GRanges(seqnames = rep(ids[seq],length(idx)),IRanges(start = idx, end = idx+kmerlen-1))
})))

# merge adjacent positions (reduce) and write <geneID, start, end, ...> rows
writefile(as.data.frame(reduce(gr)), paste("motif_analysis/bestpos1mer", fold, species, ".txt", sep=''), col.names=F)
import os, h5py, math
import numpy as np
import pandas as pd
import tensorflow as tf
from optparse import OptionParser
from tensorflow import keras
from keras.models import load_model, Model
from keras import backend as K
from deepexplain.tensorflow import DeepExplain

# Number of test-set genes attributed per DeepExplain call (bounds memory use).
batchsize = 500

def main():
    """Compute per-nucleotide attribution maps for one cross-validation fold.

    Positional args: <model_file> <out_dir> <fold>. Reads the fold's test set
    from <out_dir>/<fold>test.h5 and, for each attribution method, writes
    <out_dir>/<method>.<fold>.txt with one row per gene: the observed
    expression label, the 10,500 per-position attribution sums (scaled by
    10^3), and the attributions of the half-life features.
    """
    usage = 'usage: %prog [options] '
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 3:
        print(args)  # fix: was a Python-2-only `print args` statement; print() works on 2 and 3
        parser.error('Must provide data file and output directory')
    else:
        data_file = args[0]
        out_dir = args[1]
        fold = args[2]
        testfile = os.path.join(out_dir, fold+'test.h5')

    testfile = h5py.File(testfile, 'r')
    X_testhalflife, X_testpromoter, y_test, geneName = testfile['data'], testfile['promoter'], testfile['label'], testfile['geneName']
    model = load_model(data_file)

    with DeepExplain(session=K.get_session()) as de:
        input_tensor = model.inputs
        fModel = Model(inputs = input_tensor, outputs = model.outputs)
        for method in ['deeplift', 'grad*input', 'saliency', 'elrp', 'intgrad']: #'occlusion' not supported
            # accumulate per-batch frames and concatenate once at the end
            # (avoids the deprecated/removed DataFrame.append and quadratic copying)
            frames = []
            for i in range(0, int(math.ceil(len(geneName) / float(batchsize)))):
                first = i*batchsize
                last = min(i*batchsize+batchsize, len(geneName))
                # trim the stored 20Kb one-hot window to -7Kb..+3.5Kb around the TSS
                xs = X_testpromoter[first:last,3000:13500,:]
                xs2 = X_testhalflife[first:last,:]
                ys = y_test[first:last]
                gN = geneName[first:last]
                if method in ('intgrad', 'deeplift'): #try these methods with and without specified baseline
                    #empirical ACGT frequencies in -7Kb to +3.5Kb sequence surrounding human TSSs (for non-expressed genes only)
                    baseline = [np.repeat(np.array([[0.2617064, 0.2335449, 0.2379253, 0.2668234]]), 10500, axis=0),
                                np.zeros(6)]
                    attribs = de.explain(method, fModel(input_tensor), input_tensor, [xs, xs2], baseline = baseline)
                else:
                    attribs = de.explain(method, fModel(input_tensor), input_tensor, [xs, xs2])
                # renamed from `map` to avoid shadowing the builtin
                X, halflife = attribs[0], attribs[1]
                # collapse the one-hot axis: one attribution value per position, scaled by 10^3
                frame = pd.DataFrame(np.column_stack((ys, 10**3 * np.sum(X, 2), halflife)))
                frame.index = gN
                frames.append(frame)
            pdframe = pd.concat(frames) if frames else pd.DataFrame()
            pdframe.to_csv(os.path.join(out_dir, method.replace("*", "")+'.'+fold+'.txt'),sep='\t',header=False, float_format='%.3f')

if __name__ == '__main__':
    main()
#!/usr/bin/perl
# extract_kmer.pl -- pull the k-mer sequences listed on STDIN out of the
# promoter column of the species' input table, writing FASTA to STDOUT.
#
# Usage: extract_kmer.pl <human|mouse> < bestpos_intervals.txt > kmers.fa
# STDIN rows are tab-delimited: gene ID, start, <ignored>, length
# (the output of best_positions.R after reduce()).

$species = shift;

# column 1 = gene ID, column 11 = promoter sequence
if ($species eq "human"){ open IN, "zcat Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | "; }
else{ open IN, "zcat Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | "; }
# build gene ID -> promoter sequence lookup
# fix: the loop read "while(){" in the corrupted copy, which never reads IN
# and leaves %id2seq empty; restore the <IN> readline.
while(<IN>){
	($id, $seq) = split /\t/, $_;
	$id2seq{$id} = $seq;
}
close IN;

while(<>){ chomp;
	@a = split /\t/, $_;
	$id = $a[0];
	# +3000 maps a position in the trimmed -7Kb..+3.5Kb window back into the
	# full stored sequence (cf. the 3000:13500 slice in deep_explain_cv.py);
	# -1 converts the 1-based position to a 0-based substr offset.
	$start = $a[1]+3000-1;
	$len = $a[3];
	print ">$id\_$start\_$len\n", substr($id2seq{$id}, $start, $len), "\n";
}
version 4.12.0 9 | 10 | ALPHABET "DNA" DNA-LIKE 11 | A "Adenine" CC0000 ~ T "Thymine" 008000 12 | C "Cytosine" 0000CC ~ G "Guanine" FFB300 13 | N "Any base" = ACGT 14 | X = ACGT 15 | . = ACGT 16 | V "Not T" = ACG 17 | H "Not G" = ACT 18 | D "Not C" = AGT 19 | B "Not A" = CGT 20 | M "Amino" = AC 21 | R "Purine" = AG 22 | W "Weak" = AT 23 | S "Strong" = CG 24 | Y "Pyrimidine" = CT 25 | K "Keto" = GT 26 | U = T 27 | END ALPHABET 28 | 29 | strands: + - 30 | 31 | Background letter frequencies (from dataset): 32 | A 0.251 C 0.242 G 0.249 T 0.258 33 | 34 | 35 | MOTIF CG DREME-1 36 | 37 | # Word RC Word Pos Neg P-value E-value 38 | # BEST CG CG 225655 155300 4.9e-5418 3.4e-5412 39 | # CG CG 225655 155300 4.9e-5418 3.4e-5412 40 | 41 | letter-probability matrix: alength= 4 w= 2 nsites= 464497 E= 3.4e-5412 42 | 0.000000 1.000000 0.000000 0.000000 43 | 0.000000 0.000000 1.000000 0.000000 44 | 45 | 46 | MOTIF CACTGCAM DREME-2 47 | 48 | # Word RC Word Pos Neg P-value E-value 49 | # BEST CACTGCAM KTGCAGTG 9108 5326 3.5e-224 1.4e-218 50 | # CACTGCAA TTGCAGTG 5101 2647 8.5e-176 3.3e-170 51 | # CACTGCAC GTGCAGTG 4665 2988 1.6e-083 6.0e-078 52 | 53 | letter-probability matrix: alength= 4 w= 8 nsites= 9781 E= 1.4e-218 54 | 0.000000 1.000000 0.000000 0.000000 55 | 1.000000 0.000000 0.000000 0.000000 56 | 0.000000 1.000000 0.000000 0.000000 57 | 0.000000 0.000000 0.000000 1.000000 58 | 0.000000 0.000000 1.000000 0.000000 59 | 0.000000 1.000000 0.000000 0.000000 60 | 1.000000 0.000000 0.000000 0.000000 61 | 0.522135 0.477865 0.000000 0.000000 62 | 63 | 64 | MOTIF TCCCAGCW DREME-3 65 | 66 | # Word RC Word Pos Neg P-value E-value 67 | # BEST TCCCAGCW WGCTGGGA 10157 7016 1.3e-130 4.9e-125 68 | # TCCCAGCT AGCTGGGA 5479 3775 1.3e-071 5.0e-066 69 | # TCCCAGCA TGCTGGGA 4695 3248 2.1e-060 8.3e-055 70 | 71 | letter-probability matrix: alength= 4 w= 8 nsites= 10188 E= 4.9e-125 72 | 0.000000 0.000000 0.000000 1.000000 73 | 0.000000 1.000000 0.000000 0.000000 74 | 0.000000 1.000000 0.000000 0.000000 
75 | 0.000000 1.000000 0.000000 0.000000 76 | 1.000000 0.000000 0.000000 0.000000 77 | 0.000000 0.000000 1.000000 0.000000 78 | 0.000000 1.000000 0.000000 0.000000 79 | 0.461131 0.000000 0.000000 0.538869 80 | 81 | 82 | MOTIF GCCTCCHAAA DREME-4 83 | 84 | # Word RC Word Pos Neg P-value E-value 85 | # BEST GCCTCCHAAA TTTDGGAGGC 4692 2651 2.4e-128 9.3e-123 86 | # GCCTCCCAAA TTTGGGAGGC 4452 2498 2.5e-124 9.5e-119 87 | # GCCTCCTAAA TTTAGGAGGC 151 100 7.7e-004 3.0e+002 88 | # GCCTCCAAAA TTTTGGAGGC 91 53 9.7e-004 3.7e+002 89 | 90 | letter-probability matrix: alength= 4 w= 10 nsites= 4697 E= 9.3e-123 91 | 0.000000 0.000000 1.000000 0.000000 92 | 0.000000 1.000000 0.000000 0.000000 93 | 0.000000 1.000000 0.000000 0.000000 94 | 0.000000 0.000000 0.000000 1.000000 95 | 0.000000 1.000000 0.000000 0.000000 96 | 0.000000 1.000000 0.000000 0.000000 97 | 0.019374 0.948478 0.000000 0.032148 98 | 1.000000 0.000000 0.000000 0.000000 99 | 1.000000 0.000000 0.000000 0.000000 100 | 1.000000 0.000000 0.000000 0.000000 101 | 102 | 103 | MOTIF CAGGWGA DREME-5 104 | 105 | # Word RC Word Pos Neg P-value E-value 106 | # BEST CAGGWGA TCWCCTG 11536 8393 3.0e-113 1.2e-107 107 | # CAGGAGA TCTCCTG 7483 5442 9.6e-074 3.7e-068 108 | # CAGGTGA TCACCTG 4092 2982 1.7e-040 6.6e-035 109 | 110 | letter-probability matrix: alength= 4 w= 7 nsites= 11631 E= 1.2e-107 111 | 0.000000 1.000000 0.000000 0.000000 112 | 1.000000 0.000000 0.000000 0.000000 113 | 0.000000 0.000000 1.000000 0.000000 114 | 0.000000 0.000000 1.000000 0.000000 115 | 0.646290 0.000000 0.000000 0.353710 116 | 0.000000 0.000000 1.000000 0.000000 117 | 1.000000 0.000000 0.000000 0.000000 118 | 119 | 120 | MOTIF CCTGTAR DREME-6 121 | 122 | # Word RC Word Pos Neg P-value E-value 123 | # BEST CCTGTAR YTACAGG 11198 8131 3.8e-111 1.5e-105 124 | # CCTGTAA TTACAGG 8099 5757 3.1e-090 1.2e-084 125 | # CCTGTAG CTACAGG 3116 2383 1.7e-023 6.6e-018 126 | 127 | letter-probability matrix: alength= 4 w= 7 nsites= 11238 E= 1.5e-105 128 | 0.000000 1.000000 
0.000000 0.000000 129 | 0.000000 1.000000 0.000000 0.000000 130 | 0.000000 0.000000 0.000000 1.000000 131 | 0.000000 0.000000 1.000000 0.000000 132 | 0.000000 0.000000 0.000000 1.000000 133 | 1.000000 0.000000 0.000000 0.000000 134 | 0.722460 0.000000 0.277540 0.000000 135 | 136 | 137 | MOTIF CAGGMTGG DREME-7 138 | 139 | # Word RC Word Pos Neg P-value E-value 140 | # BEST CAGGMTGG CCAKCCTG 10211 7468 3.0e-097 1.1e-091 141 | # CAGGCTGG CCAGCCTG 8697 6311 1.2e-086 4.6e-081 142 | # CAGGATGG CCATCCTG 1520 1162 2.3e-012 9.0e-007 143 | 144 | letter-probability matrix: alength= 4 w= 8 nsites= 10232 E= 1.1e-091 145 | 0.000000 1.000000 0.000000 0.000000 146 | 1.000000 0.000000 0.000000 0.000000 147 | 0.000000 0.000000 1.000000 0.000000 148 | 0.000000 0.000000 1.000000 0.000000 149 | 0.148944 0.851056 0.000000 0.000000 150 | 0.000000 0.000000 0.000000 1.000000 151 | 0.000000 0.000000 1.000000 0.000000 152 | 0.000000 0.000000 1.000000 0.000000 153 | 154 | 155 | MOTIF GTGGHTCA DREME-8 156 | 157 | # Word RC Word Pos Neg P-value E-value 158 | # BEST GTGGHTCA TGADCCAC 5973 4156 2.2e-074 8.6e-069 159 | # GTGGCTCA TGAGCCAC 4517 3073 7.4e-063 2.9e-057 160 | # GTGGATCA TGATCCAC 1197 833 3.0e-016 1.2e-010 161 | # GTGGTTCA TGAACCAC 352 285 4.4e-003 1.7e+003 162 | 163 | letter-probability matrix: alength= 4 w= 8 nsites= 6067 E= 8.6e-069 164 | 0.000000 0.000000 1.000000 0.000000 165 | 0.000000 0.000000 0.000000 1.000000 166 | 0.000000 0.000000 1.000000 0.000000 167 | 0.000000 0.000000 1.000000 0.000000 168 | 0.197297 0.744520 0.000000 0.058184 169 | 0.000000 0.000000 0.000000 1.000000 170 | 0.000000 1.000000 0.000000 0.000000 171 | 1.000000 0.000000 0.000000 0.000000 172 | 173 | 174 | MOTIF GCTAATTTTK DREME-9 175 | 176 | # Word RC Word Pos Neg P-value E-value 177 | # BEST GCTAATTTTK MAAAATTAGC 3715 2529 8.0e-052 3.1e-046 178 | # GCTAATTTTT AAAAATTAGC 3318 2263 4.6e-046 1.8e-040 179 | # GCTAATTTTG CAAAATTAGC 397 267 2.5e-007 9.7e-002 180 | 181 | letter-probability matrix: alength= 4 w= 10 
nsites= 3715 E= 3.1e-046 182 | 0.000000 0.000000 1.000000 0.000000 183 | 0.000000 1.000000 0.000000 0.000000 184 | 0.000000 0.000000 0.000000 1.000000 185 | 1.000000 0.000000 0.000000 0.000000 186 | 1.000000 0.000000 0.000000 0.000000 187 | 0.000000 0.000000 0.000000 1.000000 188 | 0.000000 0.000000 0.000000 1.000000 189 | 0.000000 0.000000 0.000000 1.000000 190 | 0.000000 0.000000 0.000000 1.000000 191 | 0.000000 0.000000 0.106864 0.893136 192 | 193 | 194 | MOTIF GCYAACA DREME-10 195 | 196 | # Word RC Word Pos Neg P-value E-value 197 | # BEST GCYAACA TGTTRGC 4892 3534 1.8e-050 7.0e-045 198 | # GCCAACA TGTTGGC 3302 2457 2.5e-029 9.6e-024 199 | # GCTAACA TGTTAGC 1596 1081 9.2e-024 3.5e-018 200 | 201 | letter-probability matrix: alength= 4 w= 7 nsites= 4906 E= 7.0e-045 202 | 0.000000 0.000000 1.000000 0.000000 203 | 0.000000 1.000000 0.000000 0.000000 204 | 0.000000 0.673869 0.000000 0.326131 205 | 1.000000 0.000000 0.000000 0.000000 206 | 1.000000 0.000000 0.000000 0.000000 207 | 0.000000 1.000000 0.000000 0.000000 208 | 1.000000 0.000000 0.000000 0.000000 209 | 210 | 211 | MOTIF CTTGAACC DREME-11 212 | 213 | # Word RC Word Pos Neg P-value E-value 214 | # BEST CTTGAACC GGTTCAAG 3267 2202 7.8e-048 3.0e-042 215 | # CTTGAACC GGTTCAAG 3267 2202 7.8e-048 3.0e-042 216 | 217 | letter-probability matrix: alength= 4 w= 8 nsites= 3268 E= 3.0e-042 218 | 0.000000 1.000000 0.000000 0.000000 219 | 0.000000 0.000000 0.000000 1.000000 220 | 0.000000 0.000000 0.000000 1.000000 221 | 0.000000 0.000000 1.000000 0.000000 222 | 1.000000 0.000000 0.000000 0.000000 223 | 1.000000 0.000000 0.000000 0.000000 224 | 0.000000 1.000000 0.000000 0.000000 225 | 0.000000 1.000000 0.000000 0.000000 226 | 227 | 228 | MOTIF AGGTCARGAG DREME-12 229 | 230 | # Word RC Word Pos Neg P-value E-value 231 | # BEST AGGTCARGAG CTCYTGACCT 2305 1433 5.7e-047 2.2e-041 232 | # AGGTCAGGAG CTCCTGACCT 2047 1295 2.3e-039 9.0e-034 233 | # AGGTCAAGAG CTCTTGACCT 258 139 1.2e-009 4.6e-004 234 | 235 | letter-probability 
matrix: alength= 4 w= 10 nsites= 2305 E= 2.2e-041 236 | 1.000000 0.000000 0.000000 0.000000 237 | 0.000000 0.000000 1.000000 0.000000 238 | 0.000000 0.000000 1.000000 0.000000 239 | 0.000000 0.000000 0.000000 1.000000 240 | 0.000000 1.000000 0.000000 0.000000 241 | 1.000000 0.000000 0.000000 0.000000 242 | 0.111931 0.000000 0.888069 0.000000 243 | 0.000000 0.000000 1.000000 0.000000 244 | 1.000000 0.000000 0.000000 0.000000 245 | 0.000000 0.000000 1.000000 0.000000 246 | 247 | 248 | MOTIF CTCAGCCYC DREME-13 249 | 250 | # Word RC Word Pos Neg P-value E-value 251 | # BEST CTCAGCCYC GRGGCTGAG 4962 3671 9.6e-045 3.7e-039 252 | # CTCAGCCTC GAGGCTGAG 4594 3388 2.6e-042 1.0e-036 253 | # CTCAGCCCC GGGGCTGAG 368 283 4.9e-004 1.9e+002 254 | 255 | letter-probability matrix: alength= 4 w= 9 nsites= 4964 E= 3.7e-039 256 | 0.000000 1.000000 0.000000 0.000000 257 | 0.000000 0.000000 0.000000 1.000000 258 | 0.000000 1.000000 0.000000 0.000000 259 | 1.000000 0.000000 0.000000 0.000000 260 | 0.000000 0.000000 1.000000 0.000000 261 | 0.000000 1.000000 0.000000 0.000000 262 | 0.000000 1.000000 0.000000 0.000000 263 | 0.000000 0.074134 0.000000 0.925866 264 | 0.000000 1.000000 0.000000 0.000000 265 | 266 | 267 | MOTIF CACMA DREME-14 268 | 269 | # Word RC Word Pos Neg P-value E-value 270 | # BEST CACMA TKGTG 48806 45155 4.0e-037 1.5e-031 271 | # CACCA TGGTG 29802 27221 1.8e-029 6.8e-024 272 | # CACAA TTGTG 20742 19569 1.0e-009 4.0e-004 273 | 274 | letter-probability matrix: alength= 4 w= 5 nsites= 52085 E= 1.5e-031 275 | 0.000000 1.000000 0.000000 0.000000 276 | 1.000000 0.000000 0.000000 0.000000 277 | 0.000000 1.000000 0.000000 0.000000 278 | 0.409523 0.590477 0.000000 0.000000 279 | 1.000000 0.000000 0.000000 0.000000 280 | 281 | 282 | MOTIF AAAAATAC DREME-15 283 | 284 | # Word RC Word Pos Neg P-value E-value 285 | # BEST AAAAATAC GTATTTTT 4100 3055 1.0e-035 3.8e-030 286 | # AAAAATAC GTATTTTT 4100 3055 1.0e-035 3.8e-030 287 | 288 | letter-probability matrix: alength= 4 w= 8 nsites= 
4103 E= 3.8e-030 289 | 1.000000 0.000000 0.000000 0.000000 290 | 1.000000 0.000000 0.000000 0.000000 291 | 1.000000 0.000000 0.000000 0.000000 292 | 1.000000 0.000000 0.000000 0.000000 293 | 1.000000 0.000000 0.000000 0.000000 294 | 0.000000 0.000000 0.000000 1.000000 295 | 1.000000 0.000000 0.000000 0.000000 296 | 0.000000 1.000000 0.000000 0.000000 297 | 298 | 299 | MOTIF AGTGCAATG DREME-16 300 | 301 | # Word RC Word Pos Neg P-value E-value 302 | # BEST AGTGCAATG CATTGCACT 969 551 2.3e-027 8.4e-022 303 | # AGTGCAATG CATTGCACT 969 551 2.3e-027 8.4e-022 304 | 305 | letter-probability matrix: alength= 4 w= 9 nsites= 969 E= 8.4e-022 306 | 1.000000 0.000000 0.000000 0.000000 307 | 0.000000 0.000000 1.000000 0.000000 308 | 0.000000 0.000000 0.000000 1.000000 309 | 0.000000 0.000000 1.000000 0.000000 310 | 0.000000 1.000000 0.000000 0.000000 311 | 1.000000 0.000000 0.000000 0.000000 312 | 1.000000 0.000000 0.000000 0.000000 313 | 0.000000 0.000000 0.000000 1.000000 314 | 0.000000 0.000000 1.000000 0.000000 315 | 316 | 317 | MOTIF CACYTG DREME-17 318 | 319 | # Word RC Word Pos Neg P-value E-value 320 | # BEST CACYTG CARGTG 13283 11654 5.5e-026 2.0e-020 321 | # CACCTG CAGGTG 7814 6606 1.7e-024 6.2e-019 322 | # CACTTG CAAGTG 5564 5132 1.4e-005 5.0e+000 323 | 324 | letter-probability matrix: alength= 4 w= 6 nsites= 13533 E= 2.0e-020 325 | 0.000000 1.000000 0.000000 0.000000 326 | 1.000000 0.000000 0.000000 0.000000 327 | 0.000000 1.000000 0.000000 0.000000 328 | 0.000000 0.585901 0.000000 0.414099 329 | 0.000000 0.000000 0.000000 1.000000 330 | 0.000000 0.000000 1.000000 0.000000 331 | 332 | 333 | MOTIF GGGTTTCWC DREME-18 334 | 335 | # Word RC Word Pos Neg P-value E-value 336 | # BEST GGGTTTCWC GWGAAACCC 1289 910 2.9e-016 1.1e-010 337 | # GGGTTTCAC GTGAAACCC 966 662 2.4e-014 8.7e-009 338 | # GGGTTTCTC GAGAAACCC 323 248 9.6e-004 3.5e+002 339 | 340 | letter-probability matrix: alength= 4 w= 9 nsites= 1289 E= 1.1e-010 341 | 0.000000 0.000000 1.000000 0.000000 342 | 0.000000 
0.000000 1.000000 0.000000 343 | 0.000000 0.000000 1.000000 0.000000 344 | 0.000000 0.000000 0.000000 1.000000 345 | 0.000000 0.000000 0.000000 1.000000 346 | 0.000000 0.000000 0.000000 1.000000 347 | 0.000000 1.000000 0.000000 0.000000 348 | 0.749418 0.000000 0.000000 0.250582 349 | 0.000000 1.000000 0.000000 0.000000 350 | 351 | 352 | MOTIF CTCMTGATC DREME-19 353 | 354 | # Word RC Word Pos Neg P-value E-value 355 | # BEST CTCMTGATC GATCAKGAG 403 213 8.0e-015 3.0e-009 356 | # CTCATGATC GATCATGAG 242 108 3.0e-013 1.1e-007 357 | # CTCCTGATC GATCAGGAG 164 105 1.9e-004 7.1e+001 358 | 359 | letter-probability matrix: alength= 4 w= 9 nsites= 403 E= 3.0e-009 360 | 0.000000 1.000000 0.000000 0.000000 361 | 0.000000 0.000000 0.000000 1.000000 362 | 0.000000 1.000000 0.000000 0.000000 363 | 0.595533 0.404467 0.000000 0.000000 364 | 0.000000 0.000000 0.000000 1.000000 365 | 0.000000 0.000000 1.000000 0.000000 366 | 1.000000 0.000000 0.000000 0.000000 367 | 0.000000 0.000000 0.000000 1.000000 368 | 0.000000 1.000000 0.000000 0.000000 369 | 370 | 371 | MOTIF GGCAGAGS DREME-20 372 | 373 | # Word RC Word Pos Neg P-value E-value 374 | # BEST GGCAGAGS SCTCTGCC 2879 2383 3.7e-012 1.3e-006 375 | # GGCAGAGG CCTCTGCC 2178 1800 1.0e-009 3.7e-004 376 | # GGCAGAGC GCTCTGCC 704 587 6.1e-004 2.3e+002 377 | 378 | letter-probability matrix: alength= 4 w= 8 nsites= 2885 E= 1.3e-006 379 | 0.000000 0.000000 1.000000 0.000000 380 | 0.000000 0.000000 1.000000 0.000000 381 | 0.000000 1.000000 0.000000 0.000000 382 | 1.000000 0.000000 0.000000 0.000000 383 | 0.000000 0.000000 1.000000 0.000000 384 | 1.000000 0.000000 0.000000 0.000000 385 | 0.000000 0.000000 1.000000 0.000000 386 | 0.000000 0.243674 0.756326 0.000000 387 | 388 | 389 | MOTIF GGAGGTGGA DREME-21 390 | 391 | # Word RC Word Pos Neg P-value E-value 392 | # BEST GGAGGTGGA TCCACCTCC 1070 787 2.6e-011 9.7e-006 393 | # GGAGGTGGA TCCACCTCC 1070 787 2.6e-011 9.7e-006 394 | 395 | letter-probability matrix: alength= 4 w= 9 nsites= 1072 E= 
9.7e-006 396 | 0.000000 0.000000 1.000000 0.000000 397 | 0.000000 0.000000 1.000000 0.000000 398 | 1.000000 0.000000 0.000000 0.000000 399 | 0.000000 0.000000 1.000000 0.000000 400 | 0.000000 0.000000 1.000000 0.000000 401 | 0.000000 0.000000 0.000000 1.000000 402 | 0.000000 0.000000 1.000000 0.000000 403 | 0.000000 0.000000 1.000000 0.000000 404 | 1.000000 0.000000 0.000000 0.000000 405 | 406 | 407 | MOTIF AYCTTGGC DREME-22 408 | 409 | # Word RC Word Pos Neg P-value E-value 410 | # BEST AYCTTGGC GCCAAGRT 1614 1265 3.9e-011 1.4e-005 411 | # ATCTTGGC GCCAAGAT 1100 854 1.4e-008 5.1e-003 412 | # ACCTTGGC GCCAAGGT 514 411 3.9e-004 1.4e+002 413 | 414 | letter-probability matrix: alength= 4 w= 8 nsites= 1614 E= 1.4e-005 415 | 1.000000 0.000000 0.000000 0.000000 416 | 0.000000 0.318463 0.000000 0.681537 417 | 0.000000 1.000000 0.000000 0.000000 418 | 0.000000 0.000000 0.000000 1.000000 419 | 0.000000 0.000000 0.000000 1.000000 420 | 0.000000 0.000000 1.000000 0.000000 421 | 0.000000 0.000000 1.000000 0.000000 422 | 0.000000 1.000000 0.000000 0.000000 423 | 424 | 425 | MOTIF GGCAGATCA DREME-23 426 | 427 | # Word RC Word Pos Neg P-value E-value 428 | # BEST GGCAGATCA TGATCTGCC 476 311 2.2e-009 8.0e-004 429 | # GGCAGATCA TGATCTGCC 476 311 2.2e-009 8.0e-004 430 | 431 | letter-probability matrix: alength= 4 w= 9 nsites= 476 E= 8.0e-004 432 | 0.000000 0.000000 1.000000 0.000000 433 | 0.000000 0.000000 1.000000 0.000000 434 | 0.000000 1.000000 0.000000 0.000000 435 | 1.000000 0.000000 0.000000 0.000000 436 | 0.000000 0.000000 1.000000 0.000000 437 | 1.000000 0.000000 0.000000 0.000000 438 | 0.000000 0.000000 0.000000 1.000000 439 | 0.000000 1.000000 0.000000 0.000000 440 | 1.000000 0.000000 0.000000 0.000000 441 | 442 | 443 | MOTIF AGTCTTGCTC DREME-24 444 | 445 | # Word RC Word Pos Neg P-value E-value 446 | # BEST AGTCTTGCTC GAGCAAGACT 604 431 4.1e-008 1.5e-002 447 | # AGTCTTGCTC GAGCAAGACT 604 431 4.1e-008 1.5e-002 448 | 449 | letter-probability matrix: alength= 4 w= 10 nsites= 
604 E= 1.5e-002 450 | 1.000000 0.000000 0.000000 0.000000 451 | 0.000000 0.000000 1.000000 0.000000 452 | 0.000000 0.000000 0.000000 1.000000 453 | 0.000000 1.000000 0.000000 0.000000 454 | 0.000000 0.000000 0.000000 1.000000 455 | 0.000000 0.000000 0.000000 1.000000 456 | 0.000000 0.000000 1.000000 0.000000 457 | 0.000000 1.000000 0.000000 0.000000 458 | 0.000000 0.000000 0.000000 1.000000 459 | 0.000000 1.000000 0.000000 0.000000 460 | 461 | 462 | MOTIF GTGTTGGGA DREME-25 463 | 464 | # Word RC Word Pos Neg P-value E-value 465 | # BEST GTGTTGGGA TCCCAACAC 425 282 4.2e-008 1.5e-002 466 | # GTGTTGGGA TCCCAACAC 425 282 4.2e-008 1.5e-002 467 | 468 | letter-probability matrix: alength= 4 w= 9 nsites= 426 E= 1.5e-002 469 | 0.000000 0.000000 1.000000 0.000000 470 | 0.000000 0.000000 0.000000 1.000000 471 | 0.000000 0.000000 1.000000 0.000000 472 | 0.000000 0.000000 0.000000 1.000000 473 | 0.000000 0.000000 0.000000 1.000000 474 | 0.000000 0.000000 1.000000 0.000000 475 | 0.000000 0.000000 1.000000 0.000000 476 | 0.000000 0.000000 1.000000 0.000000 477 | 1.000000 0.000000 0.000000 0.000000 478 | 479 | 480 | # Stopping reason: E-value threshold exceeded 481 | # Running time: 44160.40 seconds 482 | -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/humangradinput.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/humangradinput.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/humanintgrad.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/humanintgrad.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/motif_analysis: 
-------------------------------------------------------------------------------- 1 | ../datasets/motif_analysis/ -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mouse_all_1mer_400K/dreme.txt: -------------------------------------------------------------------------------- 1 | # DREME 4.12.0 2 | # command: dreme -oc mouse_all_1mer_400K/ -p subsampled_bestpos1mer_mouse_all_400K.fa -n subsampled_negbestpos1mer_mouse_all_400K.fa -mink 2 -maxk 10 3 | # positives: 400000 from subsampled_bestpos1mer_mouse_all_400K.fa (Mon Mar 26 23:42:00 PDT 2018) 4 | # negatives: 400000 from subsampled_negbestpos1mer_mouse_all_400K.fa (Mon Mar 26 23:42:14 PDT 2018) 5 | # host: parvati.grid.gs.washington.edu 6 | # when: Tue Mar 27 00:03:55 PDT 2018 7 | 8 | MEME version 4.12.0 9 | 10 | ALPHABET "DNA" DNA-LIKE 11 | A "Adenine" CC0000 ~ T "Thymine" 008000 12 | C "Cytosine" 0000CC ~ G "Guanine" FFB300 13 | N "Any base" = ACGT 14 | X = ACGT 15 | . = ACGT 16 | V "Not T" = ACG 17 | H "Not G" = ACT 18 | D "Not C" = AGT 19 | B "Not A" = CGT 20 | M "Amino" = AC 21 | R "Purine" = AG 22 | W "Weak" = AT 23 | S "Strong" = CG 24 | Y "Pyrimidine" = CT 25 | K "Keto" = GT 26 | U = T 27 | END ALPHABET 28 | 29 | strands: + - 30 | 31 | Background letter frequencies (from dataset): 32 | A 0.260 C 0.232 G 0.239 T 0.269 33 | 34 | 35 | MOTIF CG DREME-1 36 | 37 | # Word RC Word Pos Neg P-value E-value 38 | # BEST CG CG 186358 136959 4.6e-2762 3.2e-2756 39 | # CG CG 186358 136959 4.6e-2762 3.2e-2756 40 | 41 | letter-probability matrix: alength= 4 w= 2 nsites= 419330 E= 3.2e-2756 42 | 0.000000 1.000000 0.000000 0.000000 43 | 0.000000 0.000000 1.000000 0.000000 44 | 45 | 46 | MOTIF CAVC DREME-2 47 | 48 | # Word RC Word Pos Neg P-value E-value 49 | # BEST CAVC GBTG 207580 196664 6.8e-132 2.6e-126 50 | # CACC GGTG 98623 89412 1.3e-130 5.1e-125 51 | # CAGC GCTG 113609 107884 1.0e-046 4.0e-041 52 | # CAAC GTTG 72750 69684 1.7e-019 6.4e-014 53 | 54 | letter-probability 
matrix: alength= 4 w= 4 nsites= 352826 E= 2.6e-126 55 | 0.000000 1.000000 0.000000 0.000000 56 | 1.000000 0.000000 0.000000 0.000000 57 | 0.237108 0.350895 0.411996 0.000000 58 | 0.000000 1.000000 0.000000 0.000000 59 | 60 | 61 | MOTIF CA DREME-3 62 | 63 | # Word RC Word Pos Neg P-value E-value 64 | # BEST CA TG 387026 384137 1.3e-067 4.1e-062 65 | # CA TG 387026 384137 1.3e-067 4.1e-062 66 | 67 | letter-probability matrix: alength= 4 w= 2 nsites= 1816073 E= 4.1e-062 68 | 0.000000 1.000000 0.000000 0.000000 69 | 1.000000 0.000000 0.000000 0.000000 70 | 71 | 72 | MOTIF GGY DREME-4 73 | 74 | # Word RC Word Pos Neg P-value E-value 75 | # BEST GGY RCC 178464 174623 2.7e-018 1.8e-013 76 | # GGT ACC 120888 117686 2.6e-015 1.7e-010 77 | # GGC GCC 91497 89041 2.6e-011 1.7e-006 78 | 79 | letter-probability matrix: alength= 4 w= 3 nsites= 277850 E= 1.8e-013 80 | 0.000000 0.000000 1.000000 0.000000 81 | 0.000000 0.000000 1.000000 0.000000 82 | 0.000000 0.426817 0.000000 0.573183 83 | 84 | 85 | MOTIF CCTTTARTCC DREME-5 86 | 87 | # Word RC Word Pos Neg P-value E-value 88 | # BEST CCTTTARTCC GGAYTAAAGG 1380 1001 3.8e-015 1.7e-010 89 | # CCTTTAATCC GGATTAAAGG 1304 952 6.1e-014 2.8e-009 90 | # CCTTTAGTCC GGACTAAAGG 76 49 9.8e-003 4.5e+002 91 | 92 | letter-probability matrix: alength= 4 w= 10 nsites= 1382 E= 1.7e-010 93 | 0.000000 1.000000 0.000000 0.000000 94 | 0.000000 1.000000 0.000000 0.000000 95 | 0.000000 0.000000 0.000000 1.000000 96 | 0.000000 0.000000 0.000000 1.000000 97 | 0.000000 0.000000 0.000000 1.000000 98 | 1.000000 0.000000 0.000000 0.000000 99 | 0.945007 0.000000 0.054993 0.000000 100 | 0.000000 0.000000 0.000000 1.000000 101 | 0.000000 1.000000 0.000000 0.000000 102 | 0.000000 1.000000 0.000000 0.000000 103 | 104 | 105 | # Stopping reason: E-value threshold exceeded 106 | # Running time: 14961.69 seconds 107 | -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mouse_all_1mer_400K/dreme.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | dreme -oc mouse_all_1mer_400K/ -p subsampled_bestpos1mer_mouse_all_400K.fa -n subsampled_negbestpos1mer_mouse_all_400K.fa -mink 2 -maxk 10 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | both 24 | 25 | 26 | 100 27 | 0.01 28 | 1 29 | parvati.grid.gs.washington.edu 30 | Tue Mar 27 00:03:55 PDT 2018 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mousegradinput.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/mousegradinput.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mouseintgrad.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/mouseintgrad.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/pM10Kb_Mouse_cv: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/pM10Kb_Mouse_cv/ -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/pM10Kb_cv: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/pM10Kb_cv/ -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/runme.sh: -------------------------------------------------------------------------------- 1 | # Deep Explain CV computes the following methods: 2 | # 
'deeplift', 'grad*input', 'saliency', 'elrp', 'intgrad' 3 | # 'grad*input' and 'intgrad' are presented in the paper for the reasons described 4 | # thus only these two have stored precomputed results to save space in download 5 | 6 | #human (using best model from 10 trials on each of 10 folds of data) 7 | python deep_explain_cv.py cv_human/01trainepoch.10-0.427.h5 pM10Kb_cv/ 1 8 | python deep_explain_cv.py cv_human/62trainepoch.13-0.421.h5 pM10Kb_cv/ 2 9 | python deep_explain_cv.py cv_human/63trainepoch.10-0.428.h5 pM10Kb_cv/ 3 10 | python deep_explain_cv.py cv_human/24trainepoch.09-0.427.h5 pM10Kb_cv/ 4 11 | python deep_explain_cv.py cv_human/25trainepoch.16-0.422.h5 pM10Kb_cv/ 5 12 | python deep_explain_cv.py cv_human/76trainepoch.11-0.414.h5 pM10Kb_cv/ 6 13 | python deep_explain_cv.py cv_human/17trainepoch.09-0.414.h5 pM10Kb_cv/ 7 14 | python deep_explain_cv.py cv_human/08trainepoch.11-0.418.h5 pM10Kb_cv/ 8 15 | python deep_explain_cv.py cv_human/79trainepoch.14-0.436.h5 pM10Kb_cv/ 9 16 | python deep_explain_cv.py cv_human/110trainepoch.08-0.425.h5 pM10Kb_cv/ 10 17 | 18 | #Fig6A and FigS6A-B 19 | Rscript FigS6.R pM10Kb_cv/ human #resulting plots in {human/mouse}{gradinput/intgrad}.pdf 20 | 21 | # for human (Fig6C and S7) 22 | Rscript Fig6C_S7_S8.R Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz FigS7.pdf 23 | 24 | for x in {1..10}; do { Rscript best_positions.R pM10Kb_cv/gradinput.$x.txt.gz $x human; } done 25 | 26 | cd motif_analysis/ 27 | #generate permuted set 28 | for x in {1..10}; do { cut -f 1 negbestpos1mer$x\human.txt | shuf | paste - <(cut -f 2- bestpos1mer$x\human.txt) >negbestpos1mer$x\human.txt; } done 29 | for x in {1..10}; do { ./extract_kmers.pl human bestpos1mer$x\human.fa; } done 30 | for x in {1..10}; do { ./extract_kmers.pl human negbestpos1mer$x\human.fa; } done 31 | cat bestpos1mer*human.fa >bestpos1mer_human_all.fa 32 | cat negbestpos1mer*human.fa >negbestpos1mer_human_all.fa 33 | fasta-subsample bestpos1mer_human_all.fa 400000 
>subsampled_bestpos1mer_human_all_400K.fa 34 | fasta-subsample negbestpos1mer_human_all.fa 400000 >subsampled_negbestpos1mer_human_all_400K.fa 35 | #Fig6B human 36 | dreme -oc human_all_1mer_400K/ -p subsampled_bestpos1mer_human_all_400K.fa -n subsampled_negbestpos1mer_human_all_400K.fa -mink 2 -maxk 10 37 | cd .. 38 | 39 | #mouse (using best model from 10 trials on each of 10 folds of data) 40 | python deep_explain_cv.py cv_mouse/71trainepoch.07-0.3200.h5 pM10Kb_Mouse_cv/ 1 41 | python deep_explain_cv.py cv_mouse/02trainepoch.07-0.3186.h5 pM10Kb_Mouse_cv/ 2 42 | python deep_explain_cv.py cv_mouse/63trainepoch.06-0.3173.h5 pM10Kb_Mouse_cv/ 3 43 | python deep_explain_cv.py cv_mouse/64trainepoch.09-0.3194.h5 pM10Kb_Mouse_cv/ 4 44 | python deep_explain_cv.py cv_mouse/45trainepoch.13-0.3113.h5 pM10Kb_Mouse_cv/ 5 45 | python deep_explain_cv.py cv_mouse/96trainepoch.07-0.3134.h5 pM10Kb_Mouse_cv/ 6 46 | python deep_explain_cv.py cv_mouse/77trainepoch.07-0.3223.h5 pM10Kb_Mouse_cv/ 7 47 | python deep_explain_cv.py cv_mouse/88trainepoch.06-0.3243.h5 pM10Kb_Mouse_cv/ 8 48 | python deep_explain_cv.py cv_mouse/79trainepoch.07-0.3171.h5 pM10Kb_Mouse_cv/ 9 49 | python deep_explain_cv.py cv_mouse/610trainepoch.10-0.3200.h5 pM10Kb_Mouse_cv/ 10 50 | 51 | #FigS6C-D 52 | Rscript FigS6.R pM10Kb_Mouse_cv/ mouse 53 | 54 | for x in {1..10}; do { Rscript best_positions.R pM10Kb_Mouse_cv/gradinput.$x.txt.gz $x mouse; } done 55 | 56 | cd motif_analysis/ 57 | #generate permuted set 58 | for x in {1..10}; do { cut -f 1 negbestpos1mer$x\mouse.txt | shuf | paste - <(cut -f 2- bestpos1mer$x\mouse.txt) >negbestpos1mer$x\mouse.txt; } done 59 | for x in {1..10}; do { ./extract_kmers.pl mouse bestpos1mer$x\mouse.fa; } done 60 | for x in {1..10}; do { ./extract_kmers.pl mouse negbestpos1mer$x\mouse.fa; } done 61 | cat bestpos1mer*mouse.fa >bestpos1mer_mouse_all.fa 62 | cat negbestpos1mer*mouse.fa >negbestpos1mer_mouse_all.fa 63 | fasta-subsample bestpos1mer_mouse_all.fa 400000 
>subsampled_bestpos1mer_mouse_all_400K.fa 64 | fasta-subsample negbestpos1mer_mouse_all.fa 400000 >subsampled_negbestpos1mer_mouse_all_400K.fa 65 | #Fig6B mouse 66 | dreme -oc mouse_all_1mer_400K/ -p subsampled_bestpos1mer_mouse_all_400K.fa -n subsampled_negbestpos1mer_mouse_all_400K.fa -mink 2 -maxk 10 67 | cd .. 68 | 69 | # for mouse (FigS8) 70 | Rscript Fig6C_S7_S8.R Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz FigS8.pdf 71 | -------------------------------------------------------------------------------- /FigS1/57epigenomes.RPKM.pc.gz: -------------------------------------------------------------------------------- 1 | ../datasets/57epigenomes.RPKM.pc.gz -------------------------------------------------------------------------------- /FigS1/EG.name.txt: -------------------------------------------------------------------------------- 1 | ../datasets/EG.name.txt -------------------------------------------------------------------------------- /FigS1/FigS1.R: -------------------------------------------------------------------------------- 1 | library(gplots) 2 | 3 | x=read.delim(gzfile("57epigenomes.RPKM.pc.gz"), row.names=1) 4 | x$E000=NULL 5 | names=read.delim("EG.name.txt",F) 6 | colnames(x)=paste(colnames(x), gsub("_", " ", as.character(unlist(sapply(colnames(x), function(x) names[names$V1==x, "V2"]))))) 7 | 8 | y=as.matrix(cor(x, method='spearman')) 9 | colnames(y)=colnames(x) 10 | 11 | pdf("FigS1.pdf", height=10, width=10) 12 | par(oma=c(16,1,1,14)) 13 | heatmap.2(y, trace="none", breaks=seq(0,1,0.05), #,density.info="none" 14 | symkey=FALSE, cexRow=0.6, cexCol=0.6, dendrogram="row", key=TRUE, col=matlab::jet.colors(20), denscol="black") 15 | dev.off() 16 | -------------------------------------------------------------------------------- /FigS1/FigS1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/FigS1/FigS1.pdf 
-------------------------------------------------------------------------------- /FigS1/runme.sh: -------------------------------------------------------------------------------- 1 | Rscript FigS1.R -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright 2018 Vikram Agarwal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Xpresso: Predicting gene expression levels from genomic sequence 4 | 5 | This repository is intended to accompany our publication, primarily to enhance the reproducibility of our results. For more information please refer to: 6 | 7 | Agarwal V, Shendure J. 
[Predicting mRNA abundance directly from genomic sequence using deep convolutional neural networks](https://www.cell.com/cell-reports/pdf/S2211-1247(20)30616-1.pdf). 2020. **_Cell Reports_** 31 (7), 107663. [Youtube talk introducing the paper](https://youtu.be/xSdIJc-grXQ). 8 | 9 | 10 | These tools can be used in a variety of organisms and cell types of interest to: 11 | 12 | * Perform hyperparameter optimization in the gene expression prediction task (as shown in **Fig 1**) 13 | * Perform evolutionary analyses on human and mouse organisms, as well as one-to-one orthologs of each (as shown in **Fig 2**) 14 | * Uncover modes of gene regulation in a cell type of interest that are operating at the transcriptional and post-transcriptional levels (as shown in **Fig 3**) 15 | * Evaluate model performance for cell type-specifc and cell type-agnostic models (as shown in **Fig 4**) 16 | * Predict transcriptional activity across a genomic locus (as shown in **Fig 5**) 17 | * Interpret deep learning models to learn about promoter properties (as shown in **Fig 6**) 18 | 19 | If you find our code or predictions to be helpful for your work, please cite the paper above. 
20 | 21 | 22 | # Dependencies for running entire pipeline: 23 | * Python3 modules: numpy, h5py, pandas, sklearn, keras (>=2.2.4-tf), hyperopt, biopython 24 | 25 | * R libraries: LSD, data.table, latticeExtra, Biostrings, rhdf5, ROCR, gplots, mixtools, reshape2, beeswarm, RColorBrewer, zoo, GenomicRanges 26 | 27 | * [TensorFlow (>=1.15.0)](https://www.tensorflow.org/install/) 28 | 29 | * [DeepExplain](https://github.com/marcoancona/DeepExplain) 30 | 31 | * [The MEME Suite](http://meme-suite.org/doc/download.html?man_type=web) 32 | 33 | * [UCSC tools](http://hgdownload.soe.ucsc.edu/downloads.html#source_downloads) installation, including bigBedToBed 34 | 35 | * [BEDTools](https://github.com/arq5x/bedtools2/releases) 36 | 37 | # Instructions for use 38 | 39 | For R code to work properly, please copy the contents of .Rprofile in this folder to your local .Rprofile. 40 | 41 | Users are advised to read the code closely and modify commented pieces as appropriate to acquire 42 | desired output for your environment. For example, you will need to download all of the additional 43 | R library and Python module dependencies for the code to work. This being said, if you find crucial 44 | files are missing, making the code unusable, or if you identify a major problem in the code, please 45 | raise a Github issue. 46 | 47 | In each Figure's folder, change directories to it and please read the file "runme.sh" first as it provides a general overview of relevant commands that were used sequentially to pre-process the data and generate the figures. 48 | 49 | **OPTIONAL**: For full functionality and to fix symbolic links, run the following command in the base Xpresso directory to download the associated datapack: 50 | 51 | `wget -r -np -nH --reject "index.html*" --cut-dirs 5 https://krishna.gs.washington.edu/content/members/vagar/Xpresso/data/datasets/` 52 | 53 | The figures will link to this folder accordingly. 
Some of the files need to be decompressed, and not all files are provided due to minimize the package size (currently ~11Gb). If you need additional files not provided for the purpose of reproduction, please contact Vikram Agarwal (vagar {at} calicolabs {dot} com). 54 | 55 | # Colab and Xpresso website 56 | 57 | Simpler tools for a broad overview and to deploy pre-trained models can be accessed at the Xpresso [website](https://xpresso.gs.washington.edu/). You can also start training models and generating predictions quickly using the iPython Notebook, or open it in Google Colab to get up to use a cloud GPU with this link: 58 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/vagarwal87/bdd33e66fa2c59c41409ca47e7132e61/xpresso.ipynb) 59 | 60 | **Note: The Colab generates predictions on a FASTA file of arbitrary DNA sequences without considering mRNA half-life features. To consider half-life features, one must prepare the full test file as shown 61 | in the datapack and Fig1_S2/** 62 | -------------------------------------------------------------------------------- /allfxns.pm: -------------------------------------------------------------------------------- 1 | package allfxns; 2 | 3 | use Getopt::Long; 4 | use POSIX qw/ceil floor/; 5 | use List::Util qw/min max/; 6 | #use Math::CDF qw/qnorm/; 7 | #use List::MoreUtils qw/uniq/; 8 | use Env; Env::import(); 9 | 10 | @ISA = qw(Exporter); 11 | @EXPORT = qw(ceil floor qnorm uniq min max 12 | bsub qsub parallelize 13 | fisher_yates_shuffle histogram log2 log10 mean median medianabsdev quantile round stdev sum trimmed_mean trimmed_stdev vecsum zscore 14 | alifold com comRNA fold gen_all_nmers readFasta rev revCom revComRNA gcContent cpgContent 15 | intersect unique array_diff array_minus); 16 | 17 | #JOB SUBMISSION SUBROUTINES 18 | 19 | sub bsub{ 20 | ($options, $error, $output, $job) = @_; 21 | system "bsub $options -e $error -o $output <$job"; 22 | unlink 
$job; 23 | } 24 | 25 | sub qsub{ 26 | ($options, $error, $output, $job) = @_; 27 | system "qsub $options -S /bin/bash -e $error -o $output $job"; 28 | unlink $job; 29 | } 30 | 31 | sub parallelize{ 32 | %o = %{$_[0]}; 33 | ($cmd, $indir, $uniq, $instr, $outstr, $count, $jobnum) = ($o{"cmd"}, $o{"indir"}, $o{"uniq"}, $o{"instr"}, $o{"outstr"}, 0, 1000000); 34 | $outdir = $o{"outdir"} || "$TMP"; 35 | $skipuniq = $o{"skipuniq"} || ""; 36 | $exten = $o{"exten"} || "out"; 37 | $qsub = $o{"qsub"} || 0; 38 | $bsub = $o{"bsub"} || 0; 39 | $onefile = $o{"onefile"} || 0; 40 | $skipsame = $o{"skipsame"} || 0; 41 | $redir = ($onefile) ? ">>" : ">"; 42 | @files = <$indir/*.$uniq>; 43 | system "mkdir $outdir" if !(-d $outdir); 44 | 45 | if ($qsub || $bsub) { 46 | $cmdperjob = $o{"cmdperjob"} || 1; 47 | $subopts = $o{"subopts"} || "-q idle"; 48 | $suberr = $o{"suberr"} || "$TMP/sub.err"; 49 | $subout = $o{"subout"} || "$TMP/sub.out"; 50 | $jobfolder = $o{"jobfolder"} || "$TMP"; 51 | } 52 | 53 | foreach $file (@files){ 54 | ++$count; 55 | $code = (split /\.$uniq/, (split /\//, $file)[-1])[0]; 56 | # $fileexists = `grep -P '$code\\t' /lab/bartel3_ata/agarwal/metazoans/human.utrs/three_prime_UTR/bins.txt`; next if $fileexists; 57 | # $region = (split /\//, $file)[-2]; 58 | $code = "concat$jobnum" if $onefile; 59 | next if (($skipsame && -s "$outdir/$code.$exten" != 0) || ($skipuniq ne "" && $file =~ /$skipuniq/)); # -s if file has zero size 60 | if (!$qsub && !$bsub) { 61 | #print "$cmd $instr $file $outstr $outdir/$code.$exten\n"; 62 | #system "$cmd $instr $file $outstr $outdir/$code.$exten"; 63 | #$numfile = `grep '>' $file | wc -l`; 64 | $outfile = $code; 65 | $numfile1 = (-e "$outdir/$outfile.$exten") ? 
`wc -l $outdir/$outfile.aln` : 1; 66 | print "$cmd $outdir/$outfile.aln\n" if $numfile1 == 0; 67 | } 68 | else { 69 | if ($cmd =~ /bin_MSA/ && $cmd !~ /all/){ 70 | # ($bin) = (split /\s/, `grep -m1 -P '^$code\t' $DIR/targetpred/robin/3UTRs_nonredundant_18577genes.UTR_cons.10bins`)[-1]; die "getbin" if $bin != int($bin); # $bin++; --> do this if robin's 71 | ($tmp, $species, $region, $kmerlen) = (split /\s/, $cmd); 72 | ($bin) = (split /\s/, `grep -m1 -P '^$code\t' $DIR/metazoans/$species/$region/allgenes.bins`)[-1] if $kmerlen == 2 || $kmerlen == 8; #.23way 73 | # print "$code, $species, $region, $kmerlen, $bin\n"; 74 | } 75 | ## print "$cmd $bin $instr $file 2>&- $outstr $redir $outdir/$code.$exten\n"; 76 | $jobfile = "$jobfolder/job$jobnum.sh"; 77 | open SH, ">>$jobfile" or die "can't open $jobfile"; 78 | print SH "$cmd $bin $instr $file 2>&- $outstr $redir $outdir/$code.$exten\n"; 79 | close SH; 80 | if ($count % $cmdperjob == 0) { 81 | qsub($subopts, $suberr, $subout, $jobfile) if $qsub; 82 | bsub($subopts, $suberr, $subout, $jobfile) if $bsub; 83 | $jobnum++; 84 | } 85 | } 86 | } 87 | qsub($subopts, $suberr, $subout, $jobfile) if $qsub; 88 | bsub($subopts, $suberr, $subout, $jobfile) if $bsub; 89 | print STDERR "COMPLETE!\n"; 90 | } 91 | 92 | #STATISTICS SUBROUTINES 93 | 94 | sub fisher_yates_shuffle { 95 | local $x = shift; 96 | for ($i = @$x; --$i; ) { 97 | $j = int rand ($i+1); 98 | next if $i == $j; 99 | @$x[$i,$j] = @$x[$j,$i]; 100 | } 101 | } 102 | 103 | sub histogram { 104 | $bin_width = 10; 105 | if ($#_ == 1){ 106 | ($hash, $bin_width) = @_; 107 | } 108 | else { $hash = shift; } 109 | $max, $min; 110 | %a = %$hash; 111 | %histogram; 112 | foreach (keys %a){ 113 | $histogram{ceil(($_ + 1) / $bin_width) -1} += $a{$_}; 114 | } 115 | 116 | while ( ($key, $value) = each(%histogram) ) { 117 | $max = $key if !defined($min) || $key > $max; 118 | $min = $key if !defined($min) || $key < $min; 119 | } 120 | 121 | for ($i = $min; $i <= $max; $i++) { 122 | $bin = 
sprintf("% 10d", ($i) * $bin_width); 123 | $frequency = $histogram{$i} || 0; 124 | 125 | $frequency = "#" x $frequency; 126 | print $bin." ".$frequency."\n" if $frequency ne ""; 127 | } 128 | 129 | print "===============================\n\n"; 130 | print " Width: ".$bin_width."\n"; 131 | print " Range: ".$min."-".$max."\n\n"; 132 | } 133 | 134 | sub log2 { return log($_[0])/log(2); } 135 | 136 | sub log10 { return log($_[0])/log(10); } 137 | 138 | sub mean { return sum($_[0])/scalar(@{$_[0]}); } 139 | 140 | sub median{ return quantile($_[0], 2); } 141 | 142 | sub medianabsdev{ 143 | local $med = median($_[0]); local @b; 144 | push(@b, abs($_ - $med)) for (@{$_[0]}); 145 | return median(\@b); 146 | } 147 | 148 | sub quantile{ 149 | local $rpole = shift; 150 | local $x = shift; 151 | @pole = @$rpole; 152 | $ret; 153 | @pole = sort {$a <=> $b} @pole; 154 | if( ($#pole % $x) == 0 ) { 155 | $ret = $pole[int($#pole/$x)]; 156 | } else { 157 | $ret = ($pole[int($#pole/$x)] + $pole[int($#pole/$x)+1]) / 2; 158 | } 159 | return $ret; 160 | } 161 | 162 | sub round{ return int($_[0] + 0.5 * ($_[0] <=> 0)); } 163 | 164 | sub stdev{ 165 | return 0 unless @_ > 1; 166 | local $mean = mean(\@_); 167 | local $tot = 0; 168 | foreach (@_){ $tot += ($_ - $mean)**2; } 169 | return sqrt( $tot / $#_ ); 170 | } 171 | 172 | sub trimmed_mean { 173 | local $a = shift; local $perc = shift; 174 | $perc /= 200; 175 | @a = sort {$a <=> $b} @$a; $num = scalar(@a); 176 | @a = @a[int($num*$perc)..int($num*(1-$perc))]; 177 | return mean(\@a); 178 | } 179 | 180 | sub trimmed_stdev { 181 | local $a = shift; local $perc = shift; 182 | $perc /= 200; 183 | @a = sort {$a <=> $b} @$a; $num = scalar(@a); 184 | @a = @a[int($num*$perc)..int($num*(1-$perc))]; 185 | return stdev(@a); 186 | } 187 | 188 | sub sum{ 189 | local $sum; 190 | $sum += $_ for(@{$_[0]}); 191 | return $sum; 192 | } 193 | 194 | sub vecsum{ #sum two vectors passed as array refs 195 | $len = max(scalar(@{$_[0]}), scalar(@{$_[1]}))-1; 196 | for 
$i (0..$len){ ${$_[0]}[$i] += ${$_[1]}[$i]; } 197 | } 198 | 199 | sub zscore{ 200 | local $val = shift; 201 | return ( $val - mean(\@_) ) / stdev(@_); 202 | } 203 | 204 | 205 | #NUCLEIC ACID SEQUENCE SUBROUTINES 206 | 207 | sub alifold{ 208 | local $file = shift; 209 | local $o = "-d0 -r -cv 0.6 -nc 0.5"; 210 | $score = `RNAalifold $o $file | tail -1`; 211 | ($mfe) = ($score =~ /.*\(\s*(-\d+.\d+) = .*\).*/); 212 | return $mfe; 213 | } 214 | 215 | sub com{ 216 | local $seq = shift; 217 | $seq =~ tr/tucgaTUCGA/aagctAAGCT/; 218 | return $seq; 219 | } 220 | 221 | sub comRNA{ 222 | local $seq = shift; 223 | $seq =~ tr/tucgaTUCGA/aagcuAAGCU/; 224 | return $seq; 225 | } 226 | 227 | sub fold{ 228 | local $seq = shift; 229 | local $o = shift; 230 | local $score = `echo $seq | RNAfold $o | tail -1`; 231 | ($mfe) = ($score =~ /.*\((\s?-\d+.\d+)\).*/); 232 | return $mfe; 233 | } 234 | 235 | sub gen_all_nmers{ 236 | local $nmer_size = shift; 237 | local @words = ''; 238 | foreach (1..$nmer_size) { 239 | @new_words = (); 240 | foreach $word (@words){ 241 | foreach $i ( qw/A C T G/ ){ push (@new_words, $word.$i); } 242 | } 243 | @words = @new_words; 244 | } 245 | return @words; 246 | } 247 | 248 | sub readFasta{ 249 | local $fasta = shift; 250 | local %fasta = (); 251 | open DNA, "<$fasta" || die "Could not open fasta file for $fasta\n"; 252 | while ($line = ){ chomp $line; 253 | if ($line =~ /^>\s?(\w+\.?\d*)/){ $header = $1; } 254 | # if ($line =~ /^>(.*)/){ $header = $1; } 255 | else { $fasta{$header} .= $line; } 256 | } 257 | close DNA; 258 | return \%fasta; 259 | } 260 | 261 | sub rev{ return scalar reverse $_[0]; } 262 | 263 | sub revCom{ return rev(com($_[0])); } 264 | 265 | sub revComRNA{ return rev(comRNA($_[0])); } 266 | 267 | sub gcContent{ return 0 if (() = ($_[0] =~ /[AUTCG]/ig)) == 0; return sprintf("%.3f", (() = ($_[0] =~ /[CG]/ig))/(() = ($_[0] =~ /[AUTCG]/ig))); } # %G + C in seq, ignoring case, missing nucleotides, or gaps 268 | 269 | sub cpgContent{ return 0 if 
(() = ($_[0] =~ /[AUTCG]/ig)) == 0; return sprintf("%.3f", (() = ($_[0] =~ /CG/ig))/(length($_[0])-1)); } 270 | 271 | #ARRAY SUBROUTINES 272 | 273 | sub unique(@) { 274 | return keys %{ {map { $_ => undef } @_}}; 275 | } 276 | 277 | sub intersect(\@\@) { 278 | my %e = map { $_ => undef } @{$_[0]}; 279 | return grep { exists( $e{$_} ) } @{$_[1]}; 280 | } 281 | 282 | sub array_diff(\@\@) { 283 | my %e = map { $_ => undef } @{$_[1]}; 284 | return @{[ ( grep { (exists $e{$_}) ? ( delete $e{$_} ) : ( 1 ) } @{ $_[0] } ), keys %e ] }; 285 | } 286 | 287 | sub array_minus(\@\@) { 288 | my %e = map{ $_ => undef } @{$_[1]}; 289 | return grep( ! exists( $e{$_} ), @{$_[0]} ); 290 | } 291 | -------------------------------------------------------------------------------- /xpresso_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/xpresso_logo.png --------------------------------------------------------------------------------