├── .Rprofile ├── Fig1_S2 ├── 57epigenomes.median_expr.txt ├── Fig1B_S2B.R ├── Fig1C.R ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Xpresso.py ├── choose_reference_genes.pl ├── choose_reference_genes_forhg19.pl ├── extract_promoters.pl ├── geneName2Ensembl.pl ├── geneName2EnsemblMouse.pl ├── generate_training_input.pl ├── print_losses.py ├── process_RNAseq.R ├── process_RNAseq_mouse.R ├── runme.sh ├── setup_training_files.py └── tpe_1K_10epochs_optimized_0to20K.hyperopt ├── Fig2 ├── 1to1_orthologs_expression.txt ├── 57epigenomes.median_expr.txt ├── Fig2A.R ├── Fig2BC.R ├── Fig2D.R ├── Fig2E.pdf ├── Fig2EFG.R ├── Fig2F.pdf ├── Fig2G.pdf ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Xpresso.py ├── all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_mouse.txt ├── ensembl2geneName_v90_mm10.txt ├── human2mouse_one2one_orthologs.txt ├── human2mouse_orthologs.txt ├── mouse.median_expr.txt ├── ortholog_results ├── pM10Kb_1KTest ├── pM10Kb_1KTest_Mouse ├── pM10Kb_1KTest_one2oneOrthologs ├── runme.sh ├── setup_training_files.py ├── subsample.py ├── subsampled_10fold ├── subsampling_10fold.R └── tpe_1K_10epochs_optimized_0to20K.hyperopt ├── Fig3_S3 ├── 57epigenomes.RPKM.pc.gz ├── Boyer_et_al_PCG_repressed.txt ├── EnsemblID2GeneName.txt ├── Fig3ABCDEF_S3ABC.R ├── Fig3ABC_S3ABC.pdf ├── Fig3DEF_S3C.pdf ├── Fig3GH.R ├── Fig3GH.pdf ├── Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz ├── Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Ouyang_mESC_RPKM_ensemblID.txt ├── Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Schofield_K562_half_lives.txt ├── Summary_Counts.default_predictions.txt.gz ├── TableS1_human.txt ├── TableS1_mouse.txt ├── Whyte_et_al_superenhancers.txt ├── 
all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_GM12878.txt ├── all_crossvalidated_predictions_K562.txt ├── all_crossvalidated_predictions_mESC.txt ├── all_crossvalidated_predictions_mouse.txt ├── cross_valid ├── cross_valid_GM12878 ├── cross_valid_K562 ├── cross_valid_mESC ├── cross_valid_mouse ├── diHMM ├── ensembl2geneName_v90_mm10.txt ├── integrate_cv_results.R ├── mouse.median_expr.txt ├── mouseESC_GSE76288_miRNA_counts_Denzler.txt ├── pM10Kb_1KTest_GM12878expr_cv ├── pM10Kb_1KTest_K562expr_cv ├── pM10Kb_1KTest_mESCexpr_cv ├── pM10Kb_Mouse_cv ├── pM10Kb_cv ├── runme.sh └── setup_training_files.py ├── Fig4_S4 ├── 57epigenomes.RPKM.pc.gz ├── Fig4ABCD.R ├── Fig4ABCD.pdf ├── Fig4E.pdf ├── Fig4E_S4.R ├── FigS4A.pdf ├── FigS4B.R ├── FigS4B.pdf ├── FigS4B_2.pdf ├── FigS4C.R ├── FigS4C_human.pdf ├── FigS4C_mouse.pdf ├── GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal ├── GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal ├── JASPAR_CORE_2016_vertebrates.meme ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_GM12878.txt ├── all_crossvalidated_predictions_K562.txt ├── all_crossvalidated_predictions_mESC.txt ├── all_crossvalidated_predictions_mouse.txt ├── baseline_models.R ├── coefplot.r ├── gencode.v27lift37.basic.annotation.gtf.gz ├── hg19_promoters_cage_corrected_withChr.bed ├── hg19_promoters_cage_corrected_withChr_andOthers.bed ├── hg19_promoters_cage_corrected_withChr_andOthers_minus.bed ├── hg19_promoters_cage_corrected_withChr_andOthers_plus.bed ├── hg38ToHg19.over.chain ├── hg38_promoters_cage_corrected.bed ├── hg38_promoters_cage_corrected_withChr.bed ├── model_comparison.txt ├── model_comparison_Fig3.txt ├── promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz ├── promoters_pM1.5Kb.FIMO_scanned.txt.gz ├── promoters_pM1.5Kb.fa.gz ├── 
promoters_pM1.5Kb.firstOrderMarkov_background ├── promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz ├── promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz ├── promoters_pM1.5Kb.mouse.fa.gz ├── promoters_pM1.5Kb.mouse.firstOrderMarkov_background ├── runme.sh └── supplement_ids.pl ├── Fig5_S5 ├── hg19.chrom.sizes ├── human_trainepoch.11-0.426.h5 ├── mouse_trainepoch.05-0.278.h5 ├── predict_seqs.py ├── region.1Mb.bed ├── region.1Mb.intervals.100ntStep.Minus.bedGraph ├── region.1Mb.intervals.100ntStep.Minus.bw ├── region.1Mb.intervals.100ntStep.Plus.bedGraph ├── region.1Mb.intervals.100ntStep.Plus.bw ├── region.1Mb.intervals.100ntStep.bed ├── region.1Mb.intervals.100ntStep.input.txt.gz ├── region.1Mb.intervals.100ntStep.mouse.bed ├── region.1Mb.intervals.100ntStep.mouse.input.txt.gz ├── region.1Mb.intervals.100ntStep.mouse.minus.bedGraph ├── region.1Mb.intervals.100ntStep.mouse.minus.bw ├── region.1Mb.intervals.100ntStep.mouse.plus.bedGraph ├── region.1Mb.intervals.100ntStep.mouse.plus.bw ├── region.1Mb.mouse.bed ├── runme.sh └── tpe_1K_10epochs_optimized_0to20K.hyperopt ├── Fig6_S6_S7_S8 ├── Fig6B.pdf ├── Fig6C_S7_S8.R ├── FigS6.R ├── FigS7.pdf ├── FigS8.pdf ├── Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz ├── Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz ├── all_crossvalidated_predictions.txt ├── all_crossvalidated_predictions_mouse.txt ├── best_positions.R ├── cv_human ├── cv_mouse ├── deep_explain_cv.py ├── extract_kmer.pl ├── human_all_1mer_400K │ ├── dreme.html │ ├── dreme.txt │ └── dreme.xml ├── humangradinput.pdf ├── humanintgrad.pdf ├── motif_analysis ├── mouse_all_1mer_400K │ ├── dreme.html │ ├── dreme.txt │ └── dreme.xml ├── mousegradinput.pdf ├── mouseintgrad.pdf ├── pM10Kb_Mouse_cv ├── pM10Kb_cv └── runme.sh ├── FigS1 ├── 57epigenomes.RPKM.pc.gz ├── EG.name.txt ├── FigS1.R ├── FigS1.pdf └── runme.sh ├── LICENSE.txt ├── README.md ├── Xpresso.ipynb ├── allfxns.pm └── xpresso_logo.png /.Rprofile: 
-------------------------------------------------------------------------------- 1 | args = commandArgs(trailingOnly = T) 2 | 3 | head <- function(x, y = 5) { base::print(utils::head(x, y)) } 4 | say <- function(...) { base::print(paste(...)) } 5 | 6 | tryCatch({options(width = as.integer(Sys.getenv("COLUMNS")))}, error = function(err) {options(width=236)}) 7 | 8 | .Last <- function(){ 9 | if (!any(commandArgs() == '--no-readline') && interactive()) { 10 | require(utils) 11 | try(savehistory(".Rhistory")) 12 | } 13 | } 14 | 15 | error.bar <- function(x, y, upper, lower=upper, length=0.1,...){ 16 | if(length(x) != length(y) | length(y) !=length(lower) | length(lower) != length(upper)) 17 | stop("vectors must be same length") 18 | arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...) 19 | } 20 | 21 | writefile = function(obj, x, ...){ 22 | write.table(obj, file=x, quote=F, row.names=F, sep='\t', ...) 23 | } 24 | 25 | fastread = function(file, ...){ 26 | data.table::fread(file,data.table=F,sep="\t", ...) 
27 | } 28 | -------------------------------------------------------------------------------- /Fig1_S2/57epigenomes.median_expr.txt: -------------------------------------------------------------------------------- 1 | ../datasets/57epigenomes.median_expr.txt -------------------------------------------------------------------------------- /Fig1_S2/Fig1B_S2B.R: -------------------------------------------------------------------------------- 1 | library(latticeExtra) 2 | 3 | getresults = function(thisfile){ 4 | sites = read.table(text = system(paste("python print_losses.py", thisfile), intern=T), sep='\t') 5 | colnames(sites)=c("leftpos","rightpos","loss","params") 6 | print(sites[which(sites$loss == min(sites$loss)),]) 7 | say(nrow(sites), "trials") 8 | sites 9 | } 10 | 11 | c = getresults(args[1]) 12 | e = getresults(args[2]) 13 | 14 | pdf("Fig1B.pdf",width=5,height=4) 15 | plot(1:nrow(c), sapply(1:nrow(c), function(x) min(c[1:x, "loss"])), lwd=2, bty='n', col='red', type="l", 16 | xlim = c(0, 1000), ylim = c(0.4, 0.7), xlab="Number of iterations", ylab="Validation mean squared error, best model found") 17 | abline(h=0.479, lwd=2, lty=2, col='black') 18 | lines(1:nrow(e), sapply(1:nrow(e), function(x) min(e[1:x, "loss"])), lwd=2, col='purple') 19 | legend("topright", bg="white", bty="n", legend = c("Tree of Parzen estimators", "Simulated annealing","Best manually discovered, -1.5Kb to 1.5Kb"), 20 | text.col = c("red", "purple","black"), cex=0.8) 21 | dev.off() 22 | 23 | plotboundaries = function(a){ 24 | b=aggregate(a$loss, by=list(leftpos=a$leftpos, rightpos=a$rightpos), min) 25 | totsize = 10000 26 | pdf("FigS2B.pdf",width=5,height=6) 27 | layout(matrix(c(1,1,1,2), 1, 4)) 28 | par(mar = c(1, 1, 5, 1)) 29 | b$mycol = as.character("red") 30 | N=min(nrow(b),100) 31 | b=b[order(b$x, decreasing=T),] 32 | b=b[(nrow(b)-N+1):nrow(b),] 33 | b=rbind(c(8500,11500,0.479,"blue"), b) 34 | b$leftpos=as.integer(b$leftpos) - 10000 35 | b$rightpos=as.integer(b$rightpos) - 10000 36 | 
plot(1:(N+1),xlim=c(-totsize,totsize), type="n", cex.lab = 2, bty="n", yaxt='n', xaxt='n') 37 | axis(3, at=seq(-totsize,totsize,totsize/5)) 38 | mtext("Position relative to TSS", side=3, line=3) 39 | for(x in 1:(N+1)) lines(c(b$leftpos[x],b$rightpos[x]), c(x,x), col=b$mycol[x], type="l", lty=1, lwd=2) 40 | abline(v=0, lwd=2, col='black') 41 | 42 | plot(1:(N+1),xlim=c(0.4,0.48), type="n", cex.lab = 2, bty="n", yaxt='n', xaxt='n') 43 | axis(3, at=seq(0.4,0.48,0.02)) 44 | mtext("Validation MSE", side=3, line=3) 45 | for(x in 1:(N+1)) lines(c(0,b$x[x]), c(x,x), col="grey", type="l", lty=1, lwd=2) 46 | abline(v=0.479, lwd=2, lty=2, col='blue') 47 | dev.off() 48 | } 49 | 50 | plotboundaries(c) -------------------------------------------------------------------------------- /Fig1_S2/Fig1C.R: -------------------------------------------------------------------------------- 1 | library(latticeExtra) 2 | 3 | crp.rg <- colorRampPalette(c("red","orange","green","cyan","blue","purple","magenta")) 4 | cols <- sample(crp.rg(10)) 5 | 6 | plotresults = function(dir){ 7 | files = paste(list.files(path=dir, pattern='.txt', full.names=T)) 8 | pdf("Fig1C.pdf",width=5,height=4) 9 | plot(0, lwd=1, bty='n', type="l", xlim = c(0, 25), ylim = c(0.4, 1), xlab="Epoch", ylab="Validation MSE", las = 1) 10 | abline(h=0.479, lwd=1, lty=2, col='black') 11 | lapply(1:length(files), FUN=function(i){ 12 | file = files[i] 13 | cmd = paste("grep val_loss", file, " | perl -ne 'chomp; ($mse) = ($_ =~ /val_loss: (\\d+.\\d+)/); print \"$mse \";'") 14 | sites = unlist(strsplit(system(cmd, intern=T), "\\s+")) 15 | sites = as.numeric(sites[2:length(sites)]) 16 | lines(1:length(sites), sites, lwd=2, col=cols[i]) 17 | }) 18 | dev.off() 19 | } 20 | 21 | c = plotresults(args[1]) -------------------------------------------------------------------------------- /Fig1_S2/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | 
../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig1_S2/Xpresso.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py, pickle 2 | import pandas as pd 3 | from optparse import OptionParser 4 | from scipy import stats 5 | import tensorflow as tf 6 | from tensorflow import keras 7 | from keras.optimizers import Adam 8 | from keras.models import Model, load_model 9 | from keras.layers import * 10 | from keras.metrics import * 11 | from keras.utils import plot_model 12 | from keras import backend as K 13 | from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping 14 | from hyperopt import fmin, tpe, rand, anneal, hp, STATUS_OK, STATUS_FAIL, Trials, mix, partial, space_eval 15 | 16 | global X_trainhalflife, X_trainpromoter, y_train, geneName_train, X_validhalflife, X_validpromoter, y_valid, geneName_valid, X_testhalflife, X_testpromoter, y_test, geneName_test, params 17 | 18 | def main(): 19 | usage = 'usage: %prog [options] ' 20 | parser = OptionParser(usage) 21 | parser.add_option('-c', dest='counts', default=0, type='int', help='Number of training counts to subsample [Default: %default]') 22 | parser.add_option('--bestmanual', dest='bestmanual', default=False, action='store_true', help='Try best manually identified model % [Default: %default]') 23 | parser.add_option('--fold', dest='cvfold', default='', type='string', help='Which of the 10 folds of cross-validation to use % [Default: %default]') 24 | parser.add_option('--trial', dest='trial', default='', type='string', help='Trial number % [Default: %default]') 25 | parser.add_option('--usemodel', dest='usemodel', default=None, type='string', help='Use pre-trained model % [Default: %default]') 26 | (options,args) = parser.parse_args() 27 | 28 | if len(args) != 3: 29 | print (args) 30 | parser.error('Must provide mode (tune, train, or test), hyperparameter 
database file, and database directory') 31 | else: 32 | mode = args[0] 33 | database = args[1] 34 | datadir = args[2] 35 | 36 | global X_trainhalflife, X_trainpromoter, y_train, geneName_train, X_validhalflife, X_validpromoter, y_valid, geneName_valid, X_testhalflife, X_testpromoter, y_test, geneName_test, params 37 | params['datadir'] = datadir 38 | if not options.usemodel: 39 | trainfile = h5py.File(os.path.join(datadir, options.cvfold+'train.h5'), 'r') #_mouse1to1 40 | X_trainhalflife, X_trainpromoter, y_train, geneName_train = trainfile['data'], trainfile['promoter'], trainfile['label'], trainfile['geneName'] 41 | validfile = h5py.File(os.path.join(datadir, options.cvfold+'valid.h5'), 'r') #_mouse1to1 42 | X_validhalflife, X_validpromoter, y_valid, geneName_valid = validfile['data'], validfile['promoter'], validfile['label'], validfile['geneName'] 43 | 44 | if mode == "tune": 45 | while True: # loop indefinitely and stop whenever you like 46 | run_trials(database) 47 | else: 48 | testfile = h5py.File(os.path.join(datadir, options.cvfold+'test.h5'), 'r') #_mouse1to1_human1to1 49 | X_testhalflife, X_testpromoter, y_test, geneName_test = testfile['data'], testfile['promoter'], testfile['label'], testfile['geneName'] 50 | if options.bestmanual: 51 | params = { 'datadir' : datadir, 'batchsize' : 2**6, 'leftpos' : 8500, 'rightpos' : 11500, 'activationFxn' : 'relu', 'numFiltersConv1' : 2**6, 'filterLenConv1' : 5, 'dilRate1' : 1, 52 | 'maxPool1' : 10, 'numconvlayers' : { 'numFiltersConv2' : 2**6, 'filterLenConv2' : 5, 'dilRate2' : 1, 'maxPool2' : 20, 'numconvlayers1' : { 'numconvlayers2' : 'two' } }, 53 | 'dense1' : 100, 'dropout1' : 0.5, 'numdenselayers' : { 'layers' : 'one' } } 54 | print("Using best human-identified parameters") 55 | else: 56 | trials = pickle.load(open(database, "rb")) 57 | best = trials.argmin 58 | params = space_eval(params, best) 59 | print("Found saved Trials!") 60 | print ("The best parameters are:") 61 | print (params) 62 | 
params['subsample'] = options.counts 63 | params['cvfold'] = options.cvfold 64 | params['trial'] = options.trial 65 | params['usemodel'] = options.usemodel 66 | params['tuneMode'] = 0 #enable mode that trains best model structure over up to 100 epochs, and evaluates final model on test set 67 | results = objective(params) 68 | print("Best Validation MSE = %.3f" % results['loss']) 69 | 70 | params = { 71 | 'tuneMode' : 1, 72 | 'batchsize' : 2**hp.quniform('batchsize', 5, 7, 1), 73 | 'leftpos' : hp.quniform('leftpos', 0, 10000, 500), 74 | 'rightpos' : hp.quniform('rightpos', 10000, 20000, 500), 75 | 'activationFxn' : 'relu', #hp.choice('activationFxn', ['relu', 'elu', 'selu', 'LeakyReLU', 'PReLU']) -- tried but none worked better than simply relu 76 | 'numFiltersConv1' : 2**hp.quniform('numFiltersConv1', 4, 7, 1), 77 | 'filterLenConv1' : hp.quniform('filterLenConv1', 1, 10, 1), 78 | 'dilRate1' : hp.quniform('dilRate1', 1, 4, 1), 79 | 'maxPool1' : hp.quniform('maxPool1', 5, 100, 5), 80 | 'numconvlayers' : hp.choice('numconvlayers', [ 81 | { 82 | 'numconvlayers1' : 'one' 83 | }, 84 | { 85 | 'numFiltersConv2' : 2**hp.quniform('numFiltersConv2', 4, 7, 1), 86 | 'filterLenConv2' : hp.quniform('filterLenConv2', 1, 10, 1), 87 | 'dilRate2' : hp.quniform('dilRate2', 1, 4, 1), 88 | 'maxPool2' : hp.quniform('maxPool2', 5, 100, 5), 89 | 'numconvlayers1' : hp.choice('numconvlayers1', [ 90 | { 91 | 'numconvlayers2' : 'two' 92 | }, 93 | { 94 | 'numFiltersConv3' : 2**hp.quniform('numFiltersConv3', 4, 7, 1), 95 | 'filterLenConv3' : hp.quniform('filterLenConv3', 1, 10, 1), 96 | 'dilRate3' : hp.quniform('dilRate3', 1, 4, 1), 97 | 'maxPool3' : hp.quniform('maxPool3', 5, 100, 5), 98 | 'numconvlayers2' : hp.choice('numconvlayers2', [ 99 | { 100 | 'numconvlayers3' : 'three' 101 | }, 102 | { 103 | 'numFiltersConv4' : 2**hp.quniform('numFiltersConv4', 4, 7, 1), 104 | 'filterLenConv4' : hp.quniform('filterLenConv4', 1, 10, 1), 105 | 'dilRate4' : hp.quniform('dilRate4', 1, 4, 1), 106 | 
'maxPool4' : hp.quniform('maxPool4', 5, 100, 5), 107 | 'numconvlayers3' : 'four' 108 | }]) 109 | }]) 110 | }]), 111 | 'dense1' : 2**hp.quniform('dense1', 1, 8, 1), 112 | 'dropout1' : hp.uniform('dropout1', 0, 1), 113 | 'numdenselayers' : hp.choice('numdenselayers', [ 114 | { 115 | 'layers' : 'one' 116 | }, 117 | { 118 | 'layers' : 'two' , 119 | 'dense2' : 2**hp.quniform('dense2', 1, 8, 1), 120 | 'dropout2' : hp.uniform('dropout2', 0, 1) 121 | } 122 | ]) 123 | } 124 | 125 | def run_trials(database): 126 | trials_step = 5 # how many additional trials to do after loading saved trials 127 | max_trials = 5 # initial max_trials. put something small to not have to wait 128 | 129 | try: # try to load an already saved trials object, and increase the max 130 | trials = pickle.load(open(database, "rb")) 131 | print("Found saved Trials! Loading...") 132 | max_trials = len(trials.trials) + trials_step 133 | print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step)) 134 | except: # create a new trials object and start searching 135 | trials = Trials() 136 | 137 | best = fmin(objective, params, max_evals = max_trials, trials = trials, 138 | algo = anneal.suggest) 139 | # algo = rand.suggest) 140 | # algo = tpe.suggest) 141 | # algo = partial(mix.suggest, p_suggest=[(0.2, rand.suggest),(0.6, tpe.suggest),(0.2, anneal.suggest)])) 142 | 143 | ##### sample random parameter sets and print 144 | # import hyperopt.pyll.stochastic 145 | # print (hyperopt.pyll.stochastic.sample(params)) 146 | 147 | print( "Best:", best) 148 | # save the trials object 149 | with open(database, "wb") as f: 150 | pickle.dump(trials, f) 151 | 152 | def objective(params): 153 | leftpos = int(params['leftpos']) 154 | rightpos = int(params['rightpos']) 155 | activationFxn = params['activationFxn'] 156 | if not params['usemodel']: 157 | global X_trainhalflife, y_train 158 | X_trainpromoterSubseq = X_trainpromoter[:,leftpos:rightpos,:] 159 | X_validpromoterSubseq = 
X_validpromoter[:,leftpos:rightpos,:] 160 | halflifedata = Input(shape=(X_trainhalflife.shape[1:]), name='halflife') 161 | input_promoter = Input(shape=X_trainpromoterSubseq.shape[1:], name='promoter') 162 | 163 | try: 164 | # if True: 165 | mse = 1 166 | if params['usemodel']: 167 | model = load_model(params['usemodel']) 168 | print('Loaded results from:', params['usemodel']) 169 | else: 170 | x = Conv1D(int(params['numFiltersConv1']), int(params['filterLenConv1']), dilation_rate=int(params['dilRate1']), padding='same', kernel_initializer='glorot_normal', input_shape=X_trainpromoterSubseq.shape[1:],activation=activationFxn)(input_promoter) 171 | x = MaxPooling1D(int(params['maxPool1']))(x) 172 | 173 | if params['numconvlayers']['numconvlayers1'] != 'one': 174 | maxPool2 = int(params['numconvlayers']['maxPool2']) 175 | x = Conv1D(int(params['numconvlayers']['numFiltersConv2']), int(params['numconvlayers']['filterLenConv2']), dilation_rate=int(params['numconvlayers']['dilRate2']), padding='same', kernel_initializer='glorot_normal',activation=activationFxn)(x) #[2, 3, 4, 5, 6, 7, 8, 9, 10] 176 | x = MaxPooling1D(maxPool2)(x) 177 | if params['numconvlayers']['numconvlayers1']['numconvlayers2'] != 'two': 178 | maxPool3 = int(params['numconvlayers']['numconvlayers1']['maxPool3']) 179 | x = Conv1D(int(params['numconvlayers']['numconvlayers1']['numFiltersConv3']), int(params['numconvlayers']['numconvlayers1']['filterLenConv3']), dilation_rate=int(params['numconvlayers']['numconvlayers1']['dilRate3']), padding='same', kernel_initializer='glorot_normal',activation=activationFxn)(x) #[2, 3, 4, 5] 180 | x = MaxPooling1D(maxPool3)(x) 181 | if params['numconvlayers']['numconvlayers1']['numconvlayers2']['numconvlayers3'] != 'three': 182 | maxPool4 = int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['maxPool4']) 183 | x = Conv1D(int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['numFiltersConv4']), 
int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['filterLenConv4']), dilation_rate=int(params['numconvlayers']['numconvlayers1']['numconvlayers2']['dilRate4']), padding='same', kernel_initializer='glorot_normal',activation=activationFxn)(x) #[2, 3, 4, 5] 184 | x = MaxPooling1D(maxPool4)(x) 185 | 186 | x = Flatten()(x) 187 | x = Concatenate()([x, halflifedata]) 188 | x = Dense(int(params['dense1']))(x) 189 | x = Activation(activationFxn)(x) 190 | x = Dropout(params['dropout1'])(x) 191 | if params['numdenselayers']['layers'] == 'two': 192 | x = Dense(int(params['numdenselayers']['dense2']))(x) 193 | x = Activation(activationFxn)(x) 194 | x = Dropout(params['numdenselayers']['dropout2'])(x) 195 | main_output = Dense(1)(x) 196 | model = Model(inputs=[input_promoter, halflifedata], outputs=[main_output]) 197 | model.compile(Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),'mean_squared_error', metrics=['mean_squared_error']) 198 | 199 | if params['tuneMode']: 200 | result = model.fit([X_trainpromoterSubseq, X_trainhalflife], y_train, batch_size=int(params['batchsize']), shuffle="batch", epochs=10, 201 | validation_data=[[X_validpromoterSubseq, X_validhalflife], y_valid]) 202 | mse = min(result.history['val_mean_squared_error']) 203 | print("leftpos, rightpos, mse") 204 | print(leftpos, rightpos, mse) 205 | else: 206 | print(model.summary()) 207 | plot_model(model, to_file=os.path.join(params['datadir'], 'best_model.png')) #requires Pydot/Graphviz to generate graph of network 208 | X_testpromoterSubseq = X_testpromoter[:,leftpos:rightpos,:] 209 | if not params['usemodel']: 210 | if params['subsample'] > 0: 211 | X_trainpromoterSubseq = X_trainpromoterSubseq[0:params['subsample'],:,:] 212 | X_trainhalflife = X_trainhalflife[0:params['subsample'],:] 213 | y_train = y_train[0:params['subsample']] 214 | check_cb = ModelCheckpoint(os.path.join(params['datadir'], params['trial']+params['cvfold']+'trainepoch.{epoch:02d}-{val_loss:.4f}.h5'), 
monitor='val_loss', verbose=1, save_best_only=True, mode='min') 215 | earlystop_cb = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min') 216 | result = model.fit([X_trainpromoterSubseq, X_trainhalflife], y_train, batch_size=int(params['batchsize']), shuffle="batch", epochs=100, 217 | validation_data=[[X_validpromoterSubseq, X_validhalflife], y_valid], callbacks=[earlystop_cb, check_cb]) 218 | mse_history = result.history['val_mean_squared_error'] 219 | mse = min(mse_history) 220 | best_file = os.path.join(params['datadir'], params['trial']+params['cvfold']+'trainepoch.%02d-%.4f.h5' % (mse_history.index(mse), mse)) 221 | model = load_model(best_file) 222 | print('Loaded results from:', best_file) 223 | 224 | predictions_test = model.predict([X_testpromoterSubseq, X_testhalflife], batch_size=20).flatten() 225 | slope, intercept, r_value, p_value, std_err = stats.linregress(predictions_test, y_test) 226 | print('Test R^2 = %.3f' % r_value**2) 227 | df = pd.DataFrame(np.column_stack((geneName_test, predictions_test, y_test)), columns=['Gene','Pred','Actual']) 228 | df.to_csv(os.path.join(params['datadir'], params['trial']+params['cvfold']+'predictions.txt'), index=False, header=True, sep='\t') 229 | 230 | return {'loss': mse, 'status': STATUS_OK } 231 | 232 | except: 233 | return {'loss': 1, 'status': STATUS_FAIL } # loss = 1 indicates a poor-performing model; reason model might fail include: incompatible parameters or insufficient memory resources available 234 | 235 | if __name__ == '__main__': 236 | main() 237 | -------------------------------------------------------------------------------- /Fig1_S2/choose_reference_genes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | $file = shift; 4 | 5 | open IN, "zgrep -P '\tfive_prime_utr|three_prime_utr|CDS\t' $file | "; 6 | while(){ 7 | ($region, $start, $stop, $last) = (split /\t/)[2,3,4,-1]; 8 | ($parent, $id) = ($last =~ /gene_id 
"(ENS\w*GR?[\d|\.]+)"; gene_version "\d+"; transcript_id "(ENS\w*TR?[\d|\.]+)";/); 9 | $lengths{$id}{$region} += ($stop-$start); 10 | } 11 | close IN; 12 | 13 | open IN, "zgrep -P '\tCDS\t' $file | "; 14 | while(){ 15 | ($parent, $id) = ($_ =~ /gene_id "(ENS\w*GR?[\d|\.]+)"; gene_version "\d+"; transcript_id "(ENS\w*TR?[\d|\.]+)";/); 16 | $reptranscript{$parent} = $id if (! defined $reptranscript{$parent}); 17 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} > $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"five_prime_utr"} > 0); 18 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} >= $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"five_prime_utr"} > $lengths{$reptranscript{$parent}}{"five_prime_utr"}); 19 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} >= $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"five_prime_utr"} >= $lengths{$reptranscript{$parent}}{"five_prime_utr"} && $lengths{$id}{"three_prime_utr"} > $lengths{$reptranscript{$parent}}{"three_prime_utr"}); 20 | } 21 | close IN; 22 | 23 | %okids = map { $_ => 1 } values %reptranscript; 24 | foreach (keys %reptranscript){ 25 | $repid{$reptranscript{$_}} = $_; 26 | } 27 | 28 | open IN, "zcat $file |"; 29 | while(){ 30 | ($chr, $region, $start, $stop, $str, $last) = (split /\t/)[0,2,3,4,6,-1]; 31 | ($parent, $id) = ($last =~ /gene_id "(ENS\w*GR?[\d|\.]+)"; gene_version "\d+"; transcript_id "(ENS\w*TR?[\d|\.]+)";/); 32 | $repid = $repid{$id}; 33 | if ($okids{$id}){ 34 | @a = split /\t/, $_; 35 | $a[-1] = "$parent"; 36 | $_ = join ("\t", @a)."\n"; 37 | print "$_" if $chr =~ /^\d+|^X/; #keep non-chrY genes only 38 | } 39 | } 40 | close IN; -------------------------------------------------------------------------------- /Fig1_S2/choose_reference_genes_forhg19.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | $file = shift; 4 | 5 | open IN, "zgrep -P '\tUTR|CDS\t' $file | "; 6 | while(){ 7 | 
($region, $start, $stop, $last) = (split /\t/)[2,3,4,-1]; 8 | ($parent, $id) = ($last =~ /gene_id "(ENS\w*GR?[\d|\.]+.*)"; transcript_id "(ENS\w*TR?[\d|\.]+.*)"; gene_type/); 9 | $lengths{$id}{$region} += ($stop-$start); 10 | } 11 | close IN; 12 | 13 | open IN, "zgrep -P '\tCDS\t' $file | "; 14 | while(){ 15 | ($parent, $id) = ($_ =~ /gene_id "(ENS\w*GR?[\d|\.]+.*)"; transcript_id "(ENS\w*TR?[\d|\.]+.*)"; gene_type/); 16 | $reptranscript{$parent} = $id if (! defined $reptranscript{$parent}); 17 | $reptranscript{$parent} = $id if ($lengths{$id}{"CDS"} > $lengths{$reptranscript{$parent}}{"CDS"} && $lengths{$id}{"UTR"} > 0); 18 | } 19 | close IN; 20 | 21 | %okids = map { $_ => 1 } values %reptranscript; 22 | foreach (keys %reptranscript){ 23 | $repid{$reptranscript{$_}} = $_; 24 | } 25 | 26 | open IN, "zcat $file |"; 27 | while(){ 28 | ($chr, $region, $start, $stop, $str, $last) = (split /\t/)[0,2,3,4,6,-1]; 29 | ($parent, $id) = ($last =~ /gene_id "(ENS\w*GR?[\d|\.]+.*)"; transcript_id "(ENS\w*TR?[\d|\.]+.*)"; gene_type/); 30 | $repid = $repid{$id}; 31 | if ($okids{$id}){ 32 | @a = split /\t/, $_; 33 | $a[-1] = "$parent"; 34 | $_ = join ("\t", @a)."\n"; 35 | print "$_"; 36 | } 37 | } 38 | close IN; -------------------------------------------------------------------------------- /Fig1_S2/extract_promoters.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | $file = shift; 4 | $spec = shift; 5 | 6 | $dist = 10000; 7 | 8 | open IN, "zgrep -P '\texon\t' $file | "; 9 | while(){ 10 | ($chr, $region, $start, $stop, $str, $last) = (split /\t/)[0,2,3,4,6,-1]; 11 | ($parent) = ($last =~ /(ENS\w*G\d+)/); 12 | $has5pUTR{$id} = 1; 13 | if($str eq '+'){ 14 | $allregions{$parent} = $start if (! exists $allregions{$parent} || $start < $allregions{$parent}); 15 | } 16 | else{ 17 | $allregions{$parent} = $stop if (! 
exists $allregions{$parent} || $stop > $allregions{$parent}); 18 | } 19 | } 20 | close IN; 21 | 22 | open IN, "zgrep -P '\texon\t' $file | "; 23 | while(){ 24 | ($start, $stop, $str, $last) = (split /\t/)[3,4,6,-1]; 25 | ($parent) = ($last =~ /(ENS\w*G\d+)/); 26 | next if $seenids{$parent}; 27 | next if $str eq '+' && $allregions{$parent} != $start; 28 | next if $str eq '-' && $allregions{$parent} != $stop; 29 | @a = split /\t/, $_; 30 | $a[-1] = $parent; 31 | $a[2] = $parent; 32 | $a[3] = $allregions{$parent} - $dist; 33 | $a[4] = $allregions{$parent} + $dist; 34 | if ($spec eq "mouse"){ print join("\t", 'chr'.$a[0], $a[3], $a[4], $a[-1], 0, $str), "\n" if $a[3] > 0; } 35 | else { print join("\t", $a[0], $a[3], $a[4], $a[-1], 0, $str), "\n" if $a[3] > 0; } 36 | $seenids{$parent} = 1; 37 | } 38 | close IN; -------------------------------------------------------------------------------- /Fig1_S2/geneName2Ensembl.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | open IN, "){ chomp; 5 | @a = split /\t/, $_; 6 | $id2parent{$a[1]} = $a[0] if $a[2] =~ /^\d+|^X/; #remove haplotypes when considering Ensembl ID 7 | } 8 | close IN; 9 | 10 | open IN, "){ chomp; 12 | @a = split /\t/, $_; 13 | @b = split /, /, $a[1]; 14 | @c = split /, /, $a[2]; 15 | $hgnc2parent{$a[0]} = $a[3]; 16 | foreach $i (@b){ $hgnc2parent{$i} = $a[3]; } 17 | foreach $i (@c){ $hgnc2parent{$i} = $a[3]; } 18 | } 19 | close IN; 20 | 21 | open IN, "hg38_cage_promoters_ensemblID.bed"; 23 | while(){ chomp; @a=split; $id = $a[3]; 24 | @ids = split /,/, $id; 25 | foreach $id (@ids){ 26 | ($promoter, $gene) = split /\@/, $id; 27 | $a[3] = $id2parent{$gene}; 28 | $a[3] = $hgnc2parent{$gene} if $a[3] eq ''; 29 | print OUT join("\t", @a),"\n" if $promoter eq 'p1' && $a[3] ne ''; 30 | } 31 | } 32 | close IN; 33 | close OUT; -------------------------------------------------------------------------------- /Fig1_S2/geneName2EnsemblMouse.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | open IN, "){ chomp; 5 | @a = split /\t/, $_; 6 | $id2parent{$a[1]} = $a[0] if $a[2] =~ /^\d+|^X/; #remove haplotypes when considering Ensembl ID 7 | } 8 | close IN; 9 | 10 | open IN, "cut -f 2,10 MGI_EntrezGene.rpt | "; 11 | while(){ chomp; 12 | @a = split /\t/, $_; 13 | $mgi2synonym{$a[0]} = $a[1]; 14 | } 15 | close IN; 16 | 17 | open IN, "cut -f 1,3 ensembl2entrezID_v90_mm10.txt | "; 18 | while(){ chomp; 19 | @a = split /\t/, $_; 20 | $entrez2ensembl{$a[1]} = $a[0] if $a[1] ne ''; 21 | } 22 | close IN; 23 | 24 | open IN, "cut -f 3,11 MGI_Gene_Model_Coord.rpt | "; 25 | while(){ chomp; 26 | @a = split /\t/, $_; 27 | $mgi2parent{$a[0]} = $a[1]; 28 | @b = split /\|/, $mgi2synonym{$a[0]}; 29 | foreach $i (@b){ $mgi2parent{$i} = $a[1]; } 30 | } 31 | close IN; 32 | 33 | open IN, "cut -f 3,11 MGI_Gene_Model_Coord.rpt | "; 34 | while(){ chomp; 35 | @a = split /\t/, $_; 36 | $mgi2parent{$a[0]} = $a[1]; 37 | } 38 | close IN; 39 | 40 | open IN, "Ouyang_mESC_RPKM_ensemblID.txt"; 42 | while(){ chomp; @a=split; $id = $a[0]; 43 | $id = $entrez2ensembl{$a[0]}; 44 | $id = $id2parent{$a[1]} if $id eq ''; 45 | print OUT join("\t", $id, $a[2]),"\n" if $id ne '' && !$seenid{$id}; 46 | print STDERR $_,"\n" if $id eq '' || $seenid{$id}; 47 | $seenid{$id} = 1; 48 | } 49 | close IN; 50 | close OUT; 51 | 52 | open IN, "mm10_cage_promoters_ensemblID.bed"; 54 | while(){ chomp; @a=split; $id = $a[3]; 55 | @ids = split /,/, $id; 56 | foreach $id (@ids){ 57 | ($promoter, $gene) = split /\@/, $id; 58 | $a[3] = $id2parent{$gene}; 59 | $a[3] = $mgi2parent{$gene} if $a[3] eq ''; 60 | print OUT join("\t", @a),"\n" if $promoter eq 'p1' && $a[3] =~ /ENS/ && !$seenid{$a[3]}; 61 | $seenid{$a[3]} = 1; 62 | } 63 | } 64 | close IN; 65 | close OUT; -------------------------------------------------------------------------------- /Fig1_S2/generate_training_input.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use allfxns; 4 | 5 | $exprMat = shift; 6 | $spec = ($exprMat =~ /mouse/)? 1 : 0; #species is mouse or human? 7 | 8 | sub readBed{ 9 | local $bed = shift; 10 | local %bed = (); 11 | open BEDENTRY, "<$bed" || die "Could not open bed file for $bed\n"; 12 | while ($line = ){ 13 | @a = split /\t/, $line; 14 | $bed{$a[3]} = $line; 15 | } 16 | close BEDENTRY; 17 | return \%bed; 18 | } 19 | 20 | sub readFasta{ 21 | local $fasta = shift; 22 | local %fasta = (); 23 | open DNA, "zcat $fasta | " || die "Could not open fasta file for $fasta\n"; 24 | while ($line = ){ chomp $line; 25 | if ($line =~ /^>\s?(\w+\d)\.?\d*/){ $header = $1; } 26 | else { $fasta{$header} .= $line; } 27 | } 28 | close DNA; 29 | return \%fasta; 30 | } 31 | 32 | if ($spec){ 33 | open IN, "zcat Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz | "; 34 | } 35 | else{ 36 | open IN, "zcat Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz | "; 37 | } 38 | while(){ 39 | ($region, $start, $stop, $last) = (split /\t/)[2,3,4,-1]; 40 | ($id) = ($last =~ /(ENS\w*GR?[\d|\.]+)/); 41 | $lengths{$id}{$region} += ($stop-$start); 42 | $cdsexoncount{$id}++ if $region eq 'CDS'; 43 | } 44 | close IN; 45 | 46 | if ($spec){ 47 | %promoterbed = %{ readBed("mm10_promoters.bed") }; 48 | %promoters = %{ readFasta("mm10_promoters.fa.gz") }; 49 | %fantombed = %{ readBed("mm10_cage_promoters_ensemblID.bed") }; 50 | %fantompromoters = %{ readFasta("mm10_cage_promoters_ensemblID.fa.gz") }; 51 | %utr5p = %{ readFasta("mm10_ensembl90_5utrs.fa.gz") }; 52 | %orfs = %{ readFasta("mm10_ensembl90_orfs.fa.gz") }; 53 | %utr3p = %{ readFasta("mm10_ensembl90_3utrs.fa.gz") }; 54 | } 55 | else{ 56 | %promoterbed = %{ readBed("hg38_promoters.bed") }; 57 | %fantombed = %{ readBed("hg38_cage_promoters_ensemblID.bed") }; 58 | %promoters = %{ readFasta("hg38_promoters.fa.gz") }; 59 | %fantompromoters = %{ readFasta("hg38_cage_promoters_ensemblID.fa.gz") 
}; 60 | %utr5p = %{ readFasta("hg38_ensembl90_5utrs.fa.gz") }; 61 | %orfs = %{ readFasta("hg38_ensembl90_orfs.fa.gz") }; 62 | %utr3p = %{ readFasta("hg38_ensembl90_3utrs.fa.gz") }; 63 | } 64 | 65 | if ($spec){ 66 | open BED, ">mm10_promoters_cage_corrected.bed"; 67 | } 68 | else{ 69 | open BED, ">hg38_promoters_cage_corrected.bed"; 70 | } 71 | print join("\t", "ENSID", "EXPRESSION", "UTR5LEN", "CDSLEN", "INTRONLEN", "UTR3LEN", "UTR5GC", "CDSGC", "UTR3GC", "ORFEXONDENSITY", "PROMOTER"), "\n"; 72 | open IN, "<$exprMat"; 73 | while(){ chomp; 74 | @a=split /\t/; 75 | $id = $a[0]; 76 | if (($promoters{$id} ne '' || $fantompromoters{$id} ne '') && $lengths{$id}{"CDS"} > 0){ 77 | $promoter = $promoters{$id}; 78 | $promoterbed = $promoterbed{$id}; 79 | if ($fantompromoters{$id} ne ''){ 80 | $promoter = $fantompromoters{$id}; 81 | $promoterbed = $fantombed{$id}; 82 | } 83 | print BED $promoterbed; 84 | print join("\t", $id, $a[1], int($lengths{$id}{"five_prime_utr"}), int($lengths{$id}{"CDS"}), 85 | int($lengths{$id}{"transcript"})-(int($lengths{$id}{"three_prime_utr"})+int($lengths{$id}{"CDS"})+int($lengths{$id}{"five_prime_utr"})), 86 | int($lengths{$id}{"three_prime_utr"}), gcContent($utr5p{$id}), gcContent($orfs{$id}), gcContent($utr3p{$id}), 87 | sprintf("%.2f", $cdsexoncount{$id}*1000/$lengths{$id}{"CDS"}), $promoter), "\n"; 88 | } 89 | else{ $count++; } 90 | } 91 | close BED; 92 | 93 | print STDERR "$count IDs missing/revised due to annotation version changes\n"; -------------------------------------------------------------------------------- /Fig1_S2/print_losses.py: -------------------------------------------------------------------------------- 1 | import pickle, sys 2 | 3 | file = sys.argv[1] 4 | trials = pickle.load(open(file, "rb")) 5 | alldicts = trials.trials 6 | loss = trials.losses() 7 | 8 | for i in range(len(loss)): 9 | 
print(str(int(alldicts[i]['misc']['vals']['leftpos'][0]))+'\t'+str(int(alldicts[i]['misc']['vals']['rightpos'][0]))+'\t'+str(loss[i])+'\t'+str(alldicts[i]['misc']['vals'])) 10 | -------------------------------------------------------------------------------- /Fig1_S2/process_RNAseq.R: -------------------------------------------------------------------------------- 1 | a=read.delim(gzfile("57epigenomes.RPKM.pc.gz")) 2 | a$median=apply(a[,3:ncol(a)],1,median) 3 | write.table(a[,c("gene_id","median")],quote=F,row.names=F,col.names=F,sep="\t", file="57epigenomes.median_expr.txt") -------------------------------------------------------------------------------- /Fig1_S2/process_RNAseq_mouse.R: -------------------------------------------------------------------------------- 1 | a=read.delim(gzfile("mouse_FPKMs.tsv.gz"), F) 2 | colnames(a)[1]="gene_id" 3 | a[,1]=substring(a[,1],1,18) 4 | a$median=apply(a[,2:ncol(a)],1,median) 5 | write.table(a[,c("gene_id","median")],quote=F,row.names=F,col.names=F,sep="\t", file="mouse.median_expr.txt") -------------------------------------------------------------------------------- /Fig1_S2/runme.sh: -------------------------------------------------------------------------------- 1 | ########### MOST OF THESE STEPS HAVE PRECOMPUTED RESULTS 2 | 3 | #run the following in the base Xpresso folder to retrieve these precomputed results: 4 | wget -r -np -nH --reject "index.html*" --cut-dirs 5 https://krishna.gs.washington.edu/content/members/vagar/Xpresso/data/datasets/ 5 | 6 | ########### EXTRACT PROMOTERS FROM FANTOM5 CAGE PEAKS ############ 7 | 8 | # download human CAGE annotations 9 | # the original link (now broken, stored in "datasets/ was downloaded from here) 10 | # wget http://fantom.gsc.riken.jp/5/datahub/hg38/peaks/hg38.cage_peak_phase1and2combined.bb http://fantom.gsc.riken.jp/5/datahub/mm10/peaks/mm10.cage_peak_phase1and2combined.bb 11 | 12 | # the revised link can be found here: 13 | wget 
http://fantom.gsc.riken.jp/5/datahub/hg38/peaks/hg38.cage_peak.bb http://fantom.gsc.riken.jp/5/datahub/mm10/peaks/mm10.cage_peak.bb 14 | bigBedToBed hg38.cage_peak_phase1and2combined.bb hg38.cage_peak_phase1and2combined.bed 15 | bigBedToBed mm10.cage_peak_phase1and2combined.bb mm10.cage_peak_phase1and2combined.bed 16 | 17 | # extracts best peak for each gene, removes chrY/M genes 18 | grep -e 'p1@' hg38.cage_peak_phase1and2combined.bed | \ 19 | perl -ne 'chomp; @a=split /\t/; $a[0]=substr($a[0],3); $mid = int($a[-2]); $start=$mid-10000; $stop=$mid+10000; print join("\t",$a[0],$start,$stop,$a[3],0,$a[5])."\n" if $start > 0;' | \ 20 | grep -v -P "^Y|^M" >hg38_cage_promoters.bed 21 | grep -e 'p1@' mm10.cage_peak_phase1and2combined.bed | \ 22 | perl -ne 'chomp; @a=split /\t/; $mid = int($a[-2]); $start=$mid-10000; $stop=$mid+10000; print join("\t",$a[0],$start,$stop,$a[3],0,$a[5])."\n" if $start > 0;' | \ 23 | grep -v -P "^chrY|^chrM" >mm10_cage_promoters.bed 24 | 25 | #acquired 2 additional tables from BioMart and HGNC in addition to these for the mouse 26 | wget http://www.informatics.jax.org/downloads/reports/MGI_Gene_Model_Coord.rpt http://www.informatics.jax.org/downloads/reports/MGI_EntrezGene.rpt 27 | # converts IDs of protein-coding genes into Ensembl IDs for top promoter CAGE peak 28 | ./geneName2Ensembl.pl 29 | # acquire Ouyang_mESC_RPKM.txt from Supplementary table of Ouyang et al. 
30 | ./geneName2EnsemblMouse.pl 31 | 32 | # extract CAGE-revised promoter sequence from hg38 genome -- REQUIRES DOWNLOAD OF HG38 AND MM10 GENOMES 33 | bedtools getfasta -s -name -fi human_hs38_noAlt/whole_genome.fa -bed hg38_cage_promoters_ensemblID.bed -fo hg38_cage_promoters_ensemblID.fa 34 | gzip hg38_cage_promoters_ensemblID.fa 35 | bedtools getfasta -s -name -fi mus_musculus/mm10.fa -bed mm10_cage_promoters_ensemblID.bed -fo mm10_cage_promoters_ensemblID.fa 36 | gzip mm10_cage_promoters_ensemblID.fa 37 | 38 | 39 | ########### EXTRACT PROMOTERS FROM ENSEMBL ############ 40 | 41 | # download human/mouse gene annotations on hg38/mm10 42 | wget ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz 43 | wget ftp://ftp.ensembl.org/pub/release-90/gtf/mus_musculus/Mus_musculus.GRCm38.90.gtf.gz 44 | #Ensembl 90 on hg19 45 | wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_27/GRCh37_mapping/gencode.v27lift37.basic.annotation.gtf.gz 46 | 47 | # choose 1 representative transcript for each protein-coding gene, keep chrX or chr[1..22] genes only 48 | ./choose_reference_genes.pl Homo_sapiens.GRCh38.90.gtf.gz | gzip -c >Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz 49 | ./choose_reference_genes_forhg19.pl gencode.v27lift37.basic.annotation.gtf.gz | gzip -c >Homo_sapiens.hg19.90.chosenTranscript.gtf.gz 50 | ./choose_reference_genes.pl Mus_musculus.GRCm38.90.gtf.gz | gzip -c >Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz 51 | zgrep transcript Homo_sapiens.hg19.90.chosenTranscript.gtf.gz | gzip -c >Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz 52 | 53 | # generate input file for UCSC genome browser, extract 5' UTR, ORF, and 3' UTR sequences using these files using the Table Browser 54 | perl -ne '@a = split; $a[-1] = "gene_id \"$a[-1]\"; transcript_id \"$a[-1]\""; print "chr".join("\t", @a), "\n";' \ 55 | <(zcat Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz) | \ 56 | gzip -c >Homo_sapiens.GRCh38.90.chr.gtf.gz 57 | 
perl -ne '@a = split; $a[-1] = "gene_id \"$a[-1]\"; transcript_id \"$a[-1]\""; print "chr".join("\t", @a), "\n";' \ 58 | <(zcat Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz) | \ 59 | gzip -c >Mus_musculus.GRCm3.90.chr.gtf.gz 60 | 61 | # process into BED and extract +/- 10Kb region surrounding TSS 62 | ./extract_promoters.pl Homo_sapiens.GRCh38.90.chosenTranscript.gtf.gz >hg38_promoters.bed 63 | ./extract_promoters.pl Mus_musculus.GRCm3.90.chosenTranscript.gtf.gz mouse >mm10_promoters.bed 64 | 65 | # extract Ensembl-annotated promoter sequence from hg38 genome 66 | bedtools getfasta -s -name -fi human_hs38_noAlt/whole_genome.fa -bed hg38_promoters.bed -fo hg38_promoters.fa 67 | gzip hg38_promoters.fa 68 | bedtools getfasta -s -name -fi mus_musculus/mm10.fa -bed mm10_promoters.bed -fo mm10_promoters.fa 69 | gzip mm10_promoters.fa 70 | 71 | # histone genes to filter out, not quantified correctly due to lack of poly(A) tail 72 | grep HIST ensembl2geneName_v90.txt | cut -f 1 >mask_histone_genes.txt 73 | grep Hist ensembl2geneName_v90_mm10.txt | cut -f 1 >mask_histone_genes_mm10.txt 74 | 75 | ########### COLLECT & PROCESS GENE EXPRESSION DATA ############ 76 | 77 | # download pre-processed RNA-seq data from 56 cell types (+1 universal reference) 78 | wget http://egg2.wustl.edu/roadmap/data/byDataType/rna/expression/57epigenomes.RPKM.pc.gz http://egg2.wustl.edu/roadmap/data/byDataType/rna/expression/EG.name.txt 79 | # extract median expression values 80 | Rscript process_RNAseq.R 81 | cut -f 1,56 <(zcat 57epigenomes.RPKM.pc.gz) | tail -n+2 >57epigenomes.K562.txt 82 | cut -f 1,50 <(zcat 57epigenomes.RPKM.pc.gz) | tail -n+2 >57epigenomes.GM12878.txt 83 | 84 | cut -f 11 files_geneQuant_Rep1.txt | tail -n+2 >urls.txt 85 | while read p; do qsub "wget $p"; done $X.FPKM.tsv.gz; } done 88 | paste *tsv | cut -f 1,$(echo `seq 2 2 1000` | perl -ne '@a=split / /, $_; print join(",",@a);') | sort | gzip -c >mouse_FPKMs.tsv.gz 89 | Rscript process_RNAseq_mouse.R #generates 
mouse.median_expr.txt 90 | 91 | ########### GENERATE TRAINING/VALIDATION/TEST SET AND OPTIMIZE ############ 92 | 93 | #generate training/validation/test sets 94 | perl generate_training_input.pl 57epigenomes.median_expr.txt | gzip -c >Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz 95 | perl generate_training_input.pl 57epigenomes.K562.txt | gzip -c >Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz 96 | perl generate_training_input.pl 57epigenomes.GM12878.txt | gzip -c >Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz 97 | perl generate_training_input.pl mouse.median_expr.txt | gzip -c >Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz 98 | perl generate_training_input.pl Ouyang_mouseESC_RPKM_ensemblID.txt | gzip -c >Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz 99 | 100 | python setup_training_files.py -t 1000 -v 1000 Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_1KTest 101 | python setup_training_files.py -t 1000 -v 1000 Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_1KTest_Mouse 102 | 103 | #run hyperparameter search for ~1000 iterations on a fast GPU (can take ~1-2 days to run) 104 | python Xpresso.py tune tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest/ 105 | 106 | Rscript Fig1B_S2A.R FILE1 FILE2 107 | 108 | for x in {1..10}; do { python Xpresso.py test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest/ >trial_$x.txt; } done & 109 | 110 | Rscript Fig1C.R FILE1 111 | -------------------------------------------------------------------------------- /Fig1_S2/setup_training_files.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py 2 | import numpy.random as npr 3 | import numpy as np 4 | from optparse import OptionParser 5 | import pandas as pd 6 | from sklearn import preprocessing 7 | from sklearn.model_selection import KFold 8 | 9 | def main(): 10 | usage = 'usage: %prog [options] ' 11 | parser = OptionParser(usage) 12 | parser.add_option('-t', 
dest='testCount', default=1000, type='int', help='Number of test examples: [Default: %default]') 13 | parser.add_option('-v', dest='validCount', default=1000, type='int', help='Number of validation examples: [Default: %default]') 14 | parser.add_option('--cv', dest='crossVal', default=False, action='store_true', help='Generate samples for 10-fold cross-validated predictions? [Default: %default]') 15 | parser.add_option('--orthologs', dest='orthologMode', default=False, action='store_true', help='Mouse file to prepare human and mouse 1-1 ortholog set (Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz recommended to accompany human data_file): [Default: %default]') 16 | parser.add_option('--over', dest='overwrite', default=False, action='store_true', help='Overwrite directory? [Default: %default]') 17 | (options,args) = parser.parse_args() 18 | 19 | if len(args) != 2: 20 | print(args) 21 | parser.error('Must provide data file and output directory') 22 | else: 23 | data_file = args[0] 24 | out_dir = args[1] 25 | compress_args = {'compression': 'gzip', 'compression_opts': 1} 26 | trainfile = os.path.join(out_dir, 'train.h5') 27 | validfile = os.path.join(out_dir, 'valid.h5') 28 | testfile = os.path.join(out_dir, 'test.h5') 29 | 30 | if options.orthologMode: 31 | trainfile = os.path.join(out_dir, 'train_human1to1.h5') 32 | validfile = os.path.join(out_dir, 'valid_human1to1.h5') 33 | testfile = os.path.join(out_dir, 'test_human1to1.h5') 34 | trainfile2 = os.path.join(out_dir, 'train_mouse1to1.h5') 35 | validfile2 = os.path.join(out_dir, 'valid_mouse1to1.h5') 36 | testfile2 = os.path.join(out_dir, 'test_mouse1to1.h5') 37 | 38 | if options.overwrite or not os.path.exists(out_dir): 39 | if not os.path.exists(out_dir): 40 | os.mkdir(out_dir) 41 | 42 | # load data 43 | promoters, halflifedata, labels, geneNames = preprocess(data_file, options.orthologMode) 44 | 45 | # check that the sum is valid 46 | assert(options.testCount + options.validCount <= promoters.shape[0]) 47 | 
test_count = options.testCount 48 | valid_count = options.validCount 49 | 50 | train_count = promoters.shape[0] - test_count - valid_count 51 | 52 | if options.crossVal: 53 | print('running 10-fold cross val w/ %d sequences ' % promoters.shape[0]) 54 | kf = KFold(n_splits=10, random_state=42, shuffle=False) 55 | fold = 0 56 | for train_index, test_index in kf.split(promoters): #keep aside 1000 examples of train indices for validation set 57 | fold += 1 58 | print('fold %d' % fold) 59 | h5f_train = h5py.File(os.path.join(out_dir, str(fold)+'train.h5'), 'w') 60 | h5f_valid = h5py.File(os.path.join(out_dir, str(fold)+'valid.h5'), 'w') 61 | h5f_test = h5py.File(os.path.join(out_dir, str(fold)+'test.h5'), 'w') 62 | valid_index = train_index[0:1000] 63 | train_index = train_index[1000:len(train_index)] 64 | h5f_train.create_dataset('data' , data=halflifedata[train_index,:], **compress_args) 65 | h5f_train.create_dataset('promoter', data=promoters[train_index,:], **compress_args) 66 | h5f_train.create_dataset('label' , data=labels[train_index], **compress_args) 67 | h5f_train.create_dataset('geneName' , data=np.array(geneNames)[train_index].tolist(), **compress_args) 68 | h5f_train.close() 69 | h5f_valid.create_dataset('data' , data=halflifedata[valid_index,:], **compress_args) 70 | h5f_valid.create_dataset('promoter', data=promoters[valid_index,:], **compress_args) 71 | h5f_valid.create_dataset('label' , data=labels[valid_index], **compress_args) 72 | h5f_valid.create_dataset('geneName' , data=np.array(geneNames)[valid_index].tolist(), **compress_args) 73 | h5f_valid.close() 74 | h5f_test.create_dataset('data' , data=halflifedata[test_index,:], **compress_args) 75 | h5f_test.create_dataset('promoter', data=promoters[test_index,:], **compress_args) 76 | h5f_test.create_dataset('label' , data=labels[test_index], **compress_args) 77 | h5f_test.create_dataset('geneName' , data=np.array(geneNames)[test_index].tolist(), **compress_args) 78 | h5f_test.close() 79 | else: 80 | 
print('%d training sequences ' % train_count) 81 | print('%d test sequences ' % test_count) 82 | print('%d validation sequences ' % valid_count) 83 | h5f_train = h5py.File(trainfile, 'w') 84 | h5f_valid = h5py.File(validfile, 'w') 85 | h5f_test = h5py.File(testfile, 'w') 86 | i = 0 87 | if train_count > 0: 88 | h5f_train.create_dataset('data' , data=halflifedata[i:i+train_count,:], **compress_args) 89 | h5f_train.create_dataset('promoter', data=promoters[i:i+train_count,:], **compress_args) 90 | h5f_train.create_dataset('label' , data=labels[i:i+train_count], **compress_args) 91 | h5f_train.create_dataset('geneName' , data=geneNames[i:i+train_count], **compress_args) 92 | h5f_train.close() 93 | i += train_count 94 | if valid_count > 0: 95 | h5f_valid.create_dataset('data' , data=halflifedata[i:i+valid_count,:], **compress_args) 96 | h5f_valid.create_dataset('promoter', data=promoters[i:i+valid_count,:], **compress_args) 97 | h5f_valid.create_dataset('label' , data=labels[i:i+valid_count], **compress_args) 98 | h5f_valid.create_dataset('geneName' , data=geneNames[i:i+valid_count], **compress_args) 99 | h5f_valid.close() 100 | i += valid_count 101 | if test_count > 0: 102 | h5f_test.create_dataset('data' , data=halflifedata[i:i+test_count,:], **compress_args) 103 | h5f_test.create_dataset('promoter', data=promoters[i:i+test_count,:], **compress_args) 104 | h5f_test.create_dataset('label' , data=labels[i:i+test_count], **compress_args) 105 | h5f_test.create_dataset('geneName' , data=geneNames[i:i+test_count], **compress_args) 106 | h5f_test.close() 107 | 108 | if options.orthologMode: 109 | print("Finding 1-1 orthologs...") 110 | h5f_train = h5py.File(trainfile2, 'w') 111 | h5f_valid = h5py.File(validfile2, 'w') 112 | h5f_test = h5py.File(testfile2, 'w') 113 | promoters2, halflifedata2, labels2, geneNames2 = preprocess(options.orthologMode, options.orthologMode) 114 | orthologs = pd.read_table("human2mouse_one2one_orthologs.txt", header=None) 115 | 116 | i = 0 117 | 
orthoids = orthologs[orthologs[0].isin(geneNames[i:i+train_count])][1] #transform human to mouse IDs 118 | idxs = np.where(np.isin(geneNames2,orthoids))[0].tolist() 119 | h5f_train.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 120 | h5f_train.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 121 | h5f_train.create_dataset('label' , data=labels2[idxs], **compress_args) 122 | h5f_train.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 123 | print('%d 1-1 mouse orthologs found for training set' % labels2[idxs].shape) 124 | h5f_train.close() 125 | i += train_count 126 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+valid_count])][1] 127 | idxs = np.where(np.isin(geneNames2,orthoids))[0].tolist() 128 | h5f_valid.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 129 | h5f_valid.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 130 | h5f_valid.create_dataset('label' , data=labels2[idxs], **compress_args) 131 | h5f_valid.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 132 | print('%d 1-1 mouse orthologs found validation set' % labels2[idxs].shape) 133 | h5f_valid.close() 134 | i += valid_count 135 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+test_count])][1] 136 | idxs = np.isin(geneNames2,orthoids) 137 | h5f_test.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 138 | h5f_test.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 139 | h5f_test.create_dataset('label' , data=labels2[idxs], **compress_args) 140 | h5f_test.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 141 | print('%d 1-1 mouse orthologs found for test set ' % labels2[idxs].shape) 142 | h5f_test.close() 143 | else: 144 | parser.error('Nothing done...Run with --over to overwrite') 145 | 146 | def one_hot(seq): 147 | seq_len = len(seq.item(0)) 148 | seqindex = {'A':0, 
'C':1, 'G':2, 'T':3, 'a':0, 'c':1, 'g':2, 't':3} 149 | seq_vec = np.zeros((len(seq),seq_len,4), dtype='bool') 150 | for i in range(len(seq)): 151 | thisseq = seq.item(i) 152 | for j in range(seq_len): 153 | try: 154 | seq_vec[i,j,seqindex[thisseq[j]]] = 1 155 | except: 156 | pass 157 | return seq_vec 158 | 159 | def preprocess(data_file, orthologMode): 160 | table = pd.read_table(data_file, index_col=0) 161 | maskedIDs = pd.read_table("mask_histone_genes_mm10.txt", header=None) #mask histone genes, chrY genes already filtered out 162 | maskedIDs2 = pd.read_table("mask_histone_genes.txt", header=None) #mask histone genes, chrY genes already filtered out 163 | table = table[~table.index.isin(maskedIDs[0])] #remove rows corresponding to chrY or histone sequences 164 | table = table[~table.index.isin(maskedIDs2[0])] #remove rows corresponding to chrY or histone sequences 165 | if orthologMode: 166 | orthologs = pd.read_table("1to1_orthologs_expression.txt", header=None) 167 | table = table[table.index.isin(orthologs[[0,1]].values.flatten())] #must match human or mouse 1-1 ortholog IDs 168 | table[table.columns[range(0,5)+[8]]] = np.log10(table[table.columns[range(0,5)+[8]]]+0.1) 169 | table = table.sample(table.shape[0], replace=False, random_state=1) 170 | table[table.columns[range(0,9)]] = preprocessing.scale(table[table.columns[range(0,9)]]) 171 | print("\nPre-processed data...one-hot encoding...") 172 | promoters = one_hot(table['PROMOTER'].as_matrix()) 173 | halflifedata = table[table.columns[range(1,9)]].as_matrix() 174 | labels = table['EXPRESSION'].as_matrix() 175 | geneNames = list(table.index) 176 | print("Processed data from %s" % data_file) 177 | return promoters, halflifedata, labels, geneNames 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /Fig2/57epigenomes.median_expr.txt: -------------------------------------------------------------------------------- 1 | 
../datasets/57epigenomes.median_expr.txt -------------------------------------------------------------------------------- /Fig2/Fig2A.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | setwd("subsampled_10fold/") 5 | 6 | files = list.files(path='.', pattern='trial', recursive=T) 7 | # files = files[grepl("^2000|4000|6000|8000|10000|14000|16000",files)] 8 | table <- as.data.frame(do.call("rbind", lapply(files, FUN=function(file){ 9 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 10 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 11 | tmp$sample = as.numeric(dirname(file)) 12 | tmp$rep = as.numeric(strsplit(basename(file), "_")[[1]][1]) 13 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 14 | names(tmp) = c("r2","MSE","sample","rep","trial") 15 | tmp 16 | }))) 17 | 18 | table = as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 19 | table = do.call("rbind", lapply(unique(table$sample), function(sub) { do.call("rbind", lapply(unique(table$rep), function(x) { tmp=table[table$rep==x & table$sample==sub,]; tmp[which( tmp$MSE == min(tmp$MSE) ),] } )) }) ) 20 | 21 | table = as.data.frame(aggregate(.~sample,table,function(x) c(mean=mean(x), sd=sd(x)))) 22 | table 23 | table[,2][,2]=table[,2][,2]/sqrt(10) #std err 24 | table[,3][,2]=table[,3][,2]/sqrt(10) #std err 25 | table 26 | 27 | pdf("Fig2A.pdf", height=3, width=5) 28 | obj1 = xyplot(MSE[,1] ~ sample, table, 29 | panel = function(x, y, ...){ 30 | panel.arrows(x, y, x, table[,3][,1]+1.96*table[,3][,2], length = 0, angle = 90) 31 | panel.arrows(x, y, x, table[,3][,1]-1.96*table[,3][,2], length = 0, angle = 90) 32 | panel.xyplot(x, y, ...) 
33 | }, type = "o" , ylim=c(0.4,0.5), lwd=2, scales = list(x = list(at = seq(2000,16000,2000) ))) #limits = c(-500,500) 34 | obj2 = xyplot(r2[,1] ~ sample, table, 35 | panel = function(x, y, ...){ 36 | panel.arrows(x, y, x, table[,2][,1]+1.96*table[,2][,2], length = 0, angle = 90) 37 | panel.arrows(x, y, x, table[,2][,1]-1.96*table[,2][,2], length = 0, angle = 90) 38 | panel.xyplot(x, y, ...) 39 | }, type = "o", lwd=2, ylim=c(0.5,0.6)) 40 | doubleYScale(obj1, obj2, add.ylab2 = TRUE) 41 | dev.off() 42 | -------------------------------------------------------------------------------- /Fig2/Fig2BC.R: -------------------------------------------------------------------------------- 1 | library(LSD) 2 | library(data.table) 3 | 4 | ############################# 5 | ##Species-specific analysis## 6 | ############################# 7 | 8 | pdf("Fig2B.pdf") 9 | a=read.delim("pM10Kb_1KTest/predictions.txt") 10 | actual=read.delim("57epigenomes.median_expr.txt",F) 11 | colnames(actual)=c("Gene","UnscaledExpr") 12 | actual$UnscaledExpr=log10(actual$UnscaledExpr+0.1) 13 | a=merge(a,actual,by=1) 14 | model=lm(UnscaledExpr~Actual,a) 15 | a$Pred=predict(model,newdata=data.frame(Actual=a$Pred)) 16 | a$Actual=predict(model) 17 | 18 | "Human r^2:" 19 | cor(a$Pred,a$Actual)^2 20 | heatscatter(a$Pred, a$Actual, bty='n', xlim=c(-1,3), ylim=c(-1,3), cex.axis=2, cex.lab=2, las=1) 21 | a=read.delim("pM10Kb_1KTest_Mouse/predictions.txt") 22 | actual=read.delim("mouse.median_expr.txt",F) 23 | colnames(actual)=c("Gene","UnscaledExpr") 24 | actual$UnscaledExpr=log10(actual$UnscaledExpr+0.1) 25 | a=merge(a,actual,by=1) 26 | model=lm(UnscaledExpr~Actual,a) 27 | a$Pred=predict(model,newdata=data.frame(Actual=a$Pred)) 28 | a$Actual=predict(model) 29 | dev.off() 30 | 31 | "Mouse r^2:" 32 | cor(a$Pred,a$Actual)^2 33 | pdf("Fig2C.pdf") 34 | heatscatter(a$Pred, a$Actual, bty='n', xlim=c(-1,3), ylim=c(-1,3), cex.axis=2, cex.lab=2, las=1) 35 | dev.off() 36 | 
-------------------------------------------------------------------------------- /Fig2/Fig2D.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | getbest <- function(dir){ 5 | files = list.files(path=dir, pattern='.txt', full.names=T) 6 | sbtable <- as.data.frame(do.call("rbind", lapply(files, FUN=function(file){ 7 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 8 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 9 | tmp$file = strsplit(file, "/")[[1]][2] 10 | names(tmp) = c("r2","MSE","samples") 11 | tmp 12 | }))) 13 | 14 | sbtable$samples = as.character(sbtable$samples) 15 | sbtable = data.table(sbtable) 16 | sbtable = sbtable[ , .SD[which.min(MSE)], by = samples] 17 | sbtable 18 | } 19 | 20 | a=list() 21 | a[[1]]=getbest("ortholog_results/train_human_test_human/") 22 | a[[2]]=getbest("ortholog_results/train_human_test_mouse/") 23 | a[[3]]=getbest("ortholog_results/train_mouse_test_human/") 24 | a[[4]]=getbest("ortholog_results/train_mouse_test_mouse/") 25 | 26 | a=as.data.frame(do.call("rbind", a)) 27 | a$samples=as.factor(a$samples) 28 | 29 | a 30 | pdf("Fig2D.pdf", height=4, width=5) 31 | obj1 = xyplot(MSE ~ samples, a, type = "p", pch=19, lwd=2, scales=list(x=list(rot=45))) 32 | obj2 = xyplot(r2 ~ samples, a, type = "p", pch=19, lwd=2, scales=list(x=list(rot=45))) 33 | doubleYScale(obj1, obj2, add.ylab2 = TRUE) 34 | dev.off() 35 | -------------------------------------------------------------------------------- /Fig2/Fig2E.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig2/Fig2E.pdf -------------------------------------------------------------------------------- /Fig2/Fig2EFG.R: -------------------------------------------------------------------------------- 1 | library(LSD) 2 | library(data.table) 
3 | library(ROCR) 4 | 5 | #################### 6 | ##One2One analysis## 7 | #################### 8 | a=read.delim("human2mouse_one2one_orthologs.txt",F) 9 | b=fread("zcat Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1-2",header=T,data.table=F,sep="\t") 10 | c=fread("zcat Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1-2",header=T,data.table=F,sep="\t") 11 | b$EXPRESSION = log10(b$EXPRESSION+0.1) 12 | c$EXPRESSION = log10(c$EXPRESSION+0.1) 13 | d=merge(a,c,by.x=2,by.y=1) 14 | d=merge(d,b,by.x=2,by.y=1) 15 | 16 | colnames(d)=c("hid","mid","type","mexpr","hexpr") 17 | head(d) 18 | 19 | say(nrow(d), "genes") 20 | say("Pearson corr =", cor(d$hexpr,d$mexpr)) 21 | 22 | pdf("Fig2E.pdf") 23 | par(mar=c(7,7,5,5), mgp = c(5, 1, 0)) 24 | plot.ecdf(b$EXPRESSION, xlim=c(-1,3), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="purple", 25 | ylab="Cumulative fraction", xlab="log10(Median expression level + 0.1)", col.01line = "white", lwd=2, cex.axis=2, cex.lab=2, bty="n", las=1) 26 | plot.ecdf(c$EXPRESSION, verticals= TRUE, do.points = FALSE, add = TRUE, col = "red", col.01line = "white", lwd=2, bty="n") 27 | plot.ecdf(d$hexpr, verticals= TRUE, do.points = FALSE, add = TRUE, col = "blue", col.01line = "white", lwd=2, bty="n") 28 | plot.ecdf(d$mexpr, verticals= TRUE, do.points = FALSE, add = TRUE, col = "cyan", col.01line = "white", lwd=2, bty="n") 29 | 30 | legend("bottomright", bg="white", bty="n", legend = 31 | c( paste("human (", length(b$EXPRESSION), ")") , paste("mouse (", length(c$EXPRESSION), ")"), 32 | paste("human, one-to-one orthologs (", length(d$hexpr), ")"), paste("mouse, one-to-one orthologs (", length(d$mexpr), ")")), 33 | text.col = c("purple","red","blue","cyan")) 34 | dev.off() 35 | 36 | pdf("Fig2F.pdf") 37 | heatscatter(d$hexpr, d$mexpr, xlab="Human", ylab="Mouse", bty='n', cex=0.3, xlim=c(-1,3), ylim=c(-1,3), cex.axis=2, cex.lab=2, las=1) 38 | dev.off() 39 | 40 | writefile(d,"1to1_orthologs_expression.txt", col.names=F) 41 | 42 | 
pdf("Fig2G.pdf") 43 | b=read.delim("all_crossvalidated_predictions.txt") 44 | c=read.delim("all_crossvalidated_predictions_mouse.txt") 45 | d=merge(d,b,by=1) 46 | colnames(d)[4:5]=c("mouse_expr","human_expr") 47 | colnames(d)[6:7]=c("human_pred","human_Actual") 48 | e=merge(d,c,by.x=2,by.y=1) 49 | colnames(e)[8:9]=c("mouse_pred","mouse_Actual") 50 | attach(e) 51 | head(e) 52 | 53 | e$diff = mouse_expr-human_expr 54 | f=e[abs(e$diff) > 1,] 55 | f$mouseOrHuman = ifelse(f$diff > 0, 1, 0) 56 | "human-specific" 57 | sum(f$diff < 0) 58 | "mouse-specific" 59 | sum(f$diff > 0) 60 | head(f) 61 | plot(performance( prediction( f$mouse_pred-f$human_pred, f$mouseOrHuman), "tpr", "fpr"), col="blue", las=1, cex.axis=2, cex.lab=2, bty='n') 62 | text(0.2, 1, labels = paste("AUC = ", round(performance( prediction(f$mouse_pred-f$human_pred, f$mouseOrHuman), "auc")@y.values[[1]],2), ' (n = ', nrow(f), ')', sep=''), offset = 1.5, col="black") 63 | abline(0,1,col="grey") 64 | dev.off() 65 | -------------------------------------------------------------------------------- /Fig2/Fig2F.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig2/Fig2F.pdf -------------------------------------------------------------------------------- /Fig2/Fig2G.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig2/Fig2G.pdf -------------------------------------------------------------------------------- /Fig2/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig2/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: 
-------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig2/Xpresso.py: -------------------------------------------------------------------------------- 1 | ../Fig1_S2/Xpresso.py -------------------------------------------------------------------------------- /Fig2/all_crossvalidated_predictions.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions.txt -------------------------------------------------------------------------------- /Fig2/all_crossvalidated_predictions_mouse.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_mouse.txt -------------------------------------------------------------------------------- /Fig2/ensembl2geneName_v90_mm10.txt: -------------------------------------------------------------------------------- 1 | ../datasets/ensembl2geneName_v90_mm10.txt -------------------------------------------------------------------------------- /Fig2/mouse.median_expr.txt: -------------------------------------------------------------------------------- 1 | ../datasets/mouse.median_expr.txt -------------------------------------------------------------------------------- /Fig2/ortholog_results: -------------------------------------------------------------------------------- 1 | ../datasets/ortholog_results/ -------------------------------------------------------------------------------- /Fig2/pM10Kb_1KTest: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest -------------------------------------------------------------------------------- /Fig2/pM10Kb_1KTest_Mouse: -------------------------------------------------------------------------------- 1 | 
../datasets/pM10Kb_1KTest_Mouse/ -------------------------------------------------------------------------------- /Fig2/pM10Kb_1KTest_one2oneOrthologs: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_one2oneOrthologs/ -------------------------------------------------------------------------------- /Fig2/runme.sh: -------------------------------------------------------------------------------- 1 | ### Ran from datasets/ directory on GPU 2 | mkdir subsampled_10fold 3 | for x in `seq 2000 2000 16000`; do { echo $x; python subsample.py $x pM10Kb_1KTest subsampled_10fold/$x; } done 4 | for z in `seq 2000 2000 16000`; do { for y in {0..9}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x train tpe_1K_10epochs_optimized_0to20K.hyperopt subsampled_10fold/$z/ >subsampled_10fold/$z/$y\_trial$x.txt; } done } done } done 5 | 6 | Rscript Fig2A.R 7 | Rscript Fig2BC.R 8 | 9 | #Acquired 1 to 1 ortholog predictions from Ensembl BioMart 10 | grep one2one human2mouse_orthologs.txt >human2mouse_one2one_orthologs.txt 11 | python setup_training_files.py -t 1000 -v 1000 --orthologs Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_1KTest_one2oneOrthologs 12 | 13 | mkdir ortholog_results 14 | mkdir ortholog_results/train_human_test_human 15 | mkdir ortholog_results/train_mouse_test_human 16 | mkdir ortholog_results/train_human_test_mouse 17 | mkdir ortholog_results/train_mouse_test_mouse 18 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_human_test_human/trial_$x.txt; } done & 19 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_human_test_mouse/trial_$x.txt; } done & 20 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt 
pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_mouse_test_human/trial_$x.txt; } done & 21 | for x in {1..10}; do { python Xpresso.py train tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_one2oneOrthologs/ >ortholog_results/train_mouse_test_mouse/trial_$x.txt; } done & 22 | 23 | #Stored results from runs in ortholog_results/ 24 | Rscript Fig2D.R 25 | Rscript Fig2EFG.R 26 | -------------------------------------------------------------------------------- /Fig2/setup_training_files.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py 2 | import numpy.random as npr 3 | import numpy as np 4 | from optparse import OptionParser 5 | import pandas as pd 6 | from sklearn import preprocessing 7 | from sklearn.model_selection import KFold 8 | 9 | def main(): 10 | usage = 'usage: %prog [options] ' 11 | parser = OptionParser(usage) 12 | parser.add_option('-t', dest='testCount', default=1000, type='int', help='Number of test examples: [Default: %default]') 13 | parser.add_option('-v', dest='validCount', default=1000, type='int', help='Number of validation examples: [Default: %default]') 14 | parser.add_option('--cv', dest='crossVal', default=False, action='store_true', help='Generate samples for 10-fold cross-validated predictions? [Default: %default]') 15 | parser.add_option('--orthologs', dest='orthologMode', default=False, action='store_true', help='Mouse file to prepare human and mouse 1-1 ortholog set (Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz recommended to accompany human data_file): [Default: %default]') 16 | parser.add_option('--over', dest='overwrite', default=False, action='store_true', help='Overwrite directory? 
[Default: %default]') 17 | (options,args) = parser.parse_args() 18 | 19 | if len(args) != 2: 20 | print(args) 21 | parser.error('Must provide data file and output directory') 22 | else: 23 | data_file = args[0] 24 | out_dir = args[1] 25 | compress_args = {'compression': 'gzip', 'compression_opts': 1} 26 | trainfile = os.path.join(out_dir, 'train.h5') 27 | validfile = os.path.join(out_dir, 'valid.h5') 28 | testfile = os.path.join(out_dir, 'test.h5') 29 | 30 | if options.orthologMode: 31 | trainfile = os.path.join(out_dir, 'train_human1to1.h5') 32 | validfile = os.path.join(out_dir, 'valid_human1to1.h5') 33 | testfile = os.path.join(out_dir, 'test_human1to1.h5') 34 | trainfile2 = os.path.join(out_dir, 'train_mouse1to1.h5') 35 | validfile2 = os.path.join(out_dir, 'valid_mouse1to1.h5') 36 | testfile2 = os.path.join(out_dir, 'test_mouse1to1.h5') 37 | 38 | if options.overwrite or not os.path.exists(out_dir): 39 | if not os.path.exists(out_dir): 40 | os.mkdir(out_dir) 41 | 42 | # load data 43 | promoters, halflifedata, labels, geneNames = preprocess(data_file, options.orthologMode) 44 | 45 | # check that the sum is valid 46 | assert(options.testCount + options.validCount <= promoters.shape[0]) 47 | test_count = options.testCount 48 | valid_count = options.validCount 49 | 50 | train_count = promoters.shape[0] - test_count - valid_count 51 | 52 | if options.crossVal: 53 | print('running 10-fold cross val w/ %d sequences ' % promoters.shape[0]) 54 | kf = KFold(n_splits=10, random_state=42, shuffle=False) 55 | fold = 0 56 | for train_index, test_index in kf.split(promoters): #keep aside 1000 examples of train indices for validation set 57 | fold += 1 58 | print('fold %d' % fold) 59 | h5f_train = h5py.File(os.path.join(out_dir, str(fold)+'train.h5'), 'w') 60 | h5f_valid = h5py.File(os.path.join(out_dir, str(fold)+'valid.h5'), 'w') 61 | h5f_test = h5py.File(os.path.join(out_dir, str(fold)+'test.h5'), 'w') 62 | valid_index = train_index[0:1000] 63 | train_index = 
train_index[1000:len(train_index)] 64 | h5f_train.create_dataset('data' , data=halflifedata[train_index,:], **compress_args) 65 | h5f_train.create_dataset('promoter', data=promoters[train_index,:], **compress_args) 66 | h5f_train.create_dataset('label' , data=labels[train_index], **compress_args) 67 | h5f_train.create_dataset('geneName' , data=np.array(geneNames)[train_index].tolist(), **compress_args) 68 | h5f_train.close() 69 | h5f_valid.create_dataset('data' , data=halflifedata[valid_index,:], **compress_args) 70 | h5f_valid.create_dataset('promoter', data=promoters[valid_index,:], **compress_args) 71 | h5f_valid.create_dataset('label' , data=labels[valid_index], **compress_args) 72 | h5f_valid.create_dataset('geneName' , data=np.array(geneNames)[valid_index].tolist(), **compress_args) 73 | h5f_valid.close() 74 | h5f_test.create_dataset('data' , data=halflifedata[test_index,:], **compress_args) 75 | h5f_test.create_dataset('promoter', data=promoters[test_index,:], **compress_args) 76 | h5f_test.create_dataset('label' , data=labels[test_index], **compress_args) 77 | h5f_test.create_dataset('geneName' , data=np.array(geneNames)[test_index].tolist(), **compress_args) 78 | h5f_test.close() 79 | else: 80 | print('%d training sequences ' % train_count) 81 | print('%d test sequences ' % test_count) 82 | print('%d validation sequences ' % valid_count) 83 | h5f_train = h5py.File(trainfile, 'w') 84 | h5f_valid = h5py.File(validfile, 'w') 85 | h5f_test = h5py.File(testfile, 'w') 86 | i = 0 87 | if train_count > 0: 88 | h5f_train.create_dataset('data' , data=halflifedata[i:i+train_count,:], **compress_args) 89 | h5f_train.create_dataset('promoter', data=promoters[i:i+train_count,:], **compress_args) 90 | h5f_train.create_dataset('label' , data=labels[i:i+train_count], **compress_args) 91 | h5f_train.create_dataset('geneName' , data=geneNames[i:i+train_count], **compress_args) 92 | h5f_train.close() 93 | i += train_count 94 | if valid_count > 0: 95 | 
h5f_valid.create_dataset('data' , data=halflifedata[i:i+valid_count,:], **compress_args) 96 | h5f_valid.create_dataset('promoter', data=promoters[i:i+valid_count,:], **compress_args) 97 | h5f_valid.create_dataset('label' , data=labels[i:i+valid_count], **compress_args) 98 | h5f_valid.create_dataset('geneName' , data=geneNames[i:i+valid_count], **compress_args) 99 | h5f_valid.close() 100 | i += valid_count 101 | if test_count > 0: 102 | h5f_test.create_dataset('data' , data=halflifedata[i:i+test_count,:], **compress_args) 103 | h5f_test.create_dataset('promoter', data=promoters[i:i+test_count,:], **compress_args) 104 | h5f_test.create_dataset('label' , data=labels[i:i+test_count], **compress_args) 105 | h5f_test.create_dataset('geneName' , data=geneNames[i:i+test_count], **compress_args) 106 | h5f_test.close() 107 | 108 | if options.orthologMode: 109 | print("Finding 1-1 orthologs...") 110 | h5f_train = h5py.File(trainfile2, 'w') 111 | h5f_valid = h5py.File(validfile2, 'w') 112 | h5f_test = h5py.File(testfile2, 'w') 113 | promoters2, halflifedata2, labels2, geneNames2 = preprocess(options.orthologMode, options.orthologMode) 114 | orthologs = pd.read_table("human2mouse_one2one_orthologs.txt", header=None) 115 | 116 | i = 0 117 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+train_count])][1] #transform human to mouse IDs 118 | idxs = np.where(np.isin(geneNames2,orthoids))[0].tolist() 119 | h5f_train.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 120 | h5f_train.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 121 | h5f_train.create_dataset('label' , data=labels2[idxs], **compress_args) 122 | h5f_train.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 123 | print('%d 1-1 mouse orthologs found for training set' % labels2[idxs].shape) 124 | h5f_train.close() 125 | i += train_count 126 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+valid_count])][1] 127 | idxs = 
np.where(np.isin(geneNames2,orthoids))[0].tolist() 128 | h5f_valid.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 129 | h5f_valid.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 130 | h5f_valid.create_dataset('label' , data=labels2[idxs], **compress_args) 131 | h5f_valid.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 132 | print('%d 1-1 mouse orthologs found validation set' % labels2[idxs].shape) 133 | h5f_valid.close() 134 | i += valid_count 135 | orthoids = orthologs[orthologs[0].isin(geneNames[i:i+test_count])][1] 136 | idxs = np.isin(geneNames2,orthoids) 137 | h5f_test.create_dataset('data' , data=halflifedata2[idxs,:], **compress_args) 138 | h5f_test.create_dataset('promoter', data=promoters2[idxs,:], **compress_args) 139 | h5f_test.create_dataset('label' , data=labels2[idxs], **compress_args) 140 | h5f_test.create_dataset('geneName', data=np.array(geneNames2)[idxs].tolist(), **compress_args) 141 | print('%d 1-1 mouse orthologs found for test set ' % labels2[idxs].shape) 142 | h5f_test.close() 143 | else: 144 | parser.error('Nothing done...Run with --over to overwrite') 145 | 146 | def one_hot(seq): 147 | seq_len = len(seq.item(0)) 148 | seqindex = {'A':0, 'C':1, 'G':2, 'T':3, 'a':0, 'c':1, 'g':2, 't':3} 149 | seq_vec = np.zeros((len(seq),seq_len,4), dtype='bool') 150 | for i in range(len(seq)): 151 | thisseq = seq.item(i) 152 | for j in range(seq_len): 153 | try: 154 | seq_vec[i,j,seqindex[thisseq[j]]] = 1 155 | except: 156 | pass 157 | return seq_vec 158 | 159 | def preprocess(data_file, orthologMode): 160 | table = pd.read_table(data_file, index_col=0) 161 | maskedIDs = pd.read_table("mask_histone_genes_mm10.txt", header=None) #mask histone genes, chrY genes already filtered out 162 | maskedIDs2 = pd.read_table("mask_histone_genes.txt", header=None) #mask histone genes, chrY genes already filtered out 163 | table = table[~table.index.isin(maskedIDs[0])] #remove rows 
corresponding to chrY or histone sequences 164 | table = table[~table.index.isin(maskedIDs2[0])] #remove rows corresponding to chrY or histone sequences 165 | if orthologMode: 166 | orthologs = pd.read_table("1to1_orthologs_expression.txt", header=None) 167 | table = table[table.index.isin(orthologs[[0,1]].values.flatten())] #must match human or mouse 1-1 ortholog IDs 168 | table[table.columns[range(0,5)+[8]]] = np.log10(table[table.columns[range(0,5)+[8]]]+0.1) 169 | table = table.sample(table.shape[0], replace=False, random_state=1) 170 | table[table.columns[range(0,9)]] = preprocessing.scale(table[table.columns[range(0,9)]]) 171 | print("\nPre-processed data...one-hot encoding...") 172 | promoters = one_hot(table['PROMOTER'].as_matrix()) 173 | halflifedata = table[table.columns[range(1,9)]].as_matrix() 174 | labels = table['EXPRESSION'].as_matrix() 175 | geneNames = list(table.index) 176 | print("Processed data from %s" % data_file) 177 | return promoters, halflifedata, labels, geneNames 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /Fig2/subsample.py: -------------------------------------------------------------------------------- 1 | import sys, os, h5py 2 | from numpy.random import choice 3 | import numpy as np 4 | from optparse import OptionParser 5 | 6 | def main(): 7 | usage = 'usage: %prog [options] ' 8 | parser = OptionParser(usage) 9 | (options,args) = parser.parse_args() 10 | if len(args) != 3: 11 | print(args) 12 | sys.exit('Must provide number of subsamples, input directory, and output directory') 13 | counts, in_dir, out_dir = args 14 | counts = int(counts) 15 | if not os.path.exists(out_dir): 16 | os.mkdir(out_dir) 17 | file = h5py.File(os.path.join(in_dir, 'train.h5'), 'r') 18 | X_halflife, X_promoter, y, geneName = file['data'], file['promoter'], file['label'], file['geneName'] 19 | for i in range(10): 20 | print('sample %d' % i) 21 | 
os.symlink(os.path.join(os.path.realpath(in_dir), 'valid.h5'), os.path.join(out_dir, str(i)+'valid.h5')) 22 | os.symlink(os.path.join(os.path.realpath(in_dir), 'test.h5'), os.path.join(out_dir, str(i)+'test.h5')) 23 | h5f = h5py.File(os.path.join(out_dir, str(i)+'train.h5'), 'w') 24 | index = np.sort(choice(X_promoter.shape[0], size=counts, replace=False)) 25 | compress_args = {'compression': 'gzip', 'compression_opts': 1} 26 | h5f.create_dataset('data' , data=X_halflife[index,:], **compress_args) 27 | h5f.create_dataset('promoter', data=X_promoter[index,:,:], **compress_args) 28 | h5f.create_dataset('label' , data=y[index.tolist()], **compress_args) 29 | h5f.create_dataset('geneName', data=geneName[index.tolist()], **compress_args) 30 | h5f.close() 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /Fig2/subsampled_10fold: -------------------------------------------------------------------------------- 1 | ../datasets/subsampled_10fold/ -------------------------------------------------------------------------------- /Fig2/subsampling_10fold.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | setwd("subsampled_10fold/") 5 | 6 | files = list.files(path='.', pattern='trial', recursive=T) 7 | # files = files[grepl("^2000|4000|6000|8000|10000|14000|16000",files)] 8 | table <- as.data.frame(do.call("rbind", lapply(files, FUN=function(file){ 9 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 10 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 11 | tmp$sample = as.numeric(dirname(file)) 12 | tmp$rep = as.numeric(strsplit(basename(file), "_")[[1]][1]) 13 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 14 | names(tmp) = c("r2","MSE","sample","rep","trial") 15 | tmp 16 | }))) 17 | 18 | table = 
as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 19 | table = do.call("rbind", lapply(unique(table$sample), function(sub) { do.call("rbind", lapply(unique(table$rep), function(x) { tmp=table[table$rep==x & table$sample==sub,]; tmp[which( tmp$MSE == min(tmp$MSE) ),] } )) }) ) 20 | 21 | table = as.data.frame(aggregate(.~sample,table,function(x) c(mean=mean(x), sd=sd(x)))) 22 | table 23 | table[,2][,2]=table[,2][,2]/sqrt(10) #std err 24 | table[,3][,2]=table[,3][,2]/sqrt(10) #std err 25 | table 26 | 27 | pdf("subsample.pdf", height=3, width=5) 28 | obj1 = xyplot(MSE[,1] ~ sample, table, 29 | panel = function(x, y, ...){ 30 | panel.arrows(x, y, x, table[,3][,1]+1.96*table[,3][,2], length = 0, angle = 90) 31 | panel.arrows(x, y, x, table[,3][,1]-1.96*table[,3][,2], length = 0, angle = 90) 32 | panel.xyplot(x, y, ...) 33 | }, type = "o" , ylim=c(0.4,0.5), lwd=2, scales = list(x = list(at = seq(2000,16000,2000) ))) #limits = c(-500,500) 34 | obj2 = xyplot(r2[,1] ~ sample, table, 35 | panel = function(x, y, ...){ 36 | panel.arrows(x, y, x, table[,2][,1]+1.96*table[,2][,2], length = 0, angle = 90) 37 | panel.arrows(x, y, x, table[,2][,1]-1.96*table[,2][,2], length = 0, angle = 90) 38 | panel.xyplot(x, y, ...) 
39 | }, type = "o", lwd=2, ylim=c(0.5,0.6)) 40 | doubleYScale(obj1, obj2, add.ylab2 = TRUE) 41 | dev.off() 42 | -------------------------------------------------------------------------------- /Fig2/tpe_1K_10epochs_optimized_0to20K.hyperopt: -------------------------------------------------------------------------------- 1 | ../Fig1_S2/tpe_1K_10epochs_optimized_0to20K.hyperopt -------------------------------------------------------------------------------- /Fig3_S3/57epigenomes.RPKM.pc.gz: -------------------------------------------------------------------------------- 1 | ../datasets/57epigenomes.RPKM.pc.gz -------------------------------------------------------------------------------- /Fig3_S3/Boyer_et_al_PCG_repressed.txt: -------------------------------------------------------------------------------- 1 | 1200009O22Rik 2 | 1300014I06Rik 3 | 1810031K17Rik 4 | 2010001J22Rik 5 | 2310045A20Rik 6 | 2410080H04Rik 7 | 2610017I09Rik 8 | 2610024A01Rik 9 | 2810451A06Rik 10 | 2900005J15Rik 11 | 4631426J05Rik 12 | 4930447C04Rik 13 | 4930577M16Rik 14 | 4933407N01Rik 15 | 4933408F15 16 | 4933436C20Rik 17 | 5330439J01Rik 18 | 5730446D14Rik 19 | 5730467H21Rik 20 | 5730557B15Rik 21 | 5730596B20Rik 22 | 6030405A18 23 | 6330514A18Rik 24 | 6332401O19Rik 25 | 6530402A20 26 | 9030611O19Rik 27 | 9030612E09Rik 28 | 9030623N16Rik 29 | 9130213B05Rik 30 | 9430022A14 31 | 9430023B20Rik 32 | 9430076C15Rik 33 | 9530027K23Rik 34 | 9930014A18Rik 35 | A330008L17Rik 36 | A930041G11Rik 37 | AB041550 38 | AI314180 39 | AI851790 40 | AW125753 41 | Adamts1 42 | Adamts5 43 | Adcy8 44 | Adm 45 | Adra1a 46 | Adra1d 47 | Adra2b 48 | Adrb1 49 | Aldh1a2 50 | Alx3 51 | Ar 52 | Arg2 53 | Arhgap20 54 | Asb3 55 | Ascl1 56 | Atf3 57 | Atoh1 58 | Atoh8 59 | Avpr1a 60 | BC014699 61 | BC038286 62 | BC055811 63 | BC061194 64 | Bach2 65 | Barx1 66 | Barx2 67 | Bcan 68 | Bcl11a 69 | Bcl2l11 70 | Bhlhb3 71 | Bhlhb4 72 | Bmp4 73 | Bmp6 74 | Bmp7 75 | Car7 76 | Cart1 77 | Cav1 78 | Cbln2 79 | Cbx4 80 | Cbx8 81 | 
Ccbe1 82 | Ccnd2 83 | Ccr9 84 | Cd14 85 | Cd24a 86 | Cdh11 87 | Cdh13 88 | Cdh2 89 | Cdh22 90 | Cdh4 91 | Cdh8 92 | Cdk5r2 93 | Cdkn2c 94 | Cdx2 95 | Cebpa 96 | Chst2 97 | Chst8 98 | Chx10 99 | Clstn1 100 | Cnih3 101 | Cnr1 102 | Cntfr 103 | Cntnap1 104 | Col12a1 105 | Col19a1 106 | Col27a1 107 | Col2a1 108 | Col4a1 109 | Col4a2 110 | Colec12 111 | Comp 112 | Corin 113 | Cpne7 114 | Crabp1 115 | Creb1 116 | Crhr1 117 | Cxcl12 118 | Cxcr4 119 | Cxxc4 120 | Cyp1b1 121 | Cyp24a1 122 | Cyp26a1 123 | Cyp27b1 124 | D13Bwg1146e 125 | D230002A01Rik 126 | D230039L06Rik 127 | D330050I23Rik 128 | D9Ucla1 129 | Dach1 130 | Dbx1 131 | Dgat2 132 | Dio3 133 | Dkk2 134 | Dll1 135 | Dll4 136 | Dlx1 137 | Dlx2 138 | Dlx3 139 | Dmrt2 140 | Dmrt3 141 | Dmrta1 142 | Dmrta2 143 | Dpf3 144 | E130018O15Rik 145 | E130112H22Rik 146 | E130306M17Rik 147 | E130307J07Rik 148 | E130309B19Rik 149 | E330016A19Rik 150 | Ebf1 151 | Ebf2 152 | Ebf3 153 | Efhd1 154 | Efna5 155 | Efnb2 156 | Egr3 157 | En1 158 | En2 159 | Eomes 160 | Epas1 161 | Epha5 162 | Evx1 163 | Evx2 164 | Fbn1 165 | Fbn2 166 | Fev 167 | Fgf15 168 | Fgf3 169 | Fgf5 170 | Fgf8 171 | Fgf9 172 | Fgfr3 173 | Fli1 174 | Flrt2 175 | Flt1 176 | Flt3 177 | Flt4 178 | Foxb2 179 | Foxc1 180 | Foxc2 181 | Foxd1 182 | Foxd4 183 | Foxe1 184 | Foxf1a 185 | Foxf2 186 | Foxl2 187 | Foxq1 188 | Fras1 189 | Frzb 190 | Fzd1 191 | Fzd2 192 | Gab1 193 | Gabra1 194 | Gad2 195 | Gadd45g 196 | Galgt2 197 | Gata3 198 | Gata4 199 | Gata5 200 | Gata6 201 | Gbx2 202 | Gdf6 203 | Gdf7 204 | Gdnf 205 | Gfra1 206 | Gfra2 207 | Ghsr 208 | Gm644 209 | Gm996 210 | Gnal 211 | Gpr120 212 | Gpr124 213 | Grid1 214 | Grik2 215 | Grin2a 216 | Gsc 217 | Gscl 218 | Gsh1 219 | Gsh2 220 | H2-Q1 221 | H2-Q10 222 | H2-Q7 223 | H2-Q8 224 | Hand1 225 | Hapln4 226 | Helt 227 | Hes5 228 | Hes7 229 | Hey2 230 | Hlx1 231 | Hlxb9 232 | Hmga2 233 | Hmx1 234 | Hoxa1 235 | Hoxa10 236 | Hoxa11 237 | Hoxa2 238 | Hoxa3 239 | Hoxa4 240 | Hoxa5 241 | Hoxa6 242 | Hoxb1 243 | Hoxb13 244 | 
Hoxb2 245 | Hoxb3 246 | Hoxb4 247 | Hoxb6 248 | Hoxb7 249 | Hoxb8 250 | Hoxc10 251 | Hoxc12 252 | Hoxc4 253 | Hoxc5 254 | Hoxc6 255 | Hoxc9 256 | Hoxd10 257 | Hoxd11 258 | Hoxd13 259 | Hoxd9 260 | Hpcal4 261 | Hs3st1 262 | Hs3st3b1 263 | Hsf4 264 | Hspa1a 265 | Hspa1b 266 | Hspa1l 267 | Htr1a 268 | Htr1b 269 | Htr6 270 | Id3 271 | Igf2 272 | Igfbp5 273 | Il15ra 274 | Insm1 275 | Ipf1 276 | Irf2 277 | Irf5 278 | Irs4 279 | Irx1 280 | Irx2 281 | Irx3 282 | Irx5 283 | Isl2 284 | Jun 285 | Kcna1 286 | Kcna6 287 | Kcnc4 288 | Kcnk12 289 | Kirrel3 290 | LOC385769 291 | LOC386518 292 | LOC432662 293 | LOC432907 294 | LOC434573 295 | LOC436493 296 | Lbxcor1 297 | Lef1 298 | Lhx2 299 | Lhx5 300 | Lhx6 301 | Lmbrd1 302 | Lmx1a 303 | Lrat 304 | Lrba 305 | Lrfn5 306 | Lrp8 307 | Lrrn1 308 | Lrrtm1 309 | MGC68323 310 | MGI:1920501 311 | MGI:1930803 312 | MGI:2143217 313 | MGI:2183445 314 | MGI:2669849 315 | MGI:2684334 316 | Mab21l1 317 | Mab21l2 318 | Mafa 319 | Mafb 320 | Mamdc1 321 | Meis1 322 | Mfsd2 323 | Mrg1 324 | Msx1 325 | Msx2 326 | Myc 327 | Nebl 328 | Nef3 329 | Nefl 330 | Neto1 331 | Neurod2 332 | Neurog1 333 | Neurog2 334 | Neurog3 335 | Nfatc1 336 | Nfix 337 | Nhlh2 338 | Nkx2-2 339 | Nkx2-3 340 | Nkx2-4 341 | Nkx2-5 342 | Nkx2-9 343 | Nkx6-1 344 | Nol4 345 | Npas2 346 | Npnt 347 | Nptx1 348 | Nr2e1 349 | Nr2f2 350 | Nrn1 351 | Nrp1 352 | Nrp2 353 | Ntf3 354 | Ntn1 355 | Ntrk3 356 | Ocln 357 | Olig2 358 | Olig3 359 | Onecut1 360 | Onecut2 361 | Onecut3 362 | Osr1 363 | Osr2 364 | Otp 365 | Otx1 366 | Otx3 367 | Ovol1 368 | Paqr9 369 | Pax1 370 | Pax2 371 | Pax3 372 | Pax6 373 | Pax7 374 | Pax8 375 | Pax9 376 | Pcdh1 377 | Pcdh10 378 | Pcdh18 379 | Pcdh7 380 | Pcdh8 381 | Pcdhga10 382 | Pcdhga11 383 | Pcdhga12 384 | Pcdhga3 385 | Pcdhga8 386 | Pcdhga9 387 | Pcdhgb5 388 | Pcdhgb6 389 | Pcdhgb7 390 | Pcdhgc3 391 | Pcdhgc4 392 | Pcdhgc5 393 | Pdgfra 394 | Pfdn4 395 | Pftk1 396 | Phlda2 397 | Phox2b 398 | Pitx1 399 | Pitx3 400 | Pkp1 401 | Plxnc1 402 | Podxl 403 | 
Pou3f2 404 | Pou3f3 405 | Pou3f4 406 | Pou4f2 407 | Pou4f3 408 | Ppm1l 409 | Ppp1r14c 410 | Prdm8 411 | Prkag2 412 | Ptger4 413 | Ptprt 414 | Ptpru 415 | Rab20 416 | Rasgrf1 417 | Rax 418 | Reln 419 | Rem1 420 | Rgl3 421 | Rgs20 422 | Rnf150 423 | Rtn4rl1 424 | Rtn4rl2 425 | Ryr2 426 | Sca1 427 | Scarf2 428 | Sdccag33 429 | Sema5b 430 | Sema6a 431 | Sema6d 432 | Serpine2 433 | Sez6l 434 | Sfrp5 435 | Shc3 436 | Shh 437 | Shox2 438 | Sidt1 439 | Sim1 440 | Six1 441 | Six2 442 | Six3 443 | Six6 444 | Slc16a2 445 | Slc30a3 446 | Slc32a1 447 | Slit2 448 | Slitrk3 449 | Smarca2 450 | Sox18 451 | Sox21 452 | Sox7 453 | Sox9 454 | Spon1 455 | Srd5a2 456 | Sstr1 457 | Sstr4 458 | St8sia3 459 | Stxbp6 460 | Svep1 461 | Tacstd2 462 | Tal1 463 | Tbr1 464 | Tbx15 465 | Tbx18 466 | Tbx2 467 | Tbx20 468 | Tbx4 469 | Tbx5 470 | Tcf21 471 | Tcfap2b 472 | Tcfap2d 473 | Tcfcp2l3 474 | Tdrd6 475 | Thbd 476 | Tiam2 477 | Tlx1 478 | Tlx3 479 | Tmeff2 480 | Tox 481 | Trhde 482 | Trp73 483 | Twist2 484 | Ube2j1 485 | Ucn 486 | Unc5c 487 | Unc5d 488 | Uncx4.1 489 | Vax1 490 | Vax2 491 | Vgll2 492 | Vsx1 493 | Wbscr17 494 | Wdr8 495 | Wnt1 496 | Wnt16 497 | Wnt2b 498 | Wnt3 499 | Wnt5a 500 | Wt1 501 | Zar1 502 | Zc3hav1 503 | Zfhx1b 504 | Zfp312 505 | Zfp339 506 | Zfp467 507 | Zfp503 508 | Zfpm1 509 | Zfpm2 510 | Zic1 511 | Zic4 512 | Znrf4 513 | -------------------------------------------------------------------------------- /Fig3_S3/EnsemblID2GeneName.txt: -------------------------------------------------------------------------------- 1 | ../datasets/EnsemblID2GeneName.txt -------------------------------------------------------------------------------- /Fig3_S3/Fig3ABCDEF_S3ABC.R: -------------------------------------------------------------------------------- 1 | #### MOUSE ###### 2 | 3 | a=read.delim("all_crossvalidated_predictions_mESC.txt") 4 | b=read.delim("all_crossvalidated_predictions_mouse.txt") 5 | colnames(a)[2:3]=c("mESCPred","mESCActual") 6 | a=merge(a,b,by=1) 7 | 
c=read.delim("ensembl2geneName_v90_mm10.txt") 8 | colnames(c)[2]="geneName" 9 | a=merge(a,c,by=1,all.x=T) 10 | a[a=='']="NA" 11 | 12 | summary(lm(mESCActual~mESCPred, a)) 13 | summary(lm(mESCActual~Pred, a)) 14 | 15 | # Table1 Moorthy et al, 2017 16 | a$color='black' 17 | a$color[a$geneName=="Sall1"]='red' 18 | a$color[a$geneName=="Tet1"]='red' 19 | a$color[a$geneName=="Prkcg"]='red' 20 | a$color[a$geneName=="AU018091"]='red' 21 | a$color[a$geneName=="Med13l"]='red' 22 | a$color[a$geneName=="Macf1"]='red' 23 | a$color[a$geneName=="Ranbp17"]='red' 24 | a$color[a$geneName=="Cbfa2t2"]='red' 25 | a$color[a$geneName=="Esrrb"]='red' 26 | a$color[a$geneName=="Dppa5a"]='red' 27 | a$color[a$geneName=="Ooep"]='red' 28 | a$color[a$geneName=="Mcl1"]='red' 29 | a$color[a$geneName=="Etl4"]='red' 30 | 31 | # A few genes from Whyte et al, 2013; Dowen et al, 2014; Hnisz et al 2013 32 | a$color[a$geneName=="Pou5f1"]='red' #same as Oct4 33 | a$color[a$geneName=="Sox2"]='red' 34 | a$color[a$geneName=="Nanog"]='red' 35 | a$color[a$geneName=="Klf4"]='red' 36 | a$color[a$geneName=="Tbx3"]='red' 37 | a$color[a$geneName=="Sall4"]='red' 38 | a$color[a$geneName=="Lefty1"]='red' 39 | a$color[a$geneName=="Lefty2"]='red' 40 | a$color[a$geneName=="Utf1"]='red' 41 | a$color[a$geneName=="Phc1"]='red' 42 | a$color[a$geneName=="Nr5a2"]='red' 43 | a$color[a$geneName=="Lrrc2"]='red' 44 | a$color[a$geneName=="Dppa3"]='red' 45 | a$color[a$geneName=="Prdm14"]='red' 46 | 47 | a$PredAdj=predict(lm(mESCActual~mESCPred, a)) 48 | a$MedianPred=predict(lm(mESCActual~Pred, a)) 49 | a$resid = a$mESCActual-a$PredAdj 50 | 51 | nrow(a) 52 | 53 | pdf("Fig3DEF_S3C.pdf", width=8, height=8) 54 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 55 | 56 | #Fig3D 57 | smoothScatter(a$mESCPred, a$mESCActual, cex.axis=2, cex.lab=2, bty="n", xlab="Predicted Expression Level, mESC model", ylab="mESC expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) #, pch=1, col = a$color 58 | abline(0,1, col="red") 59 | 
text(a[a$color=="red","mESCPred"], a[a$color=="red","mESCActual"], labels = a[a$color=="red","geneName"], offset = 0.5, col="red") 60 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$mESCPred, a$mESCActual)^2,2)), offset = 0.5, col="black") 61 | 62 | #FigS3C 63 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-4,4), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black") 64 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 65 | legend("topleft", bg="white", bty="n", legend = c(paste("non-enhancer-driven genes, n = ", length(a$resid[a$color=='black']), sep=''), 66 | paste("enhancer-driven genes, n = ", length(a$resid[a$color=='red']), sep=''), 67 | paste("P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), text.col = c("black","red", "black")) 68 | 69 | #Fig3E 70 | silenced = read.delim("Boyer_et_al_PCG_repressed.txt",F) #from Boyer et al 71 | silenced = merge(silenced, c, by.x=1, by.y=2) 72 | nrow(silenced) 73 | active = read.delim("Whyte_et_al_superenhancers.txt",F) #from Whyte et al 74 | active = merge(active, c, by.x=1, by.y=2) 75 | nrow(active) 76 | a$color='black' 77 | a$color[a$Gene %in% silenced[,2] & !(a$Gene %in% active[,2])]='blue' 78 | a$color[a$Gene %in% active[,2] & !(a$Gene %in% silenced[,2])]='red' 79 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-2,2), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black", main="mESC") 80 | plot.ecdf(a$resid[a$color=='blue'], verticals= TRUE, do.points = FALSE, col="blue", add=T) 81 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 82 | legend("topleft", bg="white", bty="n", legend = c(paste("Other genes, n = ", length(a$resid[a$color=='black']), sep=''), 83 | paste("PCG-silenced genes, n = ", length(a$resid[a$color=='blue']), sep=''), 84 | paste("Super-enhancer-associated genes, n = ", length(a$resid[a$color=='red']), sep=''), 85 | 
paste("PCG-silenced vs Black P value: ", formatC(ks.test(a$resid[a$color=='blue'],a$resid[a$color=='black'],alternative="greater")$p.value, digits = 2, format = 'g'), sep=''), 86 | paste("Super-enhancer vs Black P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), 87 | text.col = c("black","blue","red","black")) 88 | 89 | #Fig3F 90 | halflife = read.delim("Herzog_mESC_half_life.txt") #from Herzog et al 91 | halflife = halflife[,c(4,7)] 92 | colnames(halflife)[2]='half_life' 93 | halflife$half_life=log2(halflife$half_life) 94 | halflife = merge(halflife, c, by.x=1, by.y=2) 95 | a1=merge(halflife, a, by.x=3, by.y=1) 96 | "mESC half lives measured for this many genes:" 97 | nrow(a1) 98 | a1$quintile <- cut(a1$half_life, breaks=quantile(a1$half_life, probs=seq(0,1, by=0.2), na.rm=TRUE), include.lowest=TRUE) 99 | boxplot(a1$resid~a1$quintile,outline=F, cex=1.5, cex.axis=2, cex.lab=2, cex.main=2, las=2, notch=T, col="red") 100 | cor.test(a1$half_life, a1$resid) 101 | cor.test(a1$half_life, a1$resid, method='spearman') 102 | 103 | mouse = a 104 | 105 | a=read.delim("all_crossvalidated_predictions_mESC.txt",stringsAsFactors=F) 106 | colnames(a)[2:3]=c("mESCPred","mESCActual") 107 | b=read.delim("Ouyang_mESC_RPKM_ensemblID.txt",F) 108 | b$V2=log10(b$V2+0.1) 109 | a=merge(a,b,by=1,all.x=T) 110 | a$mESCActual=a$V2 111 | a$V2=NULL 112 | b=read.delim("all_crossvalidated_predictions_mouse.txt",stringsAsFactors=F) 113 | colnames(b)[2:3]=c("Pred","Actual") 114 | c=read.delim("mouse.median_expr.txt",F) 115 | c$V2=log10(c$V2+0.1) 116 | b=merge(b,c,by=1) 117 | b$Actual=b$V2 118 | b$V2=NULL 119 | a=merge(a,b,by=1,all=T) 120 | c=read.delim("ensembl2geneName_v90_mm10.txt",F,stringsAsFactors=F) 121 | c=c[,c(1,2,4)] 122 | colnames(c)[2:3]=c("geneName","Description") 123 | a=merge(a,c,by=1,all.x=T) 124 | a[a=='']=NA 125 | writefile(cbind(a$Gene, a$geneName, a$Description, round(a$Pred,3), 
round(a$Actual,3), round(a$mESCPred,3), round(a$mESCActual,3)), "TableS1_mouse.txt", col.names=T) 126 | 127 | ### HUMAN ##### 128 | 129 | a=read.delim(gzfile("57epigenomes.RPKM.pc.gz")) 130 | a$E000=NULL 131 | a[,2:ncol(a)]=log10(a[,2:ncol(a)]+0.1) 132 | a$medianExpr=apply(a[,2:ncol(a)], 1, median) 133 | 134 | nrow(a) 135 | b=read.delim("all_crossvalidated_predictions.txt") 136 | a=merge(a,b,by=1) 137 | b=read.delim("all_crossvalidated_predictions_K562.txt") 138 | colnames(b)[2:3]=c("K562Pred","K562Actual") 139 | a=merge(a,b,by=1) 140 | b=read.delim("all_crossvalidated_predictions_GM12878.txt") 141 | colnames(b)[2:3]=c("GM12878Pred","GM12878Actual") 142 | a=merge(a,b,by=1) 143 | nrow(a) 144 | 145 | c=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal",F) #from van Aresbergen et al 146 | d=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal",F) 147 | c=rbind(c,d) 148 | c$V6=log10(c$V6+0.1) 149 | c=c[,c(1,6)] 150 | colnames(c)[2]='SuRE' 151 | a=merge(a,c,by=1,all.x=T) 152 | 153 | c=read.delim("EnsemblID2GeneName.txt",F) 154 | colnames(c)[2:3]=c("geneName","Description") 155 | a=merge(a,c,by=1) 156 | a[a=='']="NA" 157 | 158 | pdf("Fig3ABC_S3ABC.pdf", width=8, height=8) 159 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 160 | 161 | a$resid = a$GM12878Actual-a$GM12878Pred 162 | 163 | #FigS3B 164 | silenced = read.delim("diHMM/GM12878/H3K27me3_silenced.txt",F) #from Marco et al 165 | nrow(silenced) 166 | active = read.delim("diHMM/GM12878/superenhancer.txt",F) 167 | nrow(active) 168 | a$color='black' 169 | a$color[a$gene_id %in% silenced[,1] & !(a$gene_id %in% active[,1])]='blue' 170 | a$color[a$gene_id %in% active[,1] & !(a$gene_id %in% silenced[,1])]='red' 171 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-2,2), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black", main="GM12878") 172 | plot.ecdf(a$resid[a$color=='blue'], verticals= TRUE, do.points = FALSE, col="blue", add=T) 173 | 
plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 174 | legend("topleft", bg="white", bty="n", legend = c(paste("Other genes, n = ", length(a$resid[a$color=='black']), sep=''), 175 | paste("Silenced genes, n = ", length(a$resid[a$color=='blue']), sep=''), 176 | paste("Stretch-enhancer-associated genes, n = ", length(a$resid[a$color=='red']), sep=''), 177 | paste("Silenced vs Black P value: ", formatC(ks.test(a$resid[a$color=='blue'],a$resid[a$color=='black'],alternative="greater")$p.value, digits = 2, format = 'g'), sep=''), 178 | paste("Enhancer vs Black P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), 179 | text.col = c("black","blue","red","black")) 180 | 181 | #Fig3B 182 | a$resid = a$K562Actual-a$K562Pred 183 | silenced = read.delim("diHMM/K562/H3K27me3_silenced.txt",F) #from Marco et al 184 | nrow(silenced) 185 | active = read.delim("diHMM/K562/superenhancer.txt",F) 186 | nrow(active) 187 | a$color='black' 188 | a$color[a$gene_id %in% silenced[,1] & !(a$gene_id %in% active[,1])]='blue' 189 | a$color[a$gene_id %in% active[,1] & !(a$gene_id %in% silenced[,1])]='red' 190 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-2,2), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black", main="K562") 191 | plot.ecdf(a$resid[a$color=='blue'], verticals= TRUE, do.points = FALSE, col="blue", add=T) 192 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 193 | legend("topleft", bg="white", bty="n", legend = c(paste("Other genes, n = ", length(a$resid[a$color=='black']), sep=''), 194 | paste("silenced genes, n = ", length(a$resid[a$color=='blue']), sep=''), 195 | paste("stretch-enhancer-associated genes, n = ", length(a$resid[a$color=='red']), sep=''), 196 | paste("Silenced vs Black P value: ", formatC(ks.test(a$resid[a$color=='blue'],a$resid[a$color=='black'],alternative="greater")$p.value, digits = 
2, format = 'g'), sep=''), 197 | paste("Enhancer vs Black P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), sep='')), 198 | text.col = c("black","blue","red","black")) 199 | 200 | #Fig3C 201 | halflife = read.delim("Schofield_K562_half_lives.txt") #from Schofield et al 202 | halflife = halflife[,c(1,6)] 203 | colnames(halflife)[2]='half_life' 204 | halflife$half_life=log2(halflife$half_life) 205 | halflife = merge(halflife, c, by.x=1, by.y=2) 206 | a1=merge(halflife, a, by.x=3, by.y=1) 207 | "K562 half lives measured for this many genes:" 208 | nrow(a1) 209 | a1$quintile <- cut(a1$half_life, breaks=quantile(a1$half_life, probs=seq(0,1, by=0.2), na.rm=TRUE), include.lowest=TRUE) 210 | boxplot(a1$resid~a1$quintile,outline=F, cex=1.5, cex.axis=2, cex.lab=2, cex.main=2, las=2, notch=T, col="red") 211 | cor.test(a1$half_life, a1$resid) 212 | cor.test(a1$half_life, a1$resid, method='spearman') 213 | 214 | a$color="black" 215 | #many groups 216 | a$color[grep("hemoglobin subunit", a$Description)]='red' 217 | #Xie et al 2017 218 | a$color[a$geneName=="PIM1"]='red' 219 | a$color[a$geneName=="SMYD3"]='red' 220 | a$color[a$geneName=="FADS1"]='red' 221 | a$color[a$geneName=="PRKAR2B"]='red' 222 | #Fulco et al 2016 223 | a$color[a$geneName=="GATA1"]='red' 224 | a$color[a$geneName=="MYC"]='red' 225 | 226 | #FigS3A 227 | plot.ecdf(a$resid[a$color=='black'], xlim=c(-4,4), ylim=c(0,1), verticals= TRUE, do.points = FALSE, col="black") 228 | plot.ecdf(a$resid[a$color=='red'], verticals= TRUE, do.points = FALSE, col="red", add=T) 229 | legend("topleft", bg="white", bty="n", legend = c(paste("non-enhancer-driven genes, n = ", length(a$resid[a$color=='black']), sep=''), 230 | paste("enhancer-driven genes, n = ", length(a$resid[a$color=='red']), sep=''), 231 | paste("P value: ", formatC(ks.test(a$resid[a$color=='red'],a$resid[a$color=='black'],alternative="less")$p.value, digits = 2, format = 'g'), 
sep='')), text.col = c("black","red", "black")) 232 | 233 | a$K562PredAdj=predict(lm(E123~K562Pred, a)) 234 | 235 | #Fig3A 236 | smoothScatter(a$K562PredAdj, a$E123, cex.axis=2, cex.lab=2, bty="n", xlab="Predicted K562 expression level", ylab="K562 expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) 237 | abline(0,1, col="red") 238 | text(a[a$color=="red","K562PredAdj"], a[a$color=="red","E123"], labels = a[a$color=="red","geneName"], offset = 0.5, col="red") 239 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$K562Pred, a$E123)^2,2)), offset = 0.5, col="black") 240 | 241 | writefile(cbind(a$gene_id, a$geneName, a$Description, round(a$Pred,3), round(a$K562Pred,3), round(a$GM12878Pred,3), round(a$SuRE,3), round(a[,2:58],3)), "TableS1_human.txt") -------------------------------------------------------------------------------- /Fig3_S3/Fig3ABC_S3ABC.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig3_S3/Fig3ABC_S3ABC.pdf -------------------------------------------------------------------------------- /Fig3_S3/Fig3DEF_S3C.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig3_S3/Fig3DEF_S3C.pdf -------------------------------------------------------------------------------- /Fig3_S3/Fig3GH.R: -------------------------------------------------------------------------------- 1 | library(ROCR) 2 | 3 | a=read.delim("all_crossvalidated_predictions_mESC.txt") 4 | colnames(a)[2:3]=c("mESCPred","mESCActual") 5 | c=read.delim("ensembl2geneName_v90_mm10.txt") 6 | colnames(c)[2]="geneName" 7 | a=merge(a,c,by=1,all.x=T) 8 | 9 | a$PredAdj=predict(lm(mESCActual~mESCPred, a)) 10 | a$resid = a$mESCActual-a$PredAdj 11 | nrow(a) 12 | a = a[a$mESCActual > min(a$mESCActual)+1,] 13 | nrow(a) 14 | miR = 
fastread("zcat Summary_Counts.default_predictions.txt.gz") 15 | nrow(a) 16 | 17 | values = sapply(unique(miR$"miRNA family"), function(fam){ 18 | miR2 = miR[miR$"Species ID"==10090 & miR$"miRNA family"==fam,c(2,3,16)] 19 | merged = merge(miR2, a, by.x=1,by.y=4, all.y=T) 20 | merged[is.na(merged)]=0 21 | 22 | if (sum(merged[,"Cumulative weighted context++ score"] != 0) > 10){ 23 | c(0,cor(merged$resid, as.numeric(merged[,"Cumulative weighted context++ score"]), method='spearman')) 24 | } 25 | else{ 26 | c(0,0) 27 | } 28 | }) 29 | values=t(values) 30 | 31 | pdf("Fig3GH.pdf", width=8, height=8) 32 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 33 | miRs=read.delim("mouseESC_GSE76288_miRNA_counts_Denzler.txt") 34 | miRs=miRs[,c("miRNA_seed","Embryonic_stem_cells._Average_RPM")] 35 | miRs$miRNA_seed=gsub("T","U",miRs$miRNA_seed) 36 | miRs=aggregate(miRs$Embryonic_stem_cells._Average_RPM,by=list(miRs$miRNA_seed), sum) 37 | miRs=miRs[order(miRs$x, decreasing=T),] 38 | miRs2=rbind(miRs[1:10,], c("Other", sum(miRs[11:nrow(miRs),"x"]))) 39 | pie(as.numeric(miRs2[,2]), labels = miRs2[,1],col=c('orange','blue','red','purple','darkolivegreen1','magenta','brown', 'cyan', 'yellow','grey','black'), main=paste("Top 10 miRNA families in mESCs"), clockwise = T, cex.main = 2, cex=1.8) 40 | 41 | miRs=merge(values, miRs, by.x=0, by.y=1, all.x=T) 42 | miRs$color="black" 43 | miRs$color[miRs$Row.names %in% miRs2[,1]]="red" 44 | colnames(miRs)[3]="spearman" 45 | p1 <- hist(miRs$spearman[miRs$color!="red"], 50, plot=F) 46 | p2 <- hist(miRs$spearman[miRs$color=="red"], 50, plot=F) 47 | plot( p1, col=rgb(0,0,1,1/2), xlim=c(-0.02,0.06)) 48 | plot( p2, col=rgb(1,0,0,1/2), xlim=c(-0.02,0.06), add=T) 49 | dev.off() -------------------------------------------------------------------------------- /Fig3_S3/Fig3GH.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig3_S3/Fig3GH.pdf -------------------------------------------------------------------------------- /Fig3_S3/Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz -------------------------------------------------------------------------------- /Fig3_S3/Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Ouyang_mESC_RPKM_ensemblID.txt: -------------------------------------------------------------------------------- 1 | ../datasets/Ouyang_mESC_RPKM_ensemblID.txt -------------------------------------------------------------------------------- /Fig3_S3/Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz -------------------------------------------------------------------------------- 
/Fig3_S3/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/Summary_Counts.default_predictions.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Summary_Counts.default_predictions.txt.gz -------------------------------------------------------------------------------- /Fig3_S3/TableS1_human.txt: -------------------------------------------------------------------------------- 1 | ../datasets/TableS1_human.txt -------------------------------------------------------------------------------- /Fig3_S3/TableS1_mouse.txt: -------------------------------------------------------------------------------- 1 | ../datasets/TableS1_mouse.txt -------------------------------------------------------------------------------- /Fig3_S3/Whyte_et_al_superenhancers.txt: -------------------------------------------------------------------------------- 1 | Usp7 2 | Mreg 3 | Btbd11 4 | Zbtb45 5 | Brd1 6 | Tmem2 7 | Tnfsfm13 8 | Fam134b 9 | Zfhx2 10 | Tet2 11 | Trim71 12 | Smad7 13 | Rbpj 14 | Gli2 15 | Ankrd35 16 | Kif13b 17 | Prdm14 18 | Ccnd3 19 | Tns3 20 | Zbtb34 21 | Rbfox2 22 | Gm9104 23 | H2-M5 24 | Ppp2r5c 25 | Zfp710 26 | Upp1 27 | Qk 28 | Pum1 29 | Prrc2b 30 | Nr5a2 31 | Zfp281 32 | Sgk1 33 | Sgk1 34 | Tgif1 35 | Tgif1 36 | Zfp638 37 | Tead1 38 | Rara 39 | Etl4 40 | Fnbp1 41 | Ahi1 42 | Gm6724 43 | Alpl 44 | Cd9 45 | Socs3 46 | Gadd45a 47 | Phc1 48 | Enc1 49 | Smarcad1 50 | F2rl1 51 | Ftl2-ps 52 | Gpx4 53 | Hsd17b3 54 | Igfbp2 55 | Inhbb 56 | Klf2 57 | Klf3 58 | Lamc2 59 | Ldhb 60 | Mcl1 61 | Mybl2 62 | Mycn 63 | Pim1 64 | Pipox 65 | Pura 66 | Slc6a6 67 | Tcf15 68 | Tsc22d1 69 | Utf1 70 | Zfp42 71 | Macf1 72 | Agtrap 73 | Klf5 74 | Capns1 75 | Cbfa2t2 76 | Cldn4 77 | Col18a1 78 | Ctbp2 79 | Lefty1 
80 | Enah 81 | Epha2 82 | Fgf4 83 | Gbx2 84 | Id1 85 | Ier2 86 | Klf4 87 | Klf9 88 | Sik1 89 | Mapt 90 | Nfkbia 91 | Uri1 92 | Ski 93 | Slc2a1 94 | Slc2a3 95 | Sox2 96 | Tbx3 97 | Ubtf 98 | Vdac1 99 | Spry2 100 | Spry4 101 | Esrrb 102 | Ppp2r5c 103 | Eif4a2 104 | Dmtn 105 | Pou5f1 106 | Dusp1 107 | Sema4b 108 | Dlc1 109 | Hs6st1 110 | Dmrt1 111 | Kat6b 112 | Mkrn1 113 | Abhd2 114 | Tmem131 115 | Rbpms 116 | Kras 117 | Tnip1 118 | Klf13 119 | Sall1 120 | Ppp1r1a 121 | Tdh 122 | Gpa33 123 | Ndfip1 124 | Ranbp17 125 | Mesdc2 126 | Tfcp2l1 127 | Jam2 128 | Spaca7 129 | Derl3 130 | Dppa5a 131 | Hsd17b14 132 | Fbxo36 133 | Ssr2 134 | Camk2n1 135 | Hmg20a 136 | Rpl14 137 | Gtf3c6 138 | Kctd16 139 | Bcas2 140 | Stoml1 141 | Glod5 142 | Polr3gl 143 | Manba 144 | Tet1 145 | Rpap3 146 | Nanog 147 | Sulf2 148 | Cenpv 149 | Lrrc2 150 | C2cd5 151 | Ddit4 152 | 1700012A03Rik 153 | Hspb8 154 | Uck2 155 | Msi2 156 | Elovl6 157 | Usp48 158 | Zfp704 159 | Opa1 160 | Ube2s 161 | Dst 162 | Gpr37l1 163 | Dppa3 164 | Mllt6 165 | Kazn 166 | Otx2 167 | Mbip 168 | Pitpnc1 169 | Irf2bpl 170 | Olfr90 171 | Ift52 172 | Med13l 173 | Cobl 174 | Itpk1 175 | Kank4 176 | Mtcl1 177 | Idh2 178 | Gpt2 179 | Rhof 180 | Trak1 181 | Nav2 182 | Chchd10 183 | 6430573F11Rik 184 | Lefty2 185 | Chd9 186 | Tmem220 187 | Amigo2 188 | Fam53a 189 | Reep3 190 | Pirt 191 | Dlgap3 192 | Ctif 193 | Platr26 194 | Sall4 195 | -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions.txt -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_GM12878.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_GM12878.txt 
-------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_K562.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_K562.txt -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_mESC.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_mESC.txt -------------------------------------------------------------------------------- /Fig3_S3/all_crossvalidated_predictions_mouse.txt: -------------------------------------------------------------------------------- 1 | ../datasets/all_crossvalidated_predictions_mouse.txt -------------------------------------------------------------------------------- /Fig3_S3/cross_valid: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_GM12878: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_GM12878 -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_K562: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_K562 -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_mESC: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_mESC -------------------------------------------------------------------------------- /Fig3_S3/cross_valid_mouse: -------------------------------------------------------------------------------- 1 | ../datasets/cross_valid_mouse -------------------------------------------------------------------------------- 
/Fig3_S3/diHMM: -------------------------------------------------------------------------------- 1 | ../datasets/diHMM/ -------------------------------------------------------------------------------- /Fig3_S3/ensembl2geneName_v90_mm10.txt: -------------------------------------------------------------------------------- 1 | ../datasets/ensembl2geneName_v90_mm10.txt -------------------------------------------------------------------------------- /Fig3_S3/integrate_cv_results.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(latticeExtra) 3 | 4 | cv_folder = args[1] 5 | predfolder = args[2] 6 | outfile = args[3] 7 | 8 | files = list.files(path=cv_folder, pattern='.txt', full.names=T) 9 | 10 | table <- do.call("rbind", lapply(files, FUN=function(file){ 11 | cmd = paste("tail -2", file, "| perl -ne \'@a=split /= /; print $a[1];\'") 12 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 13 | tmp$fold = as.numeric(strsplit(basename(file), "_")[[1]][1]) 14 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 15 | names(tmp) = c("r2","MSE","fold","trial") 16 | tmp 17 | })) 18 | 19 | table=as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 20 | head(table) 21 | 22 | do.call("rbind", lapply(unique(table$fold), function(x) { tmp=table[table$fold==x,]; tmp[which( tmp$MSE == min(tmp$MSE) ),] } ) ) 23 | table = do.call("rbind",lapply(unique(table$fold), function(x) { tmp=table[table$fold==x,]; tmp[which( tmp$MSE==min(tmp$MSE) ),c("fold","trial")] } )) 24 | 25 | if (nrow(table) == 10){ 26 | files = apply(table, 1, function(x) { paste(predfolder,x[2],x[1],"predictions.txt",sep='') } ) 27 | say(files) 28 | table = do.call("rbind", lapply(files, function(x) { read.delim(x) } ) ) 29 | write.table(table,file=outfile, quote=F, row.names=F, sep='\t') 30 | } 31 | #otherwise cant do, select which trial to use from table due to tie 32 | 
-------------------------------------------------------------------------------- /Fig3_S3/mouse.median_expr.txt: -------------------------------------------------------------------------------- 1 | ../datasets/mouse.median_expr.txt -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_1KTest_GM12878expr_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_GM12878expr_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_1KTest_K562expr_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_K562expr_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_1KTest_mESCexpr_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_1KTest_mESCexpr_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_Mouse_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_Mouse_cv/ -------------------------------------------------------------------------------- /Fig3_S3/pM10Kb_cv: -------------------------------------------------------------------------------- 1 | ../datasets/pM10Kb_cv/ -------------------------------------------------------------------------------- /Fig3_S3/runme.sh: -------------------------------------------------------------------------------- 1 | # precomputed h5 files for human (pM10Kb_cv) and mouse (pM10Kb_Mouse_cv) are provided only to save space 2 | # but all can be generated as below: 3 | python setup_training_files.py --cv Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_cv 4 | python setup_training_files.py --cv Roadmap_FantomAnnotations.InputData.pM10Kb.K562expr.txt.gz pM10Kb_1KTest_K562expr_cv 5 | python setup_training_files.py --cv 
Roadmap_FantomAnnotations.InputData.pM10Kb.GM12878expr.txt.gz pM10Kb_1KTest_GM12878expr_cv 6 | python setup_training_files.py --cv Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz pM10Kb_Mouse_cv 7 | python setup_training_files.py --cv Mouse_FantomAnnotations.InputData.pM10Kb.mESC.txt.gz pM10Kb_1KTest_mESCexpr_cv 8 | 9 | # RUN ON GPU USING FOLDERS ABOVE, TAKES MANY HOURS TO RUN ON GPU 10 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_cv/ >pM10Kb_cv/fold$y\_trial$x.txt; } done } done & 11 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_K562expr_cv/ >pM10Kb_1KTest_K562expr_cv/fold$y\_trial$x.txt; } done } done & 12 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_GM12878expr_cv/ >pM10Kb_1KTest_GM12878expr_cv/fold$y\_trial$x.txt; } done } done & 13 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_Mouse_cv/ >pM10Kb_Mouse_cv/fold$y\_trial$x.txt; } done } done & 14 | for y in {1..10}; do { for x in {0..9}; do { python Xpresso.py --fold $y --trial $x test tpe_1K_10epochs_optimized_0to20K.hyperopt pM10Kb_1KTest_mESCexpr_cv/ >pM10Kb_1KTest_mESCexpr_cv/fold$y\_trial$x.txt; } done } done & 15 | 16 | #MERGED RESULTS INTO ALL-CROSSVALIDATED PREDICTIONS 17 | Rscript integrate_cv_results.R cross_valid pM10Kb_cv all_crossvalidated_predictions.txt 18 | Rscript integrate_cv_results.R cross_valid_K562 pM10Kb_1KTest_K562expr_cv all_crossvalidated_predictions_K562.txt 19 | Rscript integrate_cv_results.R cross_valid_GM12878 pM10Kb_1KTest_GM12878expr_cv all_crossvalidated_predictions_GM12878.txt 20 | Rscript integrate_cv_results.R cross_valid_mouse pM10Kb_Mouse_cv all_crossvalidated_predictions_mouse.txt 21 | 
Rscript integrate_cv_results.R cross_valid_mESC pM10Kb_1KTest_mESCexpr_cv all_crossvalidated_predictions_mESC.txt 22 | 23 | mkdir diHMM 24 | cd diHMM 25 | wget http://bcb.dfci.harvard.edu/~gcyuan/data/diHMM/diHMM_Annotations.zip 26 | unzip diHMM_Annotations.zip 27 | 28 | cd K562 29 | bedtools intersect -wo -a K562_nD30_nB30_domainLevelStatesColor.bed -b ../../Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz >K562_overlapping_genes.bed 30 | grep -P 'D7|D8|D9|D23' K562_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >H3K27me3_silenced.txt 31 | grep -P 'D10|D11|D12|D13' K562_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >superenhancer.txt 32 | cd ../GM12878/ 33 | bedtools intersect -wo -a GM12878_nD30_nB30_domainLevelStatesColor.bed -b ../../Homo_sapiens.hg19.90.chosenTranscript.geneBoundaries.gtf.gz >GM12878_overlapping_genes.bed 34 | grep -P 'D7|D8|D9|D23' GM12878_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >H3K27me3_silenced.txt 35 | grep -P 'D10|D11|D12|D13' GM12878_overlapping_genes.bed | cut -f 18 | cut -b 1-15 | sort | uniq >superenhancer.txt 36 | cd ../.. 
37 | 38 | Rscript Fig3ABCDEF_S3ABC.R 39 | 40 | wget http://www.targetscan.org/mmu_71/mmu_71_data_download/Summary_Counts.default_predictions.txt.zip 41 | unzip Summary_Counts.default_predictions.txt.zip 42 | gzip Summary_Counts.default_predictions.txt.gz 43 | Rscript Fig3GH.R 44 | -------------------------------------------------------------------------------- /Fig3_S3/setup_training_files.py: -------------------------------------------------------------------------------- 1 | ../Fig2/setup_training_files.py -------------------------------------------------------------------------------- /Fig4_S4/57epigenomes.RPKM.pc.gz: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/57epigenomes.RPKM.pc.gz -------------------------------------------------------------------------------- /Fig4_S4/Fig4ABCD.R: -------------------------------------------------------------------------------- 1 | library(ROCR) 2 | 3 | #### MOUSE ###### 4 | 5 | a=read.delim("all_crossvalidated_predictions_mESC.txt") 6 | b=read.delim("all_crossvalidated_predictions_mouse.txt") 7 | colnames(a)[2:3]=c("mESCPred","mESCActual") 8 | a=merge(a,b,by=1) 9 | a[a=='']="NA" 10 | mouse = a 11 | 12 | ### HUMAN ##### 13 | 14 | a=read.delim(gzfile("57epigenomes.RPKM.pc.gz")) 15 | a$E000=NULL 16 | a[,2:ncol(a)]=log10(a[,2:ncol(a)]+0.1) 17 | 18 | nrow(a) 19 | b=read.delim("all_crossvalidated_predictions.txt") 20 | a=merge(a,b,by=1) 21 | b=read.delim("all_crossvalidated_predictions_K562.txt") 22 | colnames(b)[2:3]=c("K562Pred","K562Actual") 23 | a=merge(a,b,by=1) 24 | b=read.delim("all_crossvalidated_predictions_GM12878.txt") 25 | colnames(b)[2:3]=c("GM12878Pred","GM12878Actual") 26 | a=merge(a,b,by=1) 27 | nrow(a) 28 | 29 | c=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal",F) #from van Aresbergen et al 30 | d=read.delim("GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal",F) 31 | c=rbind(c,d) 32 | 
c$V6=log10(c$V6+0.1) 33 | c=c[,c(1,6)] 34 | colnames(c)[2]='SuRE' 35 | a=merge(a,c,by=1,all.x=T) 36 | 37 | pdf("Fig4ABCD.pdf", width=8, height=8) 38 | par(mar=c(7,7,5,5), mgp = c(5, 1.5, 0)) 39 | 40 | a$SuREAdj=predict(lm(E123~SuRE, a)) 41 | a$K562PredAdj=predict(lm(E123~K562Pred, a)) 42 | a$PromoterActivity=predict(lm(E123~SuRE+K562Pred, a)) 43 | 44 | #Fig4A 45 | (cors = data.frame( K562=c(cor(a$K562Actual,a$Pred)^2, cor(a$K562Actual,a$K562Pred)^2), 46 | GM12878=c(cor(a$GM12878Actual,a$Pred)^2, cor(a$GM12878Actual,a$K562Pred)^2), 47 | mESC=c(cor(mouse$mESCActual,mouse$Pred)^2, cor(mouse$mESCActual,mouse$mESCPred)^2) )) 48 | barplot(as.matrix(cors), las=2, beside=TRUE, col=c("red","blue"), border=F, ylim=c(0, 0.6), ylab="r^2 to gene expression level" ) 49 | 50 | #Fig4D 51 | smoothScatter(a$SuREAdj, a$E123, cex.axis=2, cex.lab=2, bty="n", xlab="SuRE activity", ylab="K562 expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) 52 | abline(0,1, col="red") 53 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$E123, a$SuRE)^2,2)), offset = 0.5, col="black") 54 | 55 | smoothScatter(a$PromoterActivity, a$E123, cex.axis=2, cex.lab=2, bty="n", xlab="Predicted expression level, joint model", ylab="K562 expression level (log10)", xlim=c(-1.5, 2), ylim=c(-1, 4), las=1, cex=.5) 56 | abline(0,1, col="red") 57 | text(1.5, 4, labels = paste("r^2 =", round(cor(a$PromoterActivity, a$E123)^2,2)), offset = 0.5, col="black") 58 | 59 | #Fig4BC 60 | a$deltaActual = a$K562Actual-a$GM12878Actual 61 | a$deltaPred = a$K562Pred-a$GM12878Pred 62 | nrow(a) 63 | smoothScatter(a$GM12878Actual, a$K562Actual, cex.axis=2, cex.lab=2, bty="n", xlab="GM12878 expression level (log10)", ylab="K562 expression level (log10)", xlim=c(-1, 4), ylim=c(-1, 4), las=1, cex=.5) 64 | abline(1,1, col="red") 65 | abline(-1,1, col="red") 66 | text(0, 4, labels = paste("Upregulated in K562:", nrow(a[a$deltaActual > 1,])), offset = 0.5, col="black") 67 | text(3, -1, labels = paste("Upregulated in 
GM12878:", nrow(a[a$deltaActual < -1,])), offset = 0.5, col="black") 68 | 69 | b=a[abs(a$deltaActual) > 1,] 70 | b$deltaActual = ifelse(b$deltaActual > 0, 1, 0) 71 | plot(performance( prediction( b$deltaPred, b$deltaActual), "tpr", "fpr"), col="blue", las=1, cex.axis=2, cex.lab=2) 72 | text(0.2, 1, labels = paste("AUC = ", round(performance( prediction(b$deltaPred, b$deltaActual), "auc")@y.values[[1]],2), ' (n = ', nrow(b), ')', sep=''), offset = 1.5, col="black") 73 | abline(0,1,col="grey") 74 | dev.off() -------------------------------------------------------------------------------- /Fig4_S4/Fig4ABCD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/Fig4ABCD.pdf -------------------------------------------------------------------------------- /Fig4_S4/Fig4E.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/Fig4E.pdf -------------------------------------------------------------------------------- /Fig4_S4/Fig4E_S4.R: -------------------------------------------------------------------------------- 1 | options(warn=-1) 2 | 3 | a=read.delim("model_comparison.txt", sep=' ') 4 | a 5 | 6 | pdf("FigS4A.pdf", height=8, width=10) 7 | par(oma=c(1,20,1,1)) 8 | barplot(rbind(a$test_r_squared,a$test_r_squared_withHL),beside=T,horiz=T, 9 | names.arg=a$model,las=1,col=c("red","blue"), border=F, xlim=c(0,0.8)) 10 | legend("bottomright", bg="white", bty="n", legend = c("with half life", "without half life"), text.col = c("blue","red")) 11 | dev.off() 12 | 13 | a=read.delim("model_comparison_Fig3.txt", sep=' ') 14 | a 15 | pdf("Fig4E.pdf", height=8, width=10) 16 | par(oma=c(1,20,1,1)) 17 | barplot(a$r_squared,horiz=T, names.arg=a$model,las=1,col=c(rep("red",11),rep("blue",22)), border=F, xlim=c(0,0.8)) 18 | 
legend("bottomright", bg="white", bty="n", legend = c("mouse", "human"), text.col = c("red","blue")) 19 | dev.off() 20 | -------------------------------------------------------------------------------- /Fig4_S4/FigS4A.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4A.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4B.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(Biostrings) 3 | library(rhdf5) 4 | library(reshape2) 5 | source("coefplot.r") 6 | 7 | h5dir = args[1] 8 | testIDs = h5read(paste(h5dir, "test.h5", sep='/'),"geneName") 9 | trainIDs = h5read(paste(h5dir, "train.h5", sep='/'),"geneName") 10 | valIDs = h5read(paste(h5dir, "valid.h5", sep='/'),"geneName") 11 | file1 = "Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz" 12 | file2 = "promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz" 13 | kmerlen = 1 14 | 15 | if (grepl("Mouse",h5dir)) { 16 | file1 = "Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz" 17 | file2 = "promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz" 18 | } 19 | 20 | inp.tbl <- fread(paste("zcat", file1),header=T,data.table=F,sep="\t") 21 | rownames(inp.tbl) = inp.tbl[,1] 22 | inp.tbl[,1] = NULL 23 | 24 | inp.tbl$PROMOTER = substring(inp.tbl$PROMOTER,8500,11500) 25 | 26 | inp.tbl=cbind(inp.tbl, do.call(rbind, lapply(inp.tbl$PROMOTER, function(x){ 27 | y = oligonucleotideFrequency(DNAStringSet(x), kmerlen) 28 | y/sum(y) 29 | }))) 30 | inp.tbl$T=NULL #remove TT dinucleotide to ensure matrix is full rank 31 | 32 | inp.tbl$PROMOTER = NULL 33 | inp.tbl[,c(1:5, 9)] = log10(inp.tbl[,c(1:5, 9)]+0.1) 34 | inp.tbl=as.data.frame(scale(inp.tbl)) 35 | 36 | # save(inp.tbl, file="5merInputTable.RData") 37 | # # load("TriInputTable.RData") 38 | 39 | motif_hits <- fread(paste("zcat", 
file2),header=T,data.table=F,sep="\t") 40 | motif_hits <- dcast(motif_hits, motif_hits[,2] ~ motif_hits[,1], function(x) 1, fill = 0) 41 | inp.tbl=merge(inp.tbl, motif_hits, by.x=0, by.y=1, all.x=T) 42 | 43 | sum(is.na(inp.tbl)) 44 | inp.tbl[is.na(inp.tbl)] = 0 45 | rownames(inp.tbl) = inp.tbl[,1] 46 | inp.tbl[,1] = NULL 47 | 48 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs, ] 49 | valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 50 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 51 | 52 | mod1 = lm(EXPRESSION ~ ., data=train) 53 | mod2 = lm(EXPRESSION ~ UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY, data=train) 54 | mod3 = lm(EXPRESSION ~ .-(UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY), data=train) 55 | 56 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 57 | cor(test$EXPRESSION, predict(mod2, newdata = test))^2 58 | cor(test$EXPRESSION, predict(mod3, newdata = test))^2 59 | 60 | summary(mod2) 61 | pdf("FigS4B_human.pdf") #change to mouse if dir is mouse 62 | coefplot(mod2, parm = -1) 63 | -------------------------------------------------------------------------------- /Fig4_S4/FigS4B.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4B.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4B_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4B_2.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4C.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(Biostrings) 3 | library(rhdf5) 4 | library(reshape2) 5 | library(beeswarm) 6 | 7 | h5dir = args[1] 8 | 9 | file1 = 
"Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz" 10 | file2 = "promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz" 11 | kmerlen = 4 12 | 13 | if (grepl("Mouse",h5dir)) { 14 | file1 = "Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz" 15 | file2 = "promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz" 16 | } 17 | 18 | inp.tbl <- fread(paste("zcat", file1),header=T,data.table=F,sep="\t") 19 | rownames(inp.tbl) = inp.tbl[,1] 20 | inp.tbl[,1] = NULL 21 | 22 | inp.tbl$PROMOTER = substring(inp.tbl$PROMOTER,8500,11500) 23 | 24 | inp.tbl=cbind(inp.tbl, do.call(rbind, lapply(inp.tbl$PROMOTER, function(x){ 25 | y = oligonucleotideFrequency(DNAStringSet(x), kmerlen) 26 | y/sum(y) 27 | }))) 28 | inp.tbl$T=NULL #remove TT dinucleotide to ensure matrix is full rank 29 | 30 | inp.tbl$PROMOTER = NULL 31 | inp.tbl[,c(1:5, 9)] = log10(inp.tbl[,c(1:5, 9)]+0.1) 32 | inp.tbl=as.data.frame(scale(inp.tbl)) 33 | 34 | # save(inp.tbl, file="5merInputTable.RData") 35 | # # load("TriInputTable.RData") 36 | 37 | motif_hits <- fread(paste("zcat", file2),header=T,data.table=F,sep="\t") 38 | motif_hits <- dcast(motif_hits, motif_hits[,2] ~ motif_hits[,1], function(x) 1, fill = 0) 39 | inp.tbl=merge(inp.tbl, motif_hits, by.x=0, by.y=1, all.x=T) 40 | 41 | sum(is.na(inp.tbl)) 42 | inp.tbl[is.na(inp.tbl)] = 0 43 | rownames(inp.tbl) = inp.tbl[,1] 44 | inp.tbl[,1] = NULL 45 | 46 | z = do.call("rbind", lapply(1:10, function(i){ 47 | testIDs = h5read(paste(h5dir, '/', i, "test.h5", sep=''),"geneName") 48 | trainIDs = h5read(paste(h5dir, '/', i, "train.h5", sep=''),"geneName") 49 | valIDs = h5read(paste(h5dir, '/', i, "valid.h5", sep=''),"geneName") 50 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs | rownames(inp.tbl) %in% valIDs, ] 51 | # valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 52 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 53 | 54 | mod1 = lm(EXPRESSION ~ ., data=train) 55 | c(i, cor(test$EXPRESSION, predict(mod1, newdata = test))^2) 56 | })) 57 | colnames(z)=c("fold","baseliner2") 58 
| 59 | cv_folder = paste0(h5dir, 2) 60 | files = list.files(path=cv_folder, pattern='.txt.gz', full.names=T) 61 | table <- do.call("rbind", lapply(files, FUN=function(file){ 62 | cmd = paste("zcat ", file, "| tail -2 | perl -ne \'@a=split /= /; print $a[1];\'") 63 | tmp = t(read.table(textConnection(system(cmd, intern=TRUE)))) 64 | tmp$fold = as.numeric(strsplit(basename(file), "_")[[1]][1]) 65 | tmp$trial = as.numeric(strsplit(strsplit(file, "_trial")[[1]][2], '\\.')[[1]][1]) 66 | names(tmp) = c("r2","MSE","fold","trial") 67 | tmp 68 | })) 69 | 70 | table=as.data.frame(apply(table,2,function(x) as.numeric(as.character(x)))) 71 | table = do.call("rbind",lapply(unique(table$fold), function(x) { tmp=table[table$fold==x,]; tmp[which( tmp$MSE==min(tmp$MSE) ),c("r2","fold")] } )) 72 | table = aggregate(table$r2, by=list(fold=table$fold), mean) 73 | colnames(table)[2]="Xpressor2" 74 | table 75 | 76 | cvr2 = merge(table,z,by=1) 77 | t.test(cvr2[,2],cvr2[,3],paired=T) 78 | cvr2 79 | pdf("FigS4C_human.pdf") #change to mouse if dir is mouse 80 | beeswarm(cvr2[,2:3],ylim=c(0,1), las=2, bty='n', pch=19) #0.4,0.8 81 | dev.off() 82 | -------------------------------------------------------------------------------- /Fig4_S4/FigS4C_human.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4C_human.pdf -------------------------------------------------------------------------------- /Fig4_S4/FigS4C_mouse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig4_S4/FigS4C_mouse.pdf -------------------------------------------------------------------------------- /Fig4_S4/JASPAR_CORE_2016_vertebrates.meme: -------------------------------------------------------------------------------- 1 | 
../datasets/JASPAR_CORE_2016_vertebrates.meme -------------------------------------------------------------------------------- /Fig4_S4/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_GM12878.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_GM12878.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_K562.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_K562.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_mESC.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_mESC.txt -------------------------------------------------------------------------------- /Fig4_S4/all_crossvalidated_predictions_mouse.txt: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/all_crossvalidated_predictions_mouse.txt 
-------------------------------------------------------------------------------- /Fig4_S4/baseline_models.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(Biostrings) 3 | library(rhdf5) 4 | library(reshape2) 5 | 6 | h5dir = args[1] 7 | kmerlen = args[2] 8 | 9 | testIDs = h5read(paste(h5dir, "test.h5", sep='/'),"geneName") 10 | trainIDs = h5read(paste(h5dir, "train.h5", sep='/'),"geneName") 11 | valIDs = h5read(paste(h5dir, "valid.h5", sep='/'),"geneName") 12 | file1 = "Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz" 13 | file2 = "promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz" 14 | 15 | if (grepl("Mouse",h5dir)) { 16 | file1 = "Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz" 17 | file2 = "promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz" 18 | } 19 | 20 | inp.tbl <- fread(paste("zcat", file1),header=T,data.table=F,sep="\t") 21 | rownames(inp.tbl) = inp.tbl[,1] 22 | inp.tbl[,1] = NULL 23 | 24 | inp.tbl$PROMOTER = substring(inp.tbl$PROMOTER,8500,11500) 25 | 26 | inp.tbl=cbind(inp.tbl, do.call(rbind, lapply(inp.tbl$PROMOTER, function(x){ 27 | y = oligonucleotideFrequency(DNAStringSet(x), kmerlen) 28 | y/sum(y) 29 | }))) 30 | 31 | inp.tbl$PROMOTER = NULL 32 | inp.tbl[,c(1:5, 9)] = log10(inp.tbl[,c(1:5, 9)]+0.1) 33 | inp.tbl=as.data.frame(scale(inp.tbl)) 34 | 35 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs, ] 36 | valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 37 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 38 | 39 | #full model 40 | mod1 = lm(EXPRESSION ~ ., data=train) 41 | #half life only model 42 | mod2 = lm(EXPRESSION ~ UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY, data=train) 43 | #promoter only model 44 | mod3 = lm(EXPRESSION ~ .-(UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY), data=train) 45 | summary(mod3) 46 | 47 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 48 | cor(test$EXPRESSION, predict(mod2, newdata = 
test))^2 49 | cor(test$EXPRESSION, predict(mod3, newdata = test))^2 50 | 51 | plot(predict(mod1, newdata = test), test$EXPRESSION) 52 | test = test[test$EXPRESSION >= -1, ] 53 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 54 | plot(predict(mod1, newdata = test), test$EXPRESSION) 55 | 56 | motif_hits <- fread(paste("zcat", file2),header=T,data.table=F,sep="\t") 57 | motif_hits <- dcast(motif_hits, motif_hits[,2] ~ motif_hits[,1], function(x) 1, fill = 0) 58 | 59 | inp.tbl=merge(inp.tbl, motif_hits, by.x=0, by.y=1, all.x=T) 60 | sum(is.na(inp.tbl)) 61 | inp.tbl[is.na(inp.tbl)] = 0 62 | rownames(inp.tbl) = inp.tbl[,1] 63 | inp.tbl[,1] = NULL 64 | 65 | train = inp.tbl[rownames(inp.tbl) %in% trainIDs, ] 66 | valid = inp.tbl[rownames(inp.tbl) %in% valIDs, ] 67 | test = inp.tbl[rownames(inp.tbl) %in% testIDs, ] 68 | 69 | mod1 = lm(EXPRESSION ~ ., data=train) 70 | mod2 = lm(EXPRESSION ~ UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY, data=train) 71 | mod3 = lm(EXPRESSION ~ .-(UTR5LEN+CDSLEN+INTRONLEN+UTR3LEN+UTR5GC+CDSGC+UTR3GC+ORFEXONDENSITY), data=train) 72 | 73 | cor(test$EXPRESSION, predict(mod1, newdata = test))^2 74 | cor(test$EXPRESSION, predict(mod2, newdata = test))^2 75 | cor(test$EXPRESSION, predict(mod3, newdata = test))^2 76 | -------------------------------------------------------------------------------- /Fig4_S4/coefplot.r: -------------------------------------------------------------------------------- 1 | # published on http://www.r-statistics.com/2010/07/visualization-of-regression-coefficients-in-r 2 | # originally written by "Achim Zeileis" 3 | # GPL-2 4 | 5 | coefplot <- function(object, df = NULL, level = 0.95, parm = NULL, 6 | labels = TRUE, xlab = "Coefficient confidence intervals", ylab = "", 7 | xlim = NULL, ylim = NULL, 8 | las = 1, lwd = 1, lty = c(1, 2), pch = 19, col = 1, 9 | length = 0, angle = 30, code = 3, ...) 
10 | { 11 | cf <- coef(object) 12 | se <- sqrt(diag(vcov(object))) 13 | if(is.null(parm)) parm <- seq_along(cf) 14 | if(is.numeric(parm) | is.logical(parm)) parm <- names(cf)[parm] 15 | if(is.character(parm)) parm <- which(names(cf) %in% parm) 16 | cf <- cf[parm] 17 | se <- se[parm] 18 | k <- length(cf) 19 | 20 | if(is.null(df)) { 21 | df <- if(identical(class(object), "lm")) df.residual(object) else 0 22 | } 23 | 24 | critval <- if(df > 0 & is.finite(df)) { 25 | qt((1 - level)/2, df = df) 26 | } else { 27 | qnorm((1 - level)/2) 28 | } 29 | ci1 <- cf + critval * se 30 | ci2 <- cf - critval * se 31 | 32 | lwd <- rep(lwd, length.out = 2) 33 | lty <- rep(lty, length.out = 2) 34 | pch <- rep(pch, length.out = k) 35 | col <- rep(col, length.out = k) 36 | 37 | if(is.null(xlim)) xlim <- range(c(0, min(ci1), max(ci2))) 38 | if(is.null(ylim)) ylim <- c(1 - 0.05 * k, 1.05 * k) 39 | 40 | if(isTRUE(labels)) labels <- names(cf) 41 | if(identical(labels, FALSE)) labels <- "" 42 | labels <- rep(labels, length.out = k) 43 | 44 | plot(0, 0, xlim = xlim, ylim = ylim, xlab = xlab, ylab = ylab, 45 | axes = FALSE, type = "n", las = las, ...) 
46 | arrows(ci1, 1:k, ci2, 1:k, lty = lty[1], lwd = lwd[1], col = col, 47 | length = length, angle = angle, code = code) 48 | points(cf, 1:k, pch = pch, col = col) 49 | abline(v = 0, lty = lty[2], lwd = lwd[2]) 50 | axis(1) 51 | axis(2, at = 1:k, labels = labels, las = las) 52 | box() 53 | } 54 | -------------------------------------------------------------------------------- /Fig4_S4/gencode.v27lift37.basic.annotation.gtf.gz: -------------------------------------------------------------------------------- 1 | ../datasets/gencode.v27lift37.basic.annotation.gtf.gz -------------------------------------------------------------------------------- /Fig4_S4/model_comparison.txt: -------------------------------------------------------------------------------- 1 | model test_r_squared test_r_squared_withHL 2 | "HL-only" 0 0.1702954 3 | "1mer" 0.1566485 0.2727732 4 | "2mer" 0.3316107 0.3807903 5 | "3mer" 0.4012669 0.4329388 6 | "4mer" 0.4450406 0.468374 7 | "5mer" 0.4465227 0.469725 8 | "5mer" 0.4465227 0.469725 9 | "JASPAR TFs+4mer" 0.4564599 0.4778438 10 | "Xpresso, Hyperparameter-tuned, mononucleotide-shuffled input" 0.224 0.279 11 | "Xpresso, Hyperparameter-tuned, dinucleotide-shuffled input" 0.286 0.333 12 | "Xpresso, Manually-discovered hyperparameters" 0.511 0.532 13 | "Xpresso, Hyperparameter-tuned" 0.504 0.590 14 | "HL-only, mouse" 0 0.3045577 15 | "1mer, mouse" 0.2517215 0.3691906 16 | "2mer, mouse" 0.4498092 0.5138408 17 | "3mer, mouse" 0.5070994 0.5509732 18 | "4mer, mouse" 0.5395627 0.5721985 19 | "5mer, mouse" 0.5703819 0.5925751 20 | "6mer, mouse" 0.5101936 0.5359272 21 | "JASPAR TFs, mouse" 0.4555412 0.5042129 22 | "JASPAR TFs+5mer, mouse" 0.5790679 0.593618 23 | "Xpresso, Hyperparameter-tuned, mononucleotide-shuffled input, mouse" 0.413 0.456 24 | "Xpresso, Hyperparameter-tuned, dinucleotide-shuffled input, mouse" 0.502 0.544 25 | "Xpresso, Manually-discovered hyperparameters, mouse" 0.636 0.667 26 | "Xpresso, Hyperparameter-tuned, mouse" 0.632 0.710 
-------------------------------------------------------------------------------- /Fig4_S4/model_comparison_Fig3.txt: -------------------------------------------------------------------------------- 1 | model r_squared species 2 | "Histone+ChIP-seq+PWM matches, ESC (Ouyang 2009)" 0.65 mouse 3 | "Chromatin, ESC (Cheng 2011)" 0.55 mouse 4 | "TF+DNase, ESC (Duren 2017)" 0.47 mouse 5 | "DNase, ESC (Duren 2017)" 0.35 mouse 6 | "Histone+TF+PWM matches, ESC (McLeay 2012)" 0.695 mouse 7 | "PWM matches with Histone support, ESC (McLeay 2012)" 0.52 mouse 8 | "PWM matches, ESC (McLeay 2012)" 0.28 mouse 9 | "Xpresso, mESC (this study)" 0.5884967 mouse 10 | "MPRA, neuron, CAGE correlation (Nguyen 2016)" 0.073 mouse 11 | "5mer and half life features, median expression levels (this study)" 0.5925751 mouse 12 | "Sequence and half life features, median expression levels (this study)" 0.710 mouse 13 | "Histone marks, CD4+ T cells (Karlić 2010)" 0.56 human 14 | "Sequence+Histone+TF+DNase, K562 (Zhou 2018)" 0.535 human 15 | "PWM matches with DNase/Histone support, GM12878 (Schmidt 2017)" 0.34 human 16 | "Histone+TF+PWM matches, GM12878 (McLeay 2012)" 0.41 human 17 | "PWM matches with Histone support, GM12878 (McLeay 2012)" 0.28 human 18 | "PWM matches, GM12878 (McLeay 2012)" 0.08 human 19 | "Xpresso, GM12878 (this study)" 0.4259194 human 20 | "Sequence+Histone+TF+DNase, K562 (Zhou 2018)" 0.569 human 21 | "PWM matches with DNase/Histone support, K562 (Schmidt 2017)" 0.47 human 22 | "MPRA, K562, CAGE correlation (van Arensbergen 2016)" 0.49 human 23 | "Chromatin marks, K562 (Cheng 2011)" 0.39 human 24 | "Xpresso, K562 (this study)" 0.5040678 human 25 | "PWM matches with DNase/Histone support, HepG2 (Schmidt 2017)" 0.46 human 26 | "MPRA, many (Cooper 2006)" 0.28 human 27 | "MPRA, many (Landolin 2010)" 0.185 human 28 | "TF ChIP, median among cell types (Cheng 2012)" 0.39 human 29 | "Histone marks, median among cell types (Dong 2012)" 0.62 human 30 | "Histone marks, median among cell types 
(Abdalla 2018)" 0.52 human 31 | "Sequence only, median among cell types (Bressiere 2018)" 0.336 human 32 | "Sequence only, median among cell types (Abdalla 2018)" 0.17 human 33 | "4mer and half life features, median expression levels (this study)" 0.468374 human 34 | "Sequence and half life features, median expression levels (this study)" 0.590 human 35 | -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.FIMO_scanned.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.FIMO_scanned.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.fa.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.fa.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.firstOrderMarkov_background: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.firstOrderMarkov_background -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz 
-------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.fa.gz: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.fa.gz -------------------------------------------------------------------------------- /Fig4_S4/promoters_pM1.5Kb.mouse.firstOrderMarkov_background: -------------------------------------------------------------------------------- 1 | ../datasets/promoters_pM1.5Kb.mouse.firstOrderMarkov_background -------------------------------------------------------------------------------- /Fig4_S4/runme.sh: -------------------------------------------------------------------------------- 1 | perl -ne 'print "chr$_";' hg38_promoters_cage_corrected.bed >hg38_promoters_cage_corrected_withChr.bed 2 | liftOver -bedPlus=6 hg38_promoters_cage_corrected_withChr.bed hg38ToHg19.over.chain hg19_promoters_cage_corrected_withChr.bed unmapped 3 | ./supplement_ids.pl >hg19_promoters_cage_corrected_withChr_andOthers.bed 4 | grep -P '\-$' hg19_promoters_cage_corrected_withChr_andOthers.bed >hg19_promoters_cage_corrected_withChr_andOthers_minus.bed 5 | grep -P '\+$' hg19_promoters_cage_corrected_withChr_andOthers.bed >hg19_promoters_cage_corrected_withChr_andOthers_plus.bed 6 | 7 | wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE78nnn/GSE78709/suppl/GSE78709_sure23.plasmid.norm.combined.45.55.plus.160504.bw 8 | wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE78nnn/GSE78709/suppl/GSE78709_sure23.plasmid.norm.combined.45.55.minus.160504.bw 9 | 10 | bigWigAverageOverBed -sampleAroundCenter=1000 GSE78709_sure23.plasmid.norm.combined.45.55.plus.160504.bw hg19_promoters_cage_corrected_withChr_andOthers_plus.bed GSE78709_sure23.plasmid.norm.combined.45.55.plus.promoters.bigWigSignal 11 | bigWigAverageOverBed -sampleAroundCenter=1000 GSE78709_sure23.plasmid.norm.combined.45.55.minus.160504.bw hg19_promoters_cage_corrected_withChr_andOthers_minus.bed 
GSE78709_sure23.plasmid.norm.combined.45.55.minus.promoters.bigWigSignal 12 | 13 | Rscript Fig4ABCD.R 14 | 15 | #generate baseline dinucleotide model, and prepare to extract features for PWM-based model 16 | zcat Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | tail -n+2 | perl -ne '@a=split; print ">$a[0]\n".substr($a[1],8500,3000)."\n";' >promoters_pM1.5Kb.fa 17 | zcat Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | tail -n+2 | perl -ne '@a=split; print ">$a[0]\n".substr($a[1],8500,3000)."\n";' >promoters_pM1.5Kb.mouse.fa 18 | fasta-get-markov promoters_pM1.5Kb.fa >promoters_pM1.5Kb.firstOrderMarkov_background 19 | fasta-get-markov promoters_pM1.5Kb.mouse.fa >promoters_pM1.5Kb.mouse.firstOrderMarkov_background 20 | 21 | fimo --bgfile promoters_pM1.5Kb.firstOrderMarkov_background --verbosity 1 --text --skip-matched-sequence JASPAR_CORE_2016_vertebrates.meme promoters_pM1.5Kb.fa | gzip -c >promoters_pM1.5Kb.FIMO_scanned.txt.gz 22 | fimo --bgfile promoters_pM1.5Kb.mouse.firstOrderMarkov_background --verbosity 1 --text --skip-matched-sequence JASPAR_CORE_2016_vertebrates.meme promoters_pM1.5Kb.mouse.fa | gzip -c >promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz 23 | 24 | zcat promoters_pM1.5Kb.FIMO_scanned.txt.gz | cut -f 1,2 | uniq | gzip -c >promoters_pM1.5Kb.FIMO_scanned.condensed.txt.gz 25 | zcat promoters_pM1.5Kb.mouse.FIMO_scanned.txt.gz | cut -f 1,2 | uniq | gzip -c >promoters_pM1.5Kb.mouse.FIMO_scanned.condensed.txt.gz 26 | 27 | #generate 1mer-6mer baseline models for human and mouse, respectively 28 | for x in {1..6}; do { Rscript baseline_models.R pM10Kb_1KTest $x; } done & 29 | for x in {1..6}; do { Rscript baseline_models.R pM10Kb_1KTest_Mouse $x; } done & 30 | 31 | # empirical results from Xpresso and baselines are stored in model_comparison.txt 32 | Rscript Fig4E_S4.R 33 | 34 | #generates for human, can change directory and code to generate for mouse 35 | Rscript FigS4B.R 36 | 37 | #generates for human, can change directory 
and code to generate for mouse 38 | Rscript FigS4C.R 39 | -------------------------------------------------------------------------------- /Fig4_S4/supplement_ids.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | open IN, "){ 5 | ($id) = ($_ =~ /(ENSG\d+)/); 6 | $seen{$id}=1; 7 | print $_; 8 | } 9 | close IN; 10 | 11 | open IN, "zgrep -P '\tgene\t' gencode.v27lift37.basic.annotation.gtf.gz | grep protein_coding | "; 12 | while(){ chomp; 13 | @a = split /\t/; 14 | ($id) = ($a[-1] =~ /(ENSG\d+)/); 15 | print join("\t", $a[0], $a[4]-1, $a[4]+1, $id, '0', $a[6]), "\n" if !$seen{$id} && $a[6] eq '-'; 16 | print join("\t", $a[0], $a[3]-1, $a[3]+1, $id, '0', $a[6]), "\n" if !$seen{$id} && $a[6] eq '+'; 17 | $seen{$id}=1; 18 | } 19 | close IN; -------------------------------------------------------------------------------- /Fig5_S5/hg19.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 249250621 2 | chr2 243199373 3 | chr3 198022430 4 | chr4 191154276 5 | chr5 180915260 6 | chr6 171115067 7 | chr7 159138663 8 | chrX 155270560 9 | chr8 146364022 10 | chr9 141213431 11 | chr10 135534747 12 | chr11 135006516 13 | chr12 133851895 14 | chr13 115169878 15 | chr14 107349540 16 | chr15 102531392 17 | chr16 90354753 18 | chr17 81195210 19 | chr18 78077248 20 | chr20 63025520 21 | chrY 59373566 22 | chr19 59128983 23 | chr22 51304566 24 | chr21 48129895 25 | chr6_ssto_hap7 4928567 26 | chr6_mcf_hap5 4833398 27 | chr6_cox_hap2 4795371 28 | chr6_mann_hap4 4683263 29 | chr6_apd_hap1 4622290 30 | chr6_qbl_hap6 4611984 31 | chr6_dbb_hap3 4610396 32 | chr17_ctg5_hap1 1680828 33 | chr4_ctg9_hap1 590426 34 | chr1_gl000192_random 547496 35 | chrUn_gl000225 211173 36 | chr4_gl000194_random 191469 37 | chr4_gl000193_random 189789 38 | chr9_gl000200_random 187035 39 | chrUn_gl000222 186861 40 | chrUn_gl000212 186858 41 | chr7_gl000195_random 182896 42 | chrUn_gl000223 180455 43 | 
chrUn_gl000224 179693 44 | chrUn_gl000219 179198 45 | chr17_gl000205_random 174588 46 | chrUn_gl000215 172545 47 | chrUn_gl000216 172294 48 | chrUn_gl000217 172149 49 | chr9_gl000199_random 169874 50 | chrUn_gl000211 166566 51 | chrUn_gl000213 164239 52 | chrUn_gl000220 161802 53 | chrUn_gl000218 161147 54 | chr19_gl000209_random 159169 55 | chrUn_gl000221 155397 56 | chrUn_gl000214 137718 57 | chrUn_gl000228 129120 58 | chrUn_gl000227 128374 59 | chr1_gl000191_random 106433 60 | chr19_gl000208_random 92689 61 | chr9_gl000198_random 90085 62 | chr17_gl000204_random 81310 63 | chrUn_gl000233 45941 64 | chrUn_gl000237 45867 65 | chrUn_gl000230 43691 66 | chrUn_gl000242 43523 67 | chrUn_gl000243 43341 68 | chrUn_gl000241 42152 69 | chrUn_gl000236 41934 70 | chrUn_gl000240 41933 71 | chr17_gl000206_random 41001 72 | chrUn_gl000232 40652 73 | chrUn_gl000234 40531 74 | chr11_gl000202_random 40103 75 | chrUn_gl000238 39939 76 | chrUn_gl000244 39929 77 | chrUn_gl000248 39786 78 | chr8_gl000196_random 38914 79 | chrUn_gl000249 38502 80 | chrUn_gl000246 38154 81 | chr17_gl000203_random 37498 82 | chr8_gl000197_random 37175 83 | chrUn_gl000245 36651 84 | chrUn_gl000247 36422 85 | chr9_gl000201_random 36148 86 | chrUn_gl000235 34474 87 | chrUn_gl000239 33824 88 | chr21_gl000210_random 27682 89 | chrUn_gl000231 27386 90 | chrUn_gl000229 19913 91 | chrM 16571 92 | chrUn_gl000226 15008 93 | chr18_gl000207_random 4262 94 | -------------------------------------------------------------------------------- /Fig5_S5/human_trainepoch.11-0.426.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/human_trainepoch.11-0.426.h5 -------------------------------------------------------------------------------- /Fig5_S5/mouse_trainepoch.05-0.278.h5: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/mouse_trainepoch.05-0.278.h5 -------------------------------------------------------------------------------- /Fig5_S5/predict_seqs.py: -------------------------------------------------------------------------------- 1 | import sys, pickle 2 | import pandas as pd 3 | import numpy as np 4 | from optparse import OptionParser 5 | from keras.models import Model, load_model 6 | 7 | def main(): 8 | usage = 'usage: %prog [options] ' 9 | parser = OptionParser(usage) 10 | parser.add_option('--revCom', dest='revcom', default=False, action='store_true', help='Make predictions for minus strand instead of plus? % [Default: %default]') 11 | (options,args) = parser.parse_args() 12 | 13 | if len(args) != 4: 14 | print(args) 15 | parser.error('Must provide mode hyperparameter file and 2-column file to generate predictions for') 16 | else: 17 | param_file = args[0] 18 | trained_model = args[1] 19 | test_file = args[2] 20 | outfile = args[3] 21 | 22 | def revCom(x): 23 | for y in range(0,x.shape[0]): 24 | x[y] = np.fliplr(np.flipud(x[y])) 25 | return x 26 | 27 | trials = pickle.load(open(param_file, "rb")) 28 | best = trials.argmin 29 | model = load_model(trained_model, compile=False) 30 | 31 | table = pd.read_table(test_file, index_col=0, header=None) 32 | seqs = one_hot(table.as_matrix()) 33 | if options.revcom: 34 | seqs = revCom(seqs) 35 | if seqs.shape[1] != 10500: 36 | tsspos = 7000 37 | leftpos = tsspos - seqs.shape[1] / 2 38 | if seqs.shape[1] <= tsspos: 39 | tmpseqs = np.zeros((seqs.shape[0],10500,4), dtype='bool') 40 | tmpseqs[:,leftpos:(leftpos+seqs.shape[1]),:] = seqs 41 | seqs = tmpseqs 42 | else: 43 | print('Sequences are above the allowable size of 10500nt') 44 | sys.exit() 45 | halflifedata = np.zeros((seqs.shape[0],6), dtype='float16') 46 | print("Processed data from %s" % test_file) 47 | predictions_test = model.predict([seqs, halflifedata], batch_size=20).flatten() 
48 | df = pd.DataFrame(np.column_stack((table.index, predictions_test)), columns=['Info','Pred']) 49 | df.to_csv(outfile, index=False, header=True, sep='\t') 50 | 51 | def one_hot(seq): 52 | seq_len = len(seq.item(0)) 53 | seqindex = {'A':0, 'C':1, 'G':2, 'T':3, 'a':0, 'c':1, 'g':2, 't':3} 54 | seq_vec = np.zeros((len(seq),seq_len,4), dtype='bool') 55 | for i in range(len(seq)): 56 | thisseq = seq.item(i) 57 | for j in range(seq_len): 58 | try: 59 | seq_vec[i,j,seqindex[thisseq[j]]] = 1 60 | except: 61 | pass 62 | return seq_vec 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.bed: -------------------------------------------------------------------------------- 1 | 1 109500000 110300000 2 | -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.Minus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.Minus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.Plus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.Plus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.input.txt.gz: -------------------------------------------------------------------------------- 1 | ../datasets/region.1Mb.intervals.100ntStep.input.txt.gz -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.mouse.input.txt.gz: 
-------------------------------------------------------------------------------- 1 | ../datasets/region.1Mb.intervals.100ntStep.mouse.input.txt.gz -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.mouse.minus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.mouse.minus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.intervals.100ntStep.mouse.plus.bw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig5_S5/region.1Mb.intervals.100ntStep.mouse.plus.bw -------------------------------------------------------------------------------- /Fig5_S5/region.1Mb.mouse.bed: -------------------------------------------------------------------------------- 1 | chr3 107800000 108500000 2 | -------------------------------------------------------------------------------- /Fig5_S5/runme.sh: -------------------------------------------------------------------------------- 1 | #region.1Mb.bed is human locus; region.1Mb.mouse.bed is mouse locus 2 | 3 | BASEFILE="region.1Mb.intervals.100ntStep" #for Mouse locus use "region.1Mb.intervals.100ntStep.mouse" 4 | 5 | #generate 100nt step with 10.5Kb window size 6 | bedtools makewindows -b region.1Mb.bed -w 10500 -s 100 | perl -ne '@a=split/\t/; print $_ if $a[2]-$a[1] == 10500;' | sort | uniq >$BASEFILE.bed 7 | #extract sequences from fasta of human or mouse genome 8 | bedtools getfasta -tab -fi Homo_sapiens_assembly19.fasta -bed $BASEFILE.bed -fo $BASEFILE.input.txt 9 | 10 | #generate predictions 11 | python predict_seqs.py tpe_1K_10epochs_optimized_0to20K.hyperopt human_trainepoch.11-0.426.h5 $BASEFILE.input.txt $BASEFILE.Plus.txt 
# Fig6C_S7_S8.R -- plots observed/expected dinucleotide frequency profiles
# around the TSS for four expression bins (makes Fig6C/FigS7 for human,
# FigS8 for mouse).
#
# NOTE(review): `args`, `fastread()` and `say()` are not defined in this file
# -- presumably supplied by the repository's .Rprofile; confirm before running
# standalone.
file = args[1]      # gzipped input table with PROMOTER and EXPRESSION columns
outfile = args[2]   # output PDF path

b=list()        # per-bin dinucleotide positional counts (refilled per dinucleotide below)
mononuc=list()  # per-bin mononucleotide positional counts, keyed by base

################ cpg
a=fastread(paste("zcat ", file,sep=''))
# keep characters 6000..14000 of each promoter -- assumes the TSS sits near the
# middle of the stored sequence, giving roughly -4Kb..+4Kb (matches xin below);
# TODO confirm against the input-generation script
a$PROMOTER = substring(a$PROMOTER,6000,14000)
# log-transform expression with a 0.1 pseudocount, then min-max scale to [0,1]
a$EXPRESSION=log10(a$EXPRESSION+0.1)
a$EXPRESSION=(a$EXPRESSION-min(a$EXPRESSION))/(max(a$EXPRESSION)-min(a$EXPRESSION))
# four bins: ~unexpressed (<=0.001 after scaling), low, mid, high
a$bin <- cut(a$EXPRESSION, breaks=c(0,0.001,0.33,0.66,1), include.lowest = TRUE)

# b=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr("CG|cG|Gc|cg",y)) )))/nrow(a[a$bin==x,]) } )
# g=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr("G|g",y)) )))/nrow(a[a$bin==x,]) } )
# c=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr("C|c",y)) )))/nrow(a[a$bin==x,]) } )

# For each base, tabulate (case-insensitively, via the "x|X" regex) how often
# it occurs at each promoter position, averaged over the genes of each bin.
for (nuc1 in c('a','c','g','t')){
  say(nuc1)
  mononuc[[nuc1]]=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr(paste(nuc1,'|',toupper(nuc1),sep=''),y)) )))/nrow(a[a$bin==x,]) } )
}

pdf(outfile,width=10,height=8) #makes Fig6C/FigS7 for human for FigS8 for mouse
par(mfrow=c(4,4), oma = c(5,4,0,0) + 0.1, mar = c(0,0,1,1) + 0.1)
i=0   # panel counter; drives which panels get x/y axis labels
for (nuc1 in c('a','c','g','t')){
  for (nuc2 in c('a','c','g','t')){
    # case-insensitive regex for the dinucleotide, e.g. "cg|CG|cG|Cg"
    dinuc = paste(nuc1,nuc2,'|',toupper(nuc1),toupper(nuc2),'|',nuc1,toupper(nuc2),'|',toupper(nuc1),nuc2,sep='')
    dinuc2 = paste(toupper(nuc1),toupper(nuc2),sep='')   # uppercase panel title
    say(dinuc)
    # positional dinucleotide frequency per expression bin
    b=lapply(levels(a$bin), function(x){ table(do.call(c, lapply(a[a$bin==x,"PROMOTER"], function(y) unlist(gregexpr(dinuc,y)) )))/nrow(a[a$bin==x,]) } )

    x = 1:8000
    xin = -4000:3999   # x-axis: position relative to TSS
    idx = 1:8000

    # observed/expected = dinuc freq / (mononuc freq at pos * mononuc freq at pos+1),
    # loess-smoothed (span=0.01). Bins plotted highest-expression first:
    # [[4]]=cyan (high), [[3]]=blue, [[2]]=red, [[1]]=black (~unexpressed).
    plot(xin, predict(loess(b[[4]][idx]/(mononuc[[nuc1]][[4]][idx]*mononuc[[nuc2]][[4]][idx+1])~x, span=0.01), newdata=idx), col='cyan', type='l', ylim=c(0,1.6), main = dinuc2, axes = FALSE)
    axis(side = 1, labels = (i %/% 4 == 3))       # x tick labels only on bottom row
    axis(side = 2, labels = (i %% 4 == 0), las=1) # y tick labels only on left column
    lines(xin, predict(loess(b[[3]][idx]/(mononuc[[nuc1]][[3]][idx]*mononuc[[nuc2]][[3]][idx+1])~x, span=0.01), newdata=idx), col='blue', type='l')
    lines(xin, predict(loess(b[[2]][idx]/(mononuc[[nuc1]][[2]][idx]*mononuc[[nuc2]][[2]][idx+1])~x, span=0.01), newdata=idx), col='red', type='l')
    lines(xin, predict(loess(b[[1]][idx]/(mononuc[[nuc1]][[1]][idx]*mononuc[[nuc2]][[1]][idx+1])~x, span=0.01), newdata=idx), col='black', type='l')
    i=i+1
  }
}
title(xlab = "Position relative to TSS", ylab = "Observed/Expected", outer = TRUE, line = 3)
# legend("topleft", bg="white", bty="n", legend = levels(a$bin), text.col = c("black", "red","blue","cyan"), outer = TRUE)
dev.off()
# FigS6.R -- averages per-position attribution scores (from deep_explain_cv.py
# output) across 10 CV folds and plots loess-smoothed profiles around the TSS,
# one PDF per attribution method.
#
# NOTE(review): `args`, `fastread()` and `say()` are not defined in this file
# -- presumably supplied by the repository's .Rprofile; confirm before running
# standalone.
library(RColorBrewer)
library(zoo)

folder = args[1]    # directory holding <type>.<fold>.txt.gz attribution files
species = args[2]   # prefix for the output PDF name

for (type in c('gradinput','intgrad')){ #'saliency', 'elrp', 'deeplift'
  b=list()   # per-fold matrix: mean attribution per position (rows) per bin (cols)
  for(num in 1:10){ #10 folds of CV
    say(type, num)
    a=fastread(paste("zcat ", folder, type, '.', num, ".txt.gz",sep=''))
    # V2 is min-max scaled, then cut into 4 bins (~zero, low, mid, high);
    # columns 3:(ncol(a)-7) hold the per-position attribution values
    # -- NOTE(review): exact column layout assumed from deep_explain_cv.py's
    # output (label, positions, half-life features); verify the -7 offset.
    a$V2=(a$V2-min(a$V2))/(max(a$V2)-min(a$V2))
    a$bin <- cut(a$V2, breaks=c(0,0.001,0.33,0.66,1), include.lowest = TRUE)

    b[[num]]=sapply(levels(a$bin), function(x){ apply(a[a$bin==x,3:(ncol(a)-7)], 2, mean) } )
    # re-center every gene's attributions against the ~zero-expression bin's
    # mean profile (column 1 of b), then recompute the per-bin means
    a[,3:(ncol(a)-7)]=round(t(t(a[,3:(ncol(a)-7)]) - b[[num]][,1]),3) #broadcast subtraction of vector through matrix
    b[[num]]=sapply(levels(a$bin), function(x){ apply(a[a$bin==x,3:(ncol(a)-7)], 2, mean) } )
  }

  pdf(paste(species, type, ".pdf",sep=''),width=10,height=4) #ran this for both mouse and human 10-fold CV results for each technique
  c=apply(simplify2array(b), 1:2, mean)   # average the per-bin profiles over the 10 folds
  x = 1:10500
  xin = -7000:3499    # x-axis: position relative to TSS (-7Kb..+3.5Kb window)
  # bins plotted highest-expression first: col 4=cyan, 3=blue, 2=red;
  # the ~zero bin (col 1) is the zero baseline drawn by abline
  plot(xin, predict(loess(c[x,4]~x, span=0.01)), col='cyan', type='l')
  abline(0,0,col="black")
  lines(xin, predict(loess(c[x,3]~x, span=0.01)), col='blue', type='l')
  lines(xin, predict(loess(c[x,2]~x, span=0.01)), col='red', type='l')
  legend("topleft", bg="white", bty="n", legend = colnames(c)[2:4], text.col = c("red","blue","cyan"))
  dev.off()
}
# best_positions.R -- for one CV fold, finds promoter positions whose
# attribution score significantly exceeds the background distribution of
# low-expression genes, and writes them as merged 1nt intervals for motif
# analysis (input to extract_kmer.pl / DREME).
#
# NOTE(review): `args`, `fastread()` and `writefile()` are not defined in this
# file -- presumably supplied by the repository's .Rprofile; confirm before
# running standalone.
library(zoo)
library(GenomicRanges)
library(mixtools)

file = args[1]     # gzipped attribution table for this fold
fold = args[2]     # fold identifier, used in the output file name
species = args[3]  # 'human' or 'mouse'; selects the predictions table

if(species == 'human') expr=read.delim("all_crossvalidated_predictions.txt") else expr=read.delim("all_crossvalidated_predictions_mouse.txt")

kmerlen = 1   # reported interval width (1-mers)

a=fastread(paste("zcat ", file, sep=''))
ids=a[,1]                                 # gene IDs (first column)
preds = expr[match(ids, expr$Gene),"Pred"] # predicted expression for each gene
# Fit a 2-component Gaussian mixture to the predictions; `threshold` is the
# point where the posterior of the lower component drops to <=0.5, i.e. the
# boundary between the "low" and "high" expression populations.
mixmdl = normalmixEM(preds)
thresh2 = mixmdl$mu[2]   # NOTE(review): computed but never used below
post.df <- as.data.frame(cbind(x = mixmdl$x, mixmdl$posterior))
threshold = post.df[which(post.df$comp.1 == max(post.df$comp.1[post.df$comp.1 <= 0.5])),"x"]

# pdf("Fig6B.pdf",width=10,height=4) #general plot to make 6B histogram, but on full data rather than 1 fold
# plot(mixmdl,which=2)
# abline(v=threshold)
# dev.off()

# keep only the per-position attribution columns
# -- NOTE(review): FigS6.R trims the same files with ncol-7; verify the -6 here
a=a[,3:(ncol(a)-6)]

# low vs high
# per-position mean and one-sided 99.5% bound of the low-expression genes
zeromean=apply(a[preds < threshold,], 2, mean )
zerosd=apply(a[preds < threshold,], 2, function(x) qnorm(0.995)*sd(x) ) #99th% confidence interval of z-distribution

gr <- GRanges()   # NOTE(review): dead initialization; overwritten immediately below
# For every gene, collect the positions whose attribution exceeds the
# low-expression upper bound, as 1nt GRanges named by gene ID.
gr = suppressWarnings(do.call("c", sapply(1:nrow(a), function(seq) {
  idx = which(as.vector(unlist(lapply(1:ncol(a), function(x) { a[seq,x] > zeromean[x] + zerosd[x] })))) #| a[seq,x] < zeromean[x] - zerosd[x]
  GRanges(seqnames = rep(ids[seq],length(idx)),IRanges(start = idx, end = idx+kmerlen-1))
})))

# merge adjacent positions (reduce) and write <geneID, start, end, ...> rows
writefile(as.data.frame(reduce(gr)), paste("motif_analysis/bestpos1mer", fold, species, ".txt", sep=''), col.names=F)
import os, h5py, math
import numpy as np
import pandas as pd
import tensorflow as tf
from optparse import OptionParser
from tensorflow import keras
from keras.models import load_model, Model
from keras import backend as K
from deepexplain.tensorflow import DeepExplain

# Number of test-set genes attributed per DeepExplain call (bounds memory use).
batchsize = 500

def main():
    """Compute per-nucleotide attribution maps for one cross-validation fold.

    Positional args: <model_file> <out_dir> <fold>. Reads the fold's test set
    from <out_dir>/<fold>test.h5 and, for each attribution method, writes
    <out_dir>/<method>.<fold>.txt with one row per gene: the observed
    expression label, the 10,500 per-position attribution sums (scaled by
    10^3), and the attributions of the half-life features.
    """
    usage = 'usage: %prog [options] '
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 3:
        print(args)  # fix: was a Python-2-only `print args` statement; print() works on 2 and 3
        parser.error('Must provide data file and output directory')
    else:
        data_file = args[0]
        out_dir = args[1]
        fold = args[2]
        testfile = os.path.join(out_dir, fold+'test.h5')

    testfile = h5py.File(testfile, 'r')
    X_testhalflife, X_testpromoter, y_test, geneName = testfile['data'], testfile['promoter'], testfile['label'], testfile['geneName']
    model = load_model(data_file)

    with DeepExplain(session=K.get_session()) as de:
        input_tensor = model.inputs
        fModel = Model(inputs = input_tensor, outputs = model.outputs)
        for method in ['deeplift', 'grad*input', 'saliency', 'elrp', 'intgrad']: #'occlusion' not supported
            # accumulate per-batch frames and concatenate once at the end
            # (avoids the deprecated/removed DataFrame.append and quadratic copying)
            frames = []
            for i in range(0, int(math.ceil(len(geneName) / float(batchsize)))):
                first = i*batchsize
                last = min(i*batchsize+batchsize, len(geneName))
                # trim the stored 20Kb one-hot window to -7Kb..+3.5Kb around the TSS
                xs = X_testpromoter[first:last,3000:13500,:]
                xs2 = X_testhalflife[first:last,:]
                ys = y_test[first:last]
                gN = geneName[first:last]
                if method in ('intgrad', 'deeplift'): #try these methods with and without specified baseline
                    #empirical ACGT frequencies in -7Kb to +3.5Kb sequence surrounding human TSSs (for non-expressed genes only)
                    baseline = [np.repeat(np.array([[0.2617064, 0.2335449, 0.2379253, 0.2668234]]), 10500, axis=0),
                                np.zeros(6)]
                    attribs = de.explain(method, fModel(input_tensor), input_tensor, [xs, xs2], baseline = baseline)
                else:
                    attribs = de.explain(method, fModel(input_tensor), input_tensor, [xs, xs2])
                # renamed from `map` to avoid shadowing the builtin
                X, halflife = attribs[0], attribs[1]
                # collapse the one-hot axis: one attribution value per position, scaled by 10^3
                frame = pd.DataFrame(np.column_stack((ys, 10**3 * np.sum(X, 2), halflife)))
                frame.index = gN
                frames.append(frame)
            pdframe = pd.concat(frames) if frames else pd.DataFrame()
            pdframe.to_csv(os.path.join(out_dir, method.replace("*", "")+'.'+fold+'.txt'),sep='\t',header=False, float_format='%.3f')

if __name__ == '__main__':
    main()
#!/usr/bin/perl
# extract_kmer.pl -- pull the k-mer sequences listed on STDIN out of the
# promoter column of the species' input table, writing FASTA to STDOUT.
#
# Usage: extract_kmer.pl <human|mouse> < bestpos_intervals.txt > kmers.fa
# STDIN rows are tab-delimited: gene ID, start, <ignored>, length
# (the output of best_positions.R after reduce()).

$species = shift;

# column 1 = gene ID, column 11 = promoter sequence
if ($species eq "human"){ open IN, "zcat Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | "; }
else{ open IN, "zcat Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz | cut -f 1,11 | "; }
# build gene ID -> promoter sequence lookup
# fix: the loop read "while(){" in the corrupted copy, which never reads IN
# and leaves %id2seq empty; restore the <IN> readline.
while(<IN>){
	($id, $seq) = split /\t/, $_;
	$id2seq{$id} = $seq;
}
close IN;

while(<>){ chomp;
	@a = split /\t/, $_;
	$id = $a[0];
	# +3000 maps a position in the trimmed -7Kb..+3.5Kb window back into the
	# full stored sequence (cf. the 3000:13500 slice in deep_explain_cv.py);
	# -1 converts the 1-based position to a 0-based substr offset.
	$start = $a[1]+3000-1;
	$len = $a[3];
	print ">$id\_$start\_$len\n", substr($id2seq{$id}, $start, $len), "\n";
}
version 4.12.0 9 | 10 | ALPHABET "DNA" DNA-LIKE 11 | A "Adenine" CC0000 ~ T "Thymine" 008000 12 | C "Cytosine" 0000CC ~ G "Guanine" FFB300 13 | N "Any base" = ACGT 14 | X = ACGT 15 | . = ACGT 16 | V "Not T" = ACG 17 | H "Not G" = ACT 18 | D "Not C" = AGT 19 | B "Not A" = CGT 20 | M "Amino" = AC 21 | R "Purine" = AG 22 | W "Weak" = AT 23 | S "Strong" = CG 24 | Y "Pyrimidine" = CT 25 | K "Keto" = GT 26 | U = T 27 | END ALPHABET 28 | 29 | strands: + - 30 | 31 | Background letter frequencies (from dataset): 32 | A 0.251 C 0.242 G 0.249 T 0.258 33 | 34 | 35 | MOTIF CG DREME-1 36 | 37 | # Word RC Word Pos Neg P-value E-value 38 | # BEST CG CG 225655 155300 4.9e-5418 3.4e-5412 39 | # CG CG 225655 155300 4.9e-5418 3.4e-5412 40 | 41 | letter-probability matrix: alength= 4 w= 2 nsites= 464497 E= 3.4e-5412 42 | 0.000000 1.000000 0.000000 0.000000 43 | 0.000000 0.000000 1.000000 0.000000 44 | 45 | 46 | MOTIF CACTGCAM DREME-2 47 | 48 | # Word RC Word Pos Neg P-value E-value 49 | # BEST CACTGCAM KTGCAGTG 9108 5326 3.5e-224 1.4e-218 50 | # CACTGCAA TTGCAGTG 5101 2647 8.5e-176 3.3e-170 51 | # CACTGCAC GTGCAGTG 4665 2988 1.6e-083 6.0e-078 52 | 53 | letter-probability matrix: alength= 4 w= 8 nsites= 9781 E= 1.4e-218 54 | 0.000000 1.000000 0.000000 0.000000 55 | 1.000000 0.000000 0.000000 0.000000 56 | 0.000000 1.000000 0.000000 0.000000 57 | 0.000000 0.000000 0.000000 1.000000 58 | 0.000000 0.000000 1.000000 0.000000 59 | 0.000000 1.000000 0.000000 0.000000 60 | 1.000000 0.000000 0.000000 0.000000 61 | 0.522135 0.477865 0.000000 0.000000 62 | 63 | 64 | MOTIF TCCCAGCW DREME-3 65 | 66 | # Word RC Word Pos Neg P-value E-value 67 | # BEST TCCCAGCW WGCTGGGA 10157 7016 1.3e-130 4.9e-125 68 | # TCCCAGCT AGCTGGGA 5479 3775 1.3e-071 5.0e-066 69 | # TCCCAGCA TGCTGGGA 4695 3248 2.1e-060 8.3e-055 70 | 71 | letter-probability matrix: alength= 4 w= 8 nsites= 10188 E= 4.9e-125 72 | 0.000000 0.000000 0.000000 1.000000 73 | 0.000000 1.000000 0.000000 0.000000 74 | 0.000000 1.000000 0.000000 0.000000 
75 | 0.000000 1.000000 0.000000 0.000000 76 | 1.000000 0.000000 0.000000 0.000000 77 | 0.000000 0.000000 1.000000 0.000000 78 | 0.000000 1.000000 0.000000 0.000000 79 | 0.461131 0.000000 0.000000 0.538869 80 | 81 | 82 | MOTIF GCCTCCHAAA DREME-4 83 | 84 | # Word RC Word Pos Neg P-value E-value 85 | # BEST GCCTCCHAAA TTTDGGAGGC 4692 2651 2.4e-128 9.3e-123 86 | # GCCTCCCAAA TTTGGGAGGC 4452 2498 2.5e-124 9.5e-119 87 | # GCCTCCTAAA TTTAGGAGGC 151 100 7.7e-004 3.0e+002 88 | # GCCTCCAAAA TTTTGGAGGC 91 53 9.7e-004 3.7e+002 89 | 90 | letter-probability matrix: alength= 4 w= 10 nsites= 4697 E= 9.3e-123 91 | 0.000000 0.000000 1.000000 0.000000 92 | 0.000000 1.000000 0.000000 0.000000 93 | 0.000000 1.000000 0.000000 0.000000 94 | 0.000000 0.000000 0.000000 1.000000 95 | 0.000000 1.000000 0.000000 0.000000 96 | 0.000000 1.000000 0.000000 0.000000 97 | 0.019374 0.948478 0.000000 0.032148 98 | 1.000000 0.000000 0.000000 0.000000 99 | 1.000000 0.000000 0.000000 0.000000 100 | 1.000000 0.000000 0.000000 0.000000 101 | 102 | 103 | MOTIF CAGGWGA DREME-5 104 | 105 | # Word RC Word Pos Neg P-value E-value 106 | # BEST CAGGWGA TCWCCTG 11536 8393 3.0e-113 1.2e-107 107 | # CAGGAGA TCTCCTG 7483 5442 9.6e-074 3.7e-068 108 | # CAGGTGA TCACCTG 4092 2982 1.7e-040 6.6e-035 109 | 110 | letter-probability matrix: alength= 4 w= 7 nsites= 11631 E= 1.2e-107 111 | 0.000000 1.000000 0.000000 0.000000 112 | 1.000000 0.000000 0.000000 0.000000 113 | 0.000000 0.000000 1.000000 0.000000 114 | 0.000000 0.000000 1.000000 0.000000 115 | 0.646290 0.000000 0.000000 0.353710 116 | 0.000000 0.000000 1.000000 0.000000 117 | 1.000000 0.000000 0.000000 0.000000 118 | 119 | 120 | MOTIF CCTGTAR DREME-6 121 | 122 | # Word RC Word Pos Neg P-value E-value 123 | # BEST CCTGTAR YTACAGG 11198 8131 3.8e-111 1.5e-105 124 | # CCTGTAA TTACAGG 8099 5757 3.1e-090 1.2e-084 125 | # CCTGTAG CTACAGG 3116 2383 1.7e-023 6.6e-018 126 | 127 | letter-probability matrix: alength= 4 w= 7 nsites= 11238 E= 1.5e-105 128 | 0.000000 1.000000 
0.000000 0.000000 129 | 0.000000 1.000000 0.000000 0.000000 130 | 0.000000 0.000000 0.000000 1.000000 131 | 0.000000 0.000000 1.000000 0.000000 132 | 0.000000 0.000000 0.000000 1.000000 133 | 1.000000 0.000000 0.000000 0.000000 134 | 0.722460 0.000000 0.277540 0.000000 135 | 136 | 137 | MOTIF CAGGMTGG DREME-7 138 | 139 | # Word RC Word Pos Neg P-value E-value 140 | # BEST CAGGMTGG CCAKCCTG 10211 7468 3.0e-097 1.1e-091 141 | # CAGGCTGG CCAGCCTG 8697 6311 1.2e-086 4.6e-081 142 | # CAGGATGG CCATCCTG 1520 1162 2.3e-012 9.0e-007 143 | 144 | letter-probability matrix: alength= 4 w= 8 nsites= 10232 E= 1.1e-091 145 | 0.000000 1.000000 0.000000 0.000000 146 | 1.000000 0.000000 0.000000 0.000000 147 | 0.000000 0.000000 1.000000 0.000000 148 | 0.000000 0.000000 1.000000 0.000000 149 | 0.148944 0.851056 0.000000 0.000000 150 | 0.000000 0.000000 0.000000 1.000000 151 | 0.000000 0.000000 1.000000 0.000000 152 | 0.000000 0.000000 1.000000 0.000000 153 | 154 | 155 | MOTIF GTGGHTCA DREME-8 156 | 157 | # Word RC Word Pos Neg P-value E-value 158 | # BEST GTGGHTCA TGADCCAC 5973 4156 2.2e-074 8.6e-069 159 | # GTGGCTCA TGAGCCAC 4517 3073 7.4e-063 2.9e-057 160 | # GTGGATCA TGATCCAC 1197 833 3.0e-016 1.2e-010 161 | # GTGGTTCA TGAACCAC 352 285 4.4e-003 1.7e+003 162 | 163 | letter-probability matrix: alength= 4 w= 8 nsites= 6067 E= 8.6e-069 164 | 0.000000 0.000000 1.000000 0.000000 165 | 0.000000 0.000000 0.000000 1.000000 166 | 0.000000 0.000000 1.000000 0.000000 167 | 0.000000 0.000000 1.000000 0.000000 168 | 0.197297 0.744520 0.000000 0.058184 169 | 0.000000 0.000000 0.000000 1.000000 170 | 0.000000 1.000000 0.000000 0.000000 171 | 1.000000 0.000000 0.000000 0.000000 172 | 173 | 174 | MOTIF GCTAATTTTK DREME-9 175 | 176 | # Word RC Word Pos Neg P-value E-value 177 | # BEST GCTAATTTTK MAAAATTAGC 3715 2529 8.0e-052 3.1e-046 178 | # GCTAATTTTT AAAAATTAGC 3318 2263 4.6e-046 1.8e-040 179 | # GCTAATTTTG CAAAATTAGC 397 267 2.5e-007 9.7e-002 180 | 181 | letter-probability matrix: alength= 4 w= 10 
nsites= 3715 E= 3.1e-046 182 | 0.000000 0.000000 1.000000 0.000000 183 | 0.000000 1.000000 0.000000 0.000000 184 | 0.000000 0.000000 0.000000 1.000000 185 | 1.000000 0.000000 0.000000 0.000000 186 | 1.000000 0.000000 0.000000 0.000000 187 | 0.000000 0.000000 0.000000 1.000000 188 | 0.000000 0.000000 0.000000 1.000000 189 | 0.000000 0.000000 0.000000 1.000000 190 | 0.000000 0.000000 0.000000 1.000000 191 | 0.000000 0.000000 0.106864 0.893136 192 | 193 | 194 | MOTIF GCYAACA DREME-10 195 | 196 | # Word RC Word Pos Neg P-value E-value 197 | # BEST GCYAACA TGTTRGC 4892 3534 1.8e-050 7.0e-045 198 | # GCCAACA TGTTGGC 3302 2457 2.5e-029 9.6e-024 199 | # GCTAACA TGTTAGC 1596 1081 9.2e-024 3.5e-018 200 | 201 | letter-probability matrix: alength= 4 w= 7 nsites= 4906 E= 7.0e-045 202 | 0.000000 0.000000 1.000000 0.000000 203 | 0.000000 1.000000 0.000000 0.000000 204 | 0.000000 0.673869 0.000000 0.326131 205 | 1.000000 0.000000 0.000000 0.000000 206 | 1.000000 0.000000 0.000000 0.000000 207 | 0.000000 1.000000 0.000000 0.000000 208 | 1.000000 0.000000 0.000000 0.000000 209 | 210 | 211 | MOTIF CTTGAACC DREME-11 212 | 213 | # Word RC Word Pos Neg P-value E-value 214 | # BEST CTTGAACC GGTTCAAG 3267 2202 7.8e-048 3.0e-042 215 | # CTTGAACC GGTTCAAG 3267 2202 7.8e-048 3.0e-042 216 | 217 | letter-probability matrix: alength= 4 w= 8 nsites= 3268 E= 3.0e-042 218 | 0.000000 1.000000 0.000000 0.000000 219 | 0.000000 0.000000 0.000000 1.000000 220 | 0.000000 0.000000 0.000000 1.000000 221 | 0.000000 0.000000 1.000000 0.000000 222 | 1.000000 0.000000 0.000000 0.000000 223 | 1.000000 0.000000 0.000000 0.000000 224 | 0.000000 1.000000 0.000000 0.000000 225 | 0.000000 1.000000 0.000000 0.000000 226 | 227 | 228 | MOTIF AGGTCARGAG DREME-12 229 | 230 | # Word RC Word Pos Neg P-value E-value 231 | # BEST AGGTCARGAG CTCYTGACCT 2305 1433 5.7e-047 2.2e-041 232 | # AGGTCAGGAG CTCCTGACCT 2047 1295 2.3e-039 9.0e-034 233 | # AGGTCAAGAG CTCTTGACCT 258 139 1.2e-009 4.6e-004 234 | 235 | letter-probability 
matrix: alength= 4 w= 10 nsites= 2305 E= 2.2e-041 236 | 1.000000 0.000000 0.000000 0.000000 237 | 0.000000 0.000000 1.000000 0.000000 238 | 0.000000 0.000000 1.000000 0.000000 239 | 0.000000 0.000000 0.000000 1.000000 240 | 0.000000 1.000000 0.000000 0.000000 241 | 1.000000 0.000000 0.000000 0.000000 242 | 0.111931 0.000000 0.888069 0.000000 243 | 0.000000 0.000000 1.000000 0.000000 244 | 1.000000 0.000000 0.000000 0.000000 245 | 0.000000 0.000000 1.000000 0.000000 246 | 247 | 248 | MOTIF CTCAGCCYC DREME-13 249 | 250 | # Word RC Word Pos Neg P-value E-value 251 | # BEST CTCAGCCYC GRGGCTGAG 4962 3671 9.6e-045 3.7e-039 252 | # CTCAGCCTC GAGGCTGAG 4594 3388 2.6e-042 1.0e-036 253 | # CTCAGCCCC GGGGCTGAG 368 283 4.9e-004 1.9e+002 254 | 255 | letter-probability matrix: alength= 4 w= 9 nsites= 4964 E= 3.7e-039 256 | 0.000000 1.000000 0.000000 0.000000 257 | 0.000000 0.000000 0.000000 1.000000 258 | 0.000000 1.000000 0.000000 0.000000 259 | 1.000000 0.000000 0.000000 0.000000 260 | 0.000000 0.000000 1.000000 0.000000 261 | 0.000000 1.000000 0.000000 0.000000 262 | 0.000000 1.000000 0.000000 0.000000 263 | 0.000000 0.074134 0.000000 0.925866 264 | 0.000000 1.000000 0.000000 0.000000 265 | 266 | 267 | MOTIF CACMA DREME-14 268 | 269 | # Word RC Word Pos Neg P-value E-value 270 | # BEST CACMA TKGTG 48806 45155 4.0e-037 1.5e-031 271 | # CACCA TGGTG 29802 27221 1.8e-029 6.8e-024 272 | # CACAA TTGTG 20742 19569 1.0e-009 4.0e-004 273 | 274 | letter-probability matrix: alength= 4 w= 5 nsites= 52085 E= 1.5e-031 275 | 0.000000 1.000000 0.000000 0.000000 276 | 1.000000 0.000000 0.000000 0.000000 277 | 0.000000 1.000000 0.000000 0.000000 278 | 0.409523 0.590477 0.000000 0.000000 279 | 1.000000 0.000000 0.000000 0.000000 280 | 281 | 282 | MOTIF AAAAATAC DREME-15 283 | 284 | # Word RC Word Pos Neg P-value E-value 285 | # BEST AAAAATAC GTATTTTT 4100 3055 1.0e-035 3.8e-030 286 | # AAAAATAC GTATTTTT 4100 3055 1.0e-035 3.8e-030 287 | 288 | letter-probability matrix: alength= 4 w= 8 nsites= 
4103 E= 3.8e-030 289 | 1.000000 0.000000 0.000000 0.000000 290 | 1.000000 0.000000 0.000000 0.000000 291 | 1.000000 0.000000 0.000000 0.000000 292 | 1.000000 0.000000 0.000000 0.000000 293 | 1.000000 0.000000 0.000000 0.000000 294 | 0.000000 0.000000 0.000000 1.000000 295 | 1.000000 0.000000 0.000000 0.000000 296 | 0.000000 1.000000 0.000000 0.000000 297 | 298 | 299 | MOTIF AGTGCAATG DREME-16 300 | 301 | # Word RC Word Pos Neg P-value E-value 302 | # BEST AGTGCAATG CATTGCACT 969 551 2.3e-027 8.4e-022 303 | # AGTGCAATG CATTGCACT 969 551 2.3e-027 8.4e-022 304 | 305 | letter-probability matrix: alength= 4 w= 9 nsites= 969 E= 8.4e-022 306 | 1.000000 0.000000 0.000000 0.000000 307 | 0.000000 0.000000 1.000000 0.000000 308 | 0.000000 0.000000 0.000000 1.000000 309 | 0.000000 0.000000 1.000000 0.000000 310 | 0.000000 1.000000 0.000000 0.000000 311 | 1.000000 0.000000 0.000000 0.000000 312 | 1.000000 0.000000 0.000000 0.000000 313 | 0.000000 0.000000 0.000000 1.000000 314 | 0.000000 0.000000 1.000000 0.000000 315 | 316 | 317 | MOTIF CACYTG DREME-17 318 | 319 | # Word RC Word Pos Neg P-value E-value 320 | # BEST CACYTG CARGTG 13283 11654 5.5e-026 2.0e-020 321 | # CACCTG CAGGTG 7814 6606 1.7e-024 6.2e-019 322 | # CACTTG CAAGTG 5564 5132 1.4e-005 5.0e+000 323 | 324 | letter-probability matrix: alength= 4 w= 6 nsites= 13533 E= 2.0e-020 325 | 0.000000 1.000000 0.000000 0.000000 326 | 1.000000 0.000000 0.000000 0.000000 327 | 0.000000 1.000000 0.000000 0.000000 328 | 0.000000 0.585901 0.000000 0.414099 329 | 0.000000 0.000000 0.000000 1.000000 330 | 0.000000 0.000000 1.000000 0.000000 331 | 332 | 333 | MOTIF GGGTTTCWC DREME-18 334 | 335 | # Word RC Word Pos Neg P-value E-value 336 | # BEST GGGTTTCWC GWGAAACCC 1289 910 2.9e-016 1.1e-010 337 | # GGGTTTCAC GTGAAACCC 966 662 2.4e-014 8.7e-009 338 | # GGGTTTCTC GAGAAACCC 323 248 9.6e-004 3.5e+002 339 | 340 | letter-probability matrix: alength= 4 w= 9 nsites= 1289 E= 1.1e-010 341 | 0.000000 0.000000 1.000000 0.000000 342 | 0.000000 
0.000000 1.000000 0.000000 343 | 0.000000 0.000000 1.000000 0.000000 344 | 0.000000 0.000000 0.000000 1.000000 345 | 0.000000 0.000000 0.000000 1.000000 346 | 0.000000 0.000000 0.000000 1.000000 347 | 0.000000 1.000000 0.000000 0.000000 348 | 0.749418 0.000000 0.000000 0.250582 349 | 0.000000 1.000000 0.000000 0.000000 350 | 351 | 352 | MOTIF CTCMTGATC DREME-19 353 | 354 | # Word RC Word Pos Neg P-value E-value 355 | # BEST CTCMTGATC GATCAKGAG 403 213 8.0e-015 3.0e-009 356 | # CTCATGATC GATCATGAG 242 108 3.0e-013 1.1e-007 357 | # CTCCTGATC GATCAGGAG 164 105 1.9e-004 7.1e+001 358 | 359 | letter-probability matrix: alength= 4 w= 9 nsites= 403 E= 3.0e-009 360 | 0.000000 1.000000 0.000000 0.000000 361 | 0.000000 0.000000 0.000000 1.000000 362 | 0.000000 1.000000 0.000000 0.000000 363 | 0.595533 0.404467 0.000000 0.000000 364 | 0.000000 0.000000 0.000000 1.000000 365 | 0.000000 0.000000 1.000000 0.000000 366 | 1.000000 0.000000 0.000000 0.000000 367 | 0.000000 0.000000 0.000000 1.000000 368 | 0.000000 1.000000 0.000000 0.000000 369 | 370 | 371 | MOTIF GGCAGAGS DREME-20 372 | 373 | # Word RC Word Pos Neg P-value E-value 374 | # BEST GGCAGAGS SCTCTGCC 2879 2383 3.7e-012 1.3e-006 375 | # GGCAGAGG CCTCTGCC 2178 1800 1.0e-009 3.7e-004 376 | # GGCAGAGC GCTCTGCC 704 587 6.1e-004 2.3e+002 377 | 378 | letter-probability matrix: alength= 4 w= 8 nsites= 2885 E= 1.3e-006 379 | 0.000000 0.000000 1.000000 0.000000 380 | 0.000000 0.000000 1.000000 0.000000 381 | 0.000000 1.000000 0.000000 0.000000 382 | 1.000000 0.000000 0.000000 0.000000 383 | 0.000000 0.000000 1.000000 0.000000 384 | 1.000000 0.000000 0.000000 0.000000 385 | 0.000000 0.000000 1.000000 0.000000 386 | 0.000000 0.243674 0.756326 0.000000 387 | 388 | 389 | MOTIF GGAGGTGGA DREME-21 390 | 391 | # Word RC Word Pos Neg P-value E-value 392 | # BEST GGAGGTGGA TCCACCTCC 1070 787 2.6e-011 9.7e-006 393 | # GGAGGTGGA TCCACCTCC 1070 787 2.6e-011 9.7e-006 394 | 395 | letter-probability matrix: alength= 4 w= 9 nsites= 1072 E= 
9.7e-006 396 | 0.000000 0.000000 1.000000 0.000000 397 | 0.000000 0.000000 1.000000 0.000000 398 | 1.000000 0.000000 0.000000 0.000000 399 | 0.000000 0.000000 1.000000 0.000000 400 | 0.000000 0.000000 1.000000 0.000000 401 | 0.000000 0.000000 0.000000 1.000000 402 | 0.000000 0.000000 1.000000 0.000000 403 | 0.000000 0.000000 1.000000 0.000000 404 | 1.000000 0.000000 0.000000 0.000000 405 | 406 | 407 | MOTIF AYCTTGGC DREME-22 408 | 409 | # Word RC Word Pos Neg P-value E-value 410 | # BEST AYCTTGGC GCCAAGRT 1614 1265 3.9e-011 1.4e-005 411 | # ATCTTGGC GCCAAGAT 1100 854 1.4e-008 5.1e-003 412 | # ACCTTGGC GCCAAGGT 514 411 3.9e-004 1.4e+002 413 | 414 | letter-probability matrix: alength= 4 w= 8 nsites= 1614 E= 1.4e-005 415 | 1.000000 0.000000 0.000000 0.000000 416 | 0.000000 0.318463 0.000000 0.681537 417 | 0.000000 1.000000 0.000000 0.000000 418 | 0.000000 0.000000 0.000000 1.000000 419 | 0.000000 0.000000 0.000000 1.000000 420 | 0.000000 0.000000 1.000000 0.000000 421 | 0.000000 0.000000 1.000000 0.000000 422 | 0.000000 1.000000 0.000000 0.000000 423 | 424 | 425 | MOTIF GGCAGATCA DREME-23 426 | 427 | # Word RC Word Pos Neg P-value E-value 428 | # BEST GGCAGATCA TGATCTGCC 476 311 2.2e-009 8.0e-004 429 | # GGCAGATCA TGATCTGCC 476 311 2.2e-009 8.0e-004 430 | 431 | letter-probability matrix: alength= 4 w= 9 nsites= 476 E= 8.0e-004 432 | 0.000000 0.000000 1.000000 0.000000 433 | 0.000000 0.000000 1.000000 0.000000 434 | 0.000000 1.000000 0.000000 0.000000 435 | 1.000000 0.000000 0.000000 0.000000 436 | 0.000000 0.000000 1.000000 0.000000 437 | 1.000000 0.000000 0.000000 0.000000 438 | 0.000000 0.000000 0.000000 1.000000 439 | 0.000000 1.000000 0.000000 0.000000 440 | 1.000000 0.000000 0.000000 0.000000 441 | 442 | 443 | MOTIF AGTCTTGCTC DREME-24 444 | 445 | # Word RC Word Pos Neg P-value E-value 446 | # BEST AGTCTTGCTC GAGCAAGACT 604 431 4.1e-008 1.5e-002 447 | # AGTCTTGCTC GAGCAAGACT 604 431 4.1e-008 1.5e-002 448 | 449 | letter-probability matrix: alength= 4 w= 10 nsites= 
604 E= 1.5e-002 450 | 1.000000 0.000000 0.000000 0.000000 451 | 0.000000 0.000000 1.000000 0.000000 452 | 0.000000 0.000000 0.000000 1.000000 453 | 0.000000 1.000000 0.000000 0.000000 454 | 0.000000 0.000000 0.000000 1.000000 455 | 0.000000 0.000000 0.000000 1.000000 456 | 0.000000 0.000000 1.000000 0.000000 457 | 0.000000 1.000000 0.000000 0.000000 458 | 0.000000 0.000000 0.000000 1.000000 459 | 0.000000 1.000000 0.000000 0.000000 460 | 461 | 462 | MOTIF GTGTTGGGA DREME-25 463 | 464 | # Word RC Word Pos Neg P-value E-value 465 | # BEST GTGTTGGGA TCCCAACAC 425 282 4.2e-008 1.5e-002 466 | # GTGTTGGGA TCCCAACAC 425 282 4.2e-008 1.5e-002 467 | 468 | letter-probability matrix: alength= 4 w= 9 nsites= 426 E= 1.5e-002 469 | 0.000000 0.000000 1.000000 0.000000 470 | 0.000000 0.000000 0.000000 1.000000 471 | 0.000000 0.000000 1.000000 0.000000 472 | 0.000000 0.000000 0.000000 1.000000 473 | 0.000000 0.000000 0.000000 1.000000 474 | 0.000000 0.000000 1.000000 0.000000 475 | 0.000000 0.000000 1.000000 0.000000 476 | 0.000000 0.000000 1.000000 0.000000 477 | 1.000000 0.000000 0.000000 0.000000 478 | 479 | 480 | # Stopping reason: E-value threshold exceeded 481 | # Running time: 44160.40 seconds 482 | -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/humangradinput.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/humangradinput.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/humanintgrad.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/humanintgrad.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/motif_analysis: 
-------------------------------------------------------------------------------- 1 | ../datasets/motif_analysis/ -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mouse_all_1mer_400K/dreme.txt: -------------------------------------------------------------------------------- 1 | # DREME 4.12.0 2 | # command: dreme -oc mouse_all_1mer_400K/ -p subsampled_bestpos1mer_mouse_all_400K.fa -n subsampled_negbestpos1mer_mouse_all_400K.fa -mink 2 -maxk 10 3 | # positives: 400000 from subsampled_bestpos1mer_mouse_all_400K.fa (Mon Mar 26 23:42:00 PDT 2018) 4 | # negatives: 400000 from subsampled_negbestpos1mer_mouse_all_400K.fa (Mon Mar 26 23:42:14 PDT 2018) 5 | # host: parvati.grid.gs.washington.edu 6 | # when: Tue Mar 27 00:03:55 PDT 2018 7 | 8 | MEME version 4.12.0 9 | 10 | ALPHABET "DNA" DNA-LIKE 11 | A "Adenine" CC0000 ~ T "Thymine" 008000 12 | C "Cytosine" 0000CC ~ G "Guanine" FFB300 13 | N "Any base" = ACGT 14 | X = ACGT 15 | . = ACGT 16 | V "Not T" = ACG 17 | H "Not G" = ACT 18 | D "Not C" = AGT 19 | B "Not A" = CGT 20 | M "Amino" = AC 21 | R "Purine" = AG 22 | W "Weak" = AT 23 | S "Strong" = CG 24 | Y "Pyrimidine" = CT 25 | K "Keto" = GT 26 | U = T 27 | END ALPHABET 28 | 29 | strands: + - 30 | 31 | Background letter frequencies (from dataset): 32 | A 0.260 C 0.232 G 0.239 T 0.269 33 | 34 | 35 | MOTIF CG DREME-1 36 | 37 | # Word RC Word Pos Neg P-value E-value 38 | # BEST CG CG 186358 136959 4.6e-2762 3.2e-2756 39 | # CG CG 186358 136959 4.6e-2762 3.2e-2756 40 | 41 | letter-probability matrix: alength= 4 w= 2 nsites= 419330 E= 3.2e-2756 42 | 0.000000 1.000000 0.000000 0.000000 43 | 0.000000 0.000000 1.000000 0.000000 44 | 45 | 46 | MOTIF CAVC DREME-2 47 | 48 | # Word RC Word Pos Neg P-value E-value 49 | # BEST CAVC GBTG 207580 196664 6.8e-132 2.6e-126 50 | # CACC GGTG 98623 89412 1.3e-130 5.1e-125 51 | # CAGC GCTG 113609 107884 1.0e-046 4.0e-041 52 | # CAAC GTTG 72750 69684 1.7e-019 6.4e-014 53 | 54 | letter-probability 
matrix: alength= 4 w= 4 nsites= 352826 E= 2.6e-126 55 | 0.000000 1.000000 0.000000 0.000000 56 | 1.000000 0.000000 0.000000 0.000000 57 | 0.237108 0.350895 0.411996 0.000000 58 | 0.000000 1.000000 0.000000 0.000000 59 | 60 | 61 | MOTIF CA DREME-3 62 | 63 | # Word RC Word Pos Neg P-value E-value 64 | # BEST CA TG 387026 384137 1.3e-067 4.1e-062 65 | # CA TG 387026 384137 1.3e-067 4.1e-062 66 | 67 | letter-probability matrix: alength= 4 w= 2 nsites= 1816073 E= 4.1e-062 68 | 0.000000 1.000000 0.000000 0.000000 69 | 1.000000 0.000000 0.000000 0.000000 70 | 71 | 72 | MOTIF GGY DREME-4 73 | 74 | # Word RC Word Pos Neg P-value E-value 75 | # BEST GGY RCC 178464 174623 2.7e-018 1.8e-013 76 | # GGT ACC 120888 117686 2.6e-015 1.7e-010 77 | # GGC GCC 91497 89041 2.6e-011 1.7e-006 78 | 79 | letter-probability matrix: alength= 4 w= 3 nsites= 277850 E= 1.8e-013 80 | 0.000000 0.000000 1.000000 0.000000 81 | 0.000000 0.000000 1.000000 0.000000 82 | 0.000000 0.426817 0.000000 0.573183 83 | 84 | 85 | MOTIF CCTTTARTCC DREME-5 86 | 87 | # Word RC Word Pos Neg P-value E-value 88 | # BEST CCTTTARTCC GGAYTAAAGG 1380 1001 3.8e-015 1.7e-010 89 | # CCTTTAATCC GGATTAAAGG 1304 952 6.1e-014 2.8e-009 90 | # CCTTTAGTCC GGACTAAAGG 76 49 9.8e-003 4.5e+002 91 | 92 | letter-probability matrix: alength= 4 w= 10 nsites= 1382 E= 1.7e-010 93 | 0.000000 1.000000 0.000000 0.000000 94 | 0.000000 1.000000 0.000000 0.000000 95 | 0.000000 0.000000 0.000000 1.000000 96 | 0.000000 0.000000 0.000000 1.000000 97 | 0.000000 0.000000 0.000000 1.000000 98 | 1.000000 0.000000 0.000000 0.000000 99 | 0.945007 0.000000 0.054993 0.000000 100 | 0.000000 0.000000 0.000000 1.000000 101 | 0.000000 1.000000 0.000000 0.000000 102 | 0.000000 1.000000 0.000000 0.000000 103 | 104 | 105 | # Stopping reason: E-value threshold exceeded 106 | # Running time: 14961.69 seconds 107 | -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mouse_all_1mer_400K/dreme.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | dreme -oc mouse_all_1mer_400K/ -p subsampled_bestpos1mer_mouse_all_400K.fa -n subsampled_negbestpos1mer_mouse_all_400K.fa -mink 2 -maxk 10 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | both 24 | 25 | 26 | 100 27 | 0.01 28 | 1 29 | parvati.grid.gs.washington.edu 30 | Tue Mar 27 00:03:55 PDT 2018 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mousegradinput.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/mousegradinput.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/mouseintgrad.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/Fig6_S6_S7_S8/mouseintgrad.pdf -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/pM10Kb_Mouse_cv: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/pM10Kb_Mouse_cv/ -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/pM10Kb_cv: -------------------------------------------------------------------------------- 1 | ../Fig3_S3/pM10Kb_cv/ -------------------------------------------------------------------------------- /Fig6_S6_S7_S8/runme.sh: -------------------------------------------------------------------------------- 1 | # Deep Explain CV computes the following methods: 2 | # 
'deeplift', 'grad*input', 'saliency', 'elrp', 'intgrad' 3 | # 'grad*input' and 'intgrad' are presented in the paper for the reasons described 4 | # thus only these two have stored precomputed results to save space in download 5 | 6 | #human (using best model from 10 trials on each of 10 folds of data) 7 | python deep_explain_cv.py cv_human/01trainepoch.10-0.427.h5 pM10Kb_cv/ 1 8 | python deep_explain_cv.py cv_human/62trainepoch.13-0.421.h5 pM10Kb_cv/ 2 9 | python deep_explain_cv.py cv_human/63trainepoch.10-0.428.h5 pM10Kb_cv/ 3 10 | python deep_explain_cv.py cv_human/24trainepoch.09-0.427.h5 pM10Kb_cv/ 4 11 | python deep_explain_cv.py cv_human/25trainepoch.16-0.422.h5 pM10Kb_cv/ 5 12 | python deep_explain_cv.py cv_human/76trainepoch.11-0.414.h5 pM10Kb_cv/ 6 13 | python deep_explain_cv.py cv_human/17trainepoch.09-0.414.h5 pM10Kb_cv/ 7 14 | python deep_explain_cv.py cv_human/08trainepoch.11-0.418.h5 pM10Kb_cv/ 8 15 | python deep_explain_cv.py cv_human/79trainepoch.14-0.436.h5 pM10Kb_cv/ 9 16 | python deep_explain_cv.py cv_human/110trainepoch.08-0.425.h5 pM10Kb_cv/ 10 17 | 18 | #Fig6A and FigS6A-B 19 | Rscript FigS6.R pM10Kb_cv/ human #resulting plots in {human/mouse}{gradinput/intgrad}.pdf 20 | 21 | # for human (Fig6C and S7) 22 | Rscript Fig6C_S7_S8.R Roadmap_FantomAnnotations.InputData.pM10Kb.txt.gz FigS7.pdf 23 | 24 | for x in {1..10}; do { Rscript best_positions.R pM10Kb_cv/gradinput.$x.txt.gz $x human; } done 25 | 26 | cd motif_analysis/ 27 | #generate permuted set 28 | for x in {1..10}; do { cut -f 1 negbestpos1mer$x\human.txt | shuf | paste - <(cut -f 2- bestpos1mer$x\human.txt) >negbestpos1mer$x\human.txt; } done 29 | for x in {1..10}; do { ./extract_kmers.pl human bestpos1mer$x\human.fa; } done 30 | for x in {1..10}; do { ./extract_kmers.pl human negbestpos1mer$x\human.fa; } done 31 | cat bestpos1mer*human.fa >bestpos1mer_human_all.fa 32 | cat negbestpos1mer*human.fa >negbestpos1mer_human_all.fa 33 | fasta-subsample bestpos1mer_human_all.fa 400000 
>subsampled_bestpos1mer_human_all_400K.fa 34 | fasta-subsample negbestpos1mer_human_all.fa 400000 >subsampled_negbestpos1mer_human_all_400K.fa 35 | #Fig6B human 36 | dreme -oc human_all_1mer_400K/ -p subsampled_bestpos1mer_human_all_400K.fa -n subsampled_negbestpos1mer_human_all_400K.fa -mink 2 -maxk 10 37 | cd .. 38 | 39 | #mouse (using best model from 10 trials on each of 10 folds of data) 40 | python deep_explain_cv.py cv_mouse/71trainepoch.07-0.3200.h5 pM10Kb_Mouse_cv/ 1 41 | python deep_explain_cv.py cv_mouse/02trainepoch.07-0.3186.h5 pM10Kb_Mouse_cv/ 2 42 | python deep_explain_cv.py cv_mouse/63trainepoch.06-0.3173.h5 pM10Kb_Mouse_cv/ 3 43 | python deep_explain_cv.py cv_mouse/64trainepoch.09-0.3194.h5 pM10Kb_Mouse_cv/ 4 44 | python deep_explain_cv.py cv_mouse/45trainepoch.13-0.3113.h5 pM10Kb_Mouse_cv/ 5 45 | python deep_explain_cv.py cv_mouse/96trainepoch.07-0.3134.h5 pM10Kb_Mouse_cv/ 6 46 | python deep_explain_cv.py cv_mouse/77trainepoch.07-0.3223.h5 pM10Kb_Mouse_cv/ 7 47 | python deep_explain_cv.py cv_mouse/88trainepoch.06-0.3243.h5 pM10Kb_Mouse_cv/ 8 48 | python deep_explain_cv.py cv_mouse/79trainepoch.07-0.3171.h5 pM10Kb_Mouse_cv/ 9 49 | python deep_explain_cv.py cv_mouse/610trainepoch.10-0.3200.h5 pM10Kb_Mouse_cv/ 10 50 | 51 | #FigS6C-D 52 | Rscript FigS6.R pM10Kb_Mouse_cv/ mouse 53 | 54 | for x in {1..10}; do { Rscript best_positions.R pM10Kb_Mouse_cv/gradinput.$x.txt.gz $x mouse; } done 55 | 56 | cd motif_analysis/ 57 | #generate permuted set 58 | for x in {1..10}; do { cut -f 1 negbestpos1mer$x\mouse.txt | shuf | paste - <(cut -f 2- bestpos1mer$x\mouse.txt) >negbestpos1mer$x\mouse.txt; } done 59 | for x in {1..10}; do { ./extract_kmers.pl mouse bestpos1mer$x\mouse.fa; } done 60 | for x in {1..10}; do { ./extract_kmers.pl mouse negbestpos1mer$x\mouse.fa; } done 61 | cat bestpos1mer*mouse.fa >bestpos1mer_mouse_all.fa 62 | cat negbestpos1mer*mouse.fa >negbestpos1mer_mouse_all.fa 63 | fasta-subsample bestpos1mer_mouse_all.fa 400000 
>subsampled_bestpos1mer_mouse_all_400K.fa 64 | fasta-subsample negbestpos1mer_mouse_all.fa 400000 >subsampled_negbestpos1mer_mouse_all_400K.fa 65 | #Fig6B mouse 66 | dreme -oc mouse_all_1mer_400K/ -p subsampled_bestpos1mer_mouse_all_400K.fa -n subsampled_negbestpos1mer_mouse_all_400K.fa -mink 2 -maxk 10 67 | cd .. 68 | 69 | # for mouse (FigS8) 70 | Rscript Fig6C_S7_S8.R Mouse_FantomAnnotations.InputData.pM10Kb.txt.gz FigS8.pdf 71 | -------------------------------------------------------------------------------- /FigS1/57epigenomes.RPKM.pc.gz: -------------------------------------------------------------------------------- 1 | ../datasets/57epigenomes.RPKM.pc.gz -------------------------------------------------------------------------------- /FigS1/EG.name.txt: -------------------------------------------------------------------------------- 1 | ../datasets/EG.name.txt -------------------------------------------------------------------------------- /FigS1/FigS1.R: -------------------------------------------------------------------------------- 1 | library(gplots) 2 | 3 | x=read.delim(gzfile("57epigenomes.RPKM.pc.gz"), row.names=1) 4 | x$E000=NULL 5 | names=read.delim("EG.name.txt",F) 6 | colnames(x)=paste(colnames(x), gsub("_", " ", as.character(unlist(sapply(colnames(x), function(x) names[names$V1==x, "V2"]))))) 7 | 8 | y=as.matrix(cor(x, method='spearman')) 9 | colnames(y)=colnames(x) 10 | 11 | pdf("FigS1.pdf", height=10, width=10) 12 | par(oma=c(16,1,1,14)) 13 | heatmap.2(y, trace="none", breaks=seq(0,1,0.05), #,density.info="none" 14 | symkey=FALSE, cexRow=0.6, cexCol=0.6, dendrogram="row", key=TRUE, col=matlab::jet.colors(20), denscol="black") 15 | dev.off() 16 | -------------------------------------------------------------------------------- /FigS1/FigS1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/FigS1/FigS1.pdf 
-------------------------------------------------------------------------------- /FigS1/runme.sh: -------------------------------------------------------------------------------- 1 | Rscript FigS1.R -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright 2018 Vikram Agarwal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Xpresso: Predicting gene expression levels from genomic sequence 4 | 5 | This repository is intended to accompany our publication, primarily to enhance the reproducibility of our results. For more information please refer to: 6 | 7 | Agarwal V, Shendure J. 
[Predicting mRNA abundance directly from genomic sequence using deep convolutional neural networks](https://www.cell.com/cell-reports/pdf/S2211-1247(20)30616-1.pdf). 2020. **_Cell Reports_** 31 (7), 107663. [Youtube talk introducing the paper](https://youtu.be/xSdIJc-grXQ). 8 | 9 | 10 | These tools can be used in a variety of organisms and cell types of interest to: 11 | 12 | * Perform hyperparameter optimization in the gene expression prediction task (as shown in **Fig 1**) 13 | * Perform evolutionary analyses on human and mouse organisms, as well as one-to-one orthologs of each (as shown in **Fig 2**) 14 | * Uncover modes of gene regulation in a cell type of interest that are operating at the transcriptional and post-transcriptional levels (as shown in **Fig 3**) 15 | * Evaluate model performance for cell type-specifc and cell type-agnostic models (as shown in **Fig 4**) 16 | * Predict transcriptional activity across a genomic locus (as shown in **Fig 5**) 17 | * Interpret deep learning models to learn about promoter properties (as shown in **Fig 6**) 18 | 19 | If you find our code or predictions to be helpful for your work, please cite the paper above. 
20 | 21 | 22 | # Dependencies for running entire pipeline: 23 | * Python3 modules: numpy, h5py, pandas, sklearn, keras (>=2.2.4-tf), hyperopt, biopython 24 | 25 | * R libraries: LSD, data.table, latticeExtra, Biostrings, rhdf5, ROCR, gplots, mixtools, reshape2, beeswarm, RColorBrewer, zoo, GenomicRanges 26 | 27 | * [TensorFlow (>=1.15.0)](https://www.tensorflow.org/install/) 28 | 29 | * [DeepExplain](https://github.com/marcoancona/DeepExplain) 30 | 31 | * [The MEME Suite](http://meme-suite.org/doc/download.html?man_type=web) 32 | 33 | * [UCSC tools](http://hgdownload.soe.ucsc.edu/downloads.html#source_downloads) installation, including bigBedToBed 34 | 35 | * [BEDTools](https://github.com/arq5x/bedtools2/releases) 36 | 37 | # Instructions for use 38 | 39 | For R code to work properly, please copy the contents of .Rprofile in this folder to your local .Rprofile. 40 | 41 | Users are advised to read the code closely and modify commented pieces as appropriate to acquire 42 | desired output for your environment. For example, you will need to download all of the additional 43 | R library and Python module dependencies for the code to work. This being said, if you find crucial 44 | files are missing, making the code unusable, or if you identify a major problem in the code, please 45 | raise a Github issue. 46 | 47 | In each Figure's folder, change directories to it and please read the file "runme.sh" first as it provides a general overview of relevant commands that were used sequentially to pre-process the data and generate the figures. 48 | 49 | **OPTIONAL**: For full functionality and to fix symbolic links, run the following command in the base Xpresso directory to download the associated datapack: 50 | 51 | `wget -r -np -nH --reject "index.html*" --cut-dirs 5 https://krishna.gs.washington.edu/content/members/vagar/Xpresso/data/datasets/` 52 | 53 | The figures will link to this folder accordingly. 
Some of the files need to be decompressed, and not all files are provided due to minimize the package size (currently ~11Gb). If you need additional files not provided for the purpose of reproduction, please contact Vikram Agarwal (vagar {at} calicolabs {dot} com). 54 | 55 | # Colab and Xpresso website 56 | 57 | Simpler tools for a broad overview and to deploy pre-trained models can be accessed at the Xpresso [website](https://xpresso.gs.washington.edu/). You can also start training models and generating predictions quickly using the iPython Notebook, or open it in Google Colab to get up to use a cloud GPU with this link: 58 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/vagarwal87/bdd33e66fa2c59c41409ca47e7132e61/xpresso.ipynb) 59 | 60 | **Note: The Colab generates predictions on a FASTA file of arbitrary DNA sequences without considering mRNA half-life features. To consider half-life features, one must prepare the full test file as shown 61 | in the datapack and Fig1_S2/** 62 | -------------------------------------------------------------------------------- /allfxns.pm: -------------------------------------------------------------------------------- 1 | package allfxns; 2 | 3 | use Getopt::Long; 4 | use POSIX qw/ceil floor/; 5 | use List::Util qw/min max/; 6 | #use Math::CDF qw/qnorm/; 7 | #use List::MoreUtils qw/uniq/; 8 | use Env; Env::import(); 9 | 10 | @ISA = qw(Exporter); 11 | @EXPORT = qw(ceil floor qnorm uniq min max 12 | bsub qsub parallelize 13 | fisher_yates_shuffle histogram log2 log10 mean median medianabsdev quantile round stdev sum trimmed_mean trimmed_stdev vecsum zscore 14 | alifold com comRNA fold gen_all_nmers readFasta rev revCom revComRNA gcContent cpgContent 15 | intersect unique array_diff array_minus); 16 | 17 | #JOB SUBMISSION SUBROUTINES 18 | 19 | sub bsub{ 20 | ($options, $error, $output, $job) = @_; 21 | system "bsub $options -e $error -o $output <$job"; 22 | unlink 
$job; 23 | } 24 | 25 | sub qsub{ 26 | ($options, $error, $output, $job) = @_; 27 | system "qsub $options -S /bin/bash -e $error -o $output $job"; 28 | unlink $job; 29 | } 30 | 31 | sub parallelize{ 32 | %o = %{$_[0]}; 33 | ($cmd, $indir, $uniq, $instr, $outstr, $count, $jobnum) = ($o{"cmd"}, $o{"indir"}, $o{"uniq"}, $o{"instr"}, $o{"outstr"}, 0, 1000000); 34 | $outdir = $o{"outdir"} || "$TMP"; 35 | $skipuniq = $o{"skipuniq"} || ""; 36 | $exten = $o{"exten"} || "out"; 37 | $qsub = $o{"qsub"} || 0; 38 | $bsub = $o{"bsub"} || 0; 39 | $onefile = $o{"onefile"} || 0; 40 | $skipsame = $o{"skipsame"} || 0; 41 | $redir = ($onefile) ? ">>" : ">"; 42 | @files = <$indir/*.$uniq>; 43 | system "mkdir $outdir" if !(-d $outdir); 44 | 45 | if ($qsub || $bsub) { 46 | $cmdperjob = $o{"cmdperjob"} || 1; 47 | $subopts = $o{"subopts"} || "-q idle"; 48 | $suberr = $o{"suberr"} || "$TMP/sub.err"; 49 | $subout = $o{"subout"} || "$TMP/sub.out"; 50 | $jobfolder = $o{"jobfolder"} || "$TMP"; 51 | } 52 | 53 | foreach $file (@files){ 54 | ++$count; 55 | $code = (split /\.$uniq/, (split /\//, $file)[-1])[0]; 56 | # $fileexists = `grep -P '$code\\t' /lab/bartel3_ata/agarwal/metazoans/human.utrs/three_prime_UTR/bins.txt`; next if $fileexists; 57 | # $region = (split /\//, $file)[-2]; 58 | $code = "concat$jobnum" if $onefile; 59 | next if (($skipsame && -s "$outdir/$code.$exten" != 0) || ($skipuniq ne "" && $file =~ /$skipuniq/)); # -s if file has zero size 60 | if (!$qsub && !$bsub) { 61 | #print "$cmd $instr $file $outstr $outdir/$code.$exten\n"; 62 | #system "$cmd $instr $file $outstr $outdir/$code.$exten"; 63 | #$numfile = `grep '>' $file | wc -l`; 64 | $outfile = $code; 65 | $numfile1 = (-e "$outdir/$outfile.$exten") ? 
`wc -l $outdir/$outfile.aln` : 1; 66 | print "$cmd $outdir/$outfile.aln\n" if $numfile1 == 0; 67 | } 68 | else { 69 | if ($cmd =~ /bin_MSA/ && $cmd !~ /all/){ 70 | # ($bin) = (split /\s/, `grep -m1 -P '^$code\t' $DIR/targetpred/robin/3UTRs_nonredundant_18577genes.UTR_cons.10bins`)[-1]; die "getbin" if $bin != int($bin); # $bin++; --> do this if robin's 71 | ($tmp, $species, $region, $kmerlen) = (split /\s/, $cmd); 72 | ($bin) = (split /\s/, `grep -m1 -P '^$code\t' $DIR/metazoans/$species/$region/allgenes.bins`)[-1] if $kmerlen == 2 || $kmerlen == 8; #.23way 73 | # print "$code, $species, $region, $kmerlen, $bin\n"; 74 | } 75 | ## print "$cmd $bin $instr $file 2>&- $outstr $redir $outdir/$code.$exten\n"; 76 | $jobfile = "$jobfolder/job$jobnum.sh"; 77 | open SH, ">>$jobfile" or die "can't open $jobfile"; 78 | print SH "$cmd $bin $instr $file 2>&- $outstr $redir $outdir/$code.$exten\n"; 79 | close SH; 80 | if ($count % $cmdperjob == 0) { 81 | qsub($subopts, $suberr, $subout, $jobfile) if $qsub; 82 | bsub($subopts, $suberr, $subout, $jobfile) if $bsub; 83 | $jobnum++; 84 | } 85 | } 86 | } 87 | qsub($subopts, $suberr, $subout, $jobfile) if $qsub; 88 | bsub($subopts, $suberr, $subout, $jobfile) if $bsub; 89 | print STDERR "COMPLETE!\n"; 90 | } 91 | 92 | #STATISTICS SUBROUTINES 93 | 94 | sub fisher_yates_shuffle { 95 | local $x = shift; 96 | for ($i = @$x; --$i; ) { 97 | $j = int rand ($i+1); 98 | next if $i == $j; 99 | @$x[$i,$j] = @$x[$j,$i]; 100 | } 101 | } 102 | 103 | sub histogram { 104 | $bin_width = 10; 105 | if ($#_ == 1){ 106 | ($hash, $bin_width) = @_; 107 | } 108 | else { $hash = shift; } 109 | $max, $min; 110 | %a = %$hash; 111 | %histogram; 112 | foreach (keys %a){ 113 | $histogram{ceil(($_ + 1) / $bin_width) -1} += $a{$_}; 114 | } 115 | 116 | while ( ($key, $value) = each(%histogram) ) { 117 | $max = $key if !defined($min) || $key > $max; 118 | $min = $key if !defined($min) || $key < $min; 119 | } 120 | 121 | for ($i = $min; $i <= $max; $i++) { 122 | $bin = 
sprintf("% 10d", ($i) * $bin_width); 123 | $frequency = $histogram{$i} || 0; 124 | 125 | $frequency = "#" x $frequency; 126 | print $bin." ".$frequency."\n" if $frequency ne ""; 127 | } 128 | 129 | print "===============================\n\n"; 130 | print " Width: ".$bin_width."\n"; 131 | print " Range: ".$min."-".$max."\n\n"; 132 | } 133 | 134 | sub log2 { return log($_[0])/log(2); } 135 | 136 | sub log10 { return log($_[0])/log(10); } 137 | 138 | sub mean { return sum($_[0])/scalar(@{$_[0]}); } 139 | 140 | sub median{ return quantile($_[0], 2); } 141 | 142 | sub medianabsdev{ 143 | local $med = median($_[0]); local @b; 144 | push(@b, abs($_ - $med)) for (@{$_[0]}); 145 | return median(\@b); 146 | } 147 | 148 | sub quantile{ 149 | local $rpole = shift; 150 | local $x = shift; 151 | @pole = @$rpole; 152 | $ret; 153 | @pole = sort {$a <=> $b} @pole; 154 | if( ($#pole % $x) == 0 ) { 155 | $ret = $pole[int($#pole/$x)]; 156 | } else { 157 | $ret = ($pole[int($#pole/$x)] + $pole[int($#pole/$x)+1]) / 2; 158 | } 159 | return $ret; 160 | } 161 | 162 | sub round{ return int($_[0] + 0.5 * ($_[0] <=> 0)); } 163 | 164 | sub stdev{ 165 | return 0 unless @_ > 1; 166 | local $mean = mean(\@_); 167 | local $tot = 0; 168 | foreach (@_){ $tot += ($_ - $mean)**2; } 169 | return sqrt( $tot / $#_ ); 170 | } 171 | 172 | sub trimmed_mean { 173 | local $a = shift; local $perc = shift; 174 | $perc /= 200; 175 | @a = sort {$a <=> $b} @$a; $num = scalar(@a); 176 | @a = @a[int($num*$perc)..int($num*(1-$perc))]; 177 | return mean(\@a); 178 | } 179 | 180 | sub trimmed_stdev { 181 | local $a = shift; local $perc = shift; 182 | $perc /= 200; 183 | @a = sort {$a <=> $b} @$a; $num = scalar(@a); 184 | @a = @a[int($num*$perc)..int($num*(1-$perc))]; 185 | return stdev(@a); 186 | } 187 | 188 | sub sum{ 189 | local $sum; 190 | $sum += $_ for(@{$_[0]}); 191 | return $sum; 192 | } 193 | 194 | sub vecsum{ #sum two vectors passed as array refs 195 | $len = max(scalar(@{$_[0]}), scalar(@{$_[1]}))-1; 196 | for 
$i (0..$len){ ${$_[0]}[$i] += ${$_[1]}[$i]; } 197 | } 198 | 199 | sub zscore{ 200 | local $val = shift; 201 | return ( $val - mean(\@_) ) / stdev(@_); 202 | } 203 | 204 | 205 | #NUCLEIC ACID SEQUENCE SUBROUTINES 206 | 207 | sub alifold{ 208 | local $file = shift; 209 | local $o = "-d0 -r -cv 0.6 -nc 0.5"; 210 | $score = `RNAalifold $o $file | tail -1`; 211 | ($mfe) = ($score =~ /.*\(\s*(-\d+.\d+) = .*\).*/); 212 | return $mfe; 213 | } 214 | 215 | sub com{ 216 | local $seq = shift; 217 | $seq =~ tr/tucgaTUCGA/aagctAAGCT/; 218 | return $seq; 219 | } 220 | 221 | sub comRNA{ 222 | local $seq = shift; 223 | $seq =~ tr/tucgaTUCGA/aagcuAAGCU/; 224 | return $seq; 225 | } 226 | 227 | sub fold{ 228 | local $seq = shift; 229 | local $o = shift; 230 | local $score = `echo $seq | RNAfold $o | tail -1`; 231 | ($mfe) = ($score =~ /.*\((\s?-\d+.\d+)\).*/); 232 | return $mfe; 233 | } 234 | 235 | sub gen_all_nmers{ 236 | local $nmer_size = shift; 237 | local @words = ''; 238 | foreach (1..$nmer_size) { 239 | @new_words = (); 240 | foreach $word (@words){ 241 | foreach $i ( qw/A C T G/ ){ push (@new_words, $word.$i); } 242 | } 243 | @words = @new_words; 244 | } 245 | return @words; 246 | } 247 | 248 | sub readFasta{ 249 | local $fasta = shift; 250 | local %fasta = (); 251 | open DNA, "<$fasta" || die "Could not open fasta file for $fasta\n"; 252 | while ($line = ){ chomp $line; 253 | if ($line =~ /^>\s?(\w+\.?\d*)/){ $header = $1; } 254 | # if ($line =~ /^>(.*)/){ $header = $1; } 255 | else { $fasta{$header} .= $line; } 256 | } 257 | close DNA; 258 | return \%fasta; 259 | } 260 | 261 | sub rev{ return scalar reverse $_[0]; } 262 | 263 | sub revCom{ return rev(com($_[0])); } 264 | 265 | sub revComRNA{ return rev(comRNA($_[0])); } 266 | 267 | sub gcContent{ return 0 if (() = ($_[0] =~ /[AUTCG]/ig)) == 0; return sprintf("%.3f", (() = ($_[0] =~ /[CG]/ig))/(() = ($_[0] =~ /[AUTCG]/ig))); } # %G + C in seq, ignoring case, missing nucleotides, or gaps 268 | 269 | sub cpgContent{ return 0 if 
(() = ($_[0] =~ /[AUTCG]/ig)) == 0; return sprintf("%.3f", (() = ($_[0] =~ /CG/ig))/(length($_[0])-1)); } 270 | 271 | #ARRAY SUBROUTINES 272 | 273 | sub unique(@) { 274 | return keys %{ {map { $_ => undef } @_}}; 275 | } 276 | 277 | sub intersect(\@\@) { 278 | my %e = map { $_ => undef } @{$_[0]}; 279 | return grep { exists( $e{$_} ) } @{$_[1]}; 280 | } 281 | 282 | sub array_diff(\@\@) { 283 | my %e = map { $_ => undef } @{$_[1]}; 284 | return @{[ ( grep { (exists $e{$_}) ? ( delete $e{$_} ) : ( 1 ) } @{ $_[0] } ), keys %e ] }; 285 | } 286 | 287 | sub array_minus(\@\@) { 288 | my %e = map{ $_ => undef } @{$_[1]}; 289 | return grep( ! exists( $e{$_} ), @{$_[0]} ); 290 | } 291 | -------------------------------------------------------------------------------- /xpresso_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vagarwal87/Xpresso/b5d1da2b7f7e9376e8b6eca2b6cc73cd361734a3/xpresso_logo.png --------------------------------------------------------------------------------