├── DeepSTARR ├── DeepSTARR_nucl_contr_scores.py ├── DeepSTARR_pred_new_sequence.py ├── DeepSTARR_training.ipynb ├── Neural_Network_DNA_Demo │ ├── LICENSE │ ├── README.md │ ├── data │ │ ├── ctcf.fa.gz │ │ └── pu1.fa.gz │ ├── helper │ │ ├── IOHelper.py │ │ ├── SequenceHelper.py │ │ └── __init__.py │ ├── motif_plotter │ │ ├── __init__.py │ │ └── information_helper.py │ └── nn_for_sequence_data.ipynb ├── README.md └── Sequences_example.fa ├── Figures ├── Fig1.R ├── Fig2.R ├── Fig3.R ├── Fig4.R ├── Fig5.R ├── Fig6.R ├── Fig7.R └── README.md ├── GenomeWide_UMISTARRseq ├── README.md ├── STARRseq_UMI_collapsing.R ├── UMISTARRseq_pipeline.sh ├── bigBedToBigWig.sh ├── bowtie_pe.sh ├── bsub_gridengine ├── call_peaks.sh └── slippage_filter_pe.sh ├── LICENSE ├── Oligo_UMISTARRseq ├── Drosophila_oligo_library_processing.Rmd ├── Human_oligo_library_processing.Rmd ├── README.md ├── bowtie_pe_NoRevCompMapping.sh ├── bsub_gridengine ├── oligo_UMISTARRseq_pipeline.sh └── slippage_filter_pe.sh ├── README.md └── img ├── DeepSTARR.png ├── DeepSTARR_predictions.png ├── gw_UMISTARRseq.png ├── gw_UMISTARRseq_UMISTARRseq.pdf └── oligo_UMISTARRseq_enh_mutants.png /DeepSTARR/DeepSTARR_nucl_contr_scores.py: -------------------------------------------------------------------------------- 1 | 2 | ### Load arguments 3 | 4 | import sys, getopt 5 | 6 | ### other parameters 7 | # number of dinucleotide shuffled sequences per sequence as background 8 | dinuc_shuffle_n=100 9 | 10 | def main(argv): 11 | model_ID = ''; sequence_set = ''; class_output = '' # initialise all three so missing arguments exit cleanly below 12 | try: 13 | opts, args = getopt.getopt(argv,"hm:s:c:",["model=", "sequence_set=", "class_output="]) 14 | except getopt.GetoptError: 15 | print('DeepSTARR_nucl_contr_scores.py -m <CNN model file> -s <FASTA file> -c <dev/hk>') 16 | sys.exit(2) 17 | for opt, arg in opts: 18 | if opt == '-h': 19 | print('DeepSTARR_nucl_contr_scores.py -m <CNN model file> -s <FASTA file> -c <dev/hk>') 20 | sys.exit() 21 | elif opt in ("-m", "--model"): 22 | model_ID = arg 23 | elif opt in ("-s", "--sequence_set"): # long option name must match "sequence_set=" above 24 | sequence_set = arg 25 | elif opt in ("-c", "--class_output"): 26 | class_output = arg 27 | if model_ID=='': sys.exit("CNN model file not found") 28 | if sequence_set=='': sys.exit("fasta seq file not found") 29 | if class_output=='': sys.exit("enhancer class (dev/hk) not found") 30 | print('CNN model file is ', model_ID) 31 | print('Input FASTA file is ', sequence_set) 32 | print('Enhancer class is ', class_output) 33 | return model_ID, sequence_set, class_output 34 | 35 | if __name__ == "__main__": 36 | model_ID, sequence_set, class_output = main(sys.argv[1:]) 37 | 38 | 39 | ### Load libraries 40 | import pandas as pd 41 | 42 | import sys 43 | sys.path.append('Neural_Network_DNA_Demo/') 44 | from helper import IOHelper, SequenceHelper # from https://github.com/bernardo-de-almeida/Neural_Network_DNA_Demo.git 45 | 46 | 47 | ### load fasta sequences functions 48 | def one_hot_encode_along_channel_axis(sequence): 49 | to_return = np.zeros((len(sequence),4), dtype=np.int8) 50 | seq_to_one_hot_fill_in_array(zeros_array=to_return, 51 | sequence=sequence, one_hot_axis=1) 52 | return to_return 53 | 54 | def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis): 55 | assert one_hot_axis==0 or one_hot_axis==1 56 | if (one_hot_axis==0): 57 | assert zeros_array.shape[1] == len(sequence) 58 | elif (one_hot_axis==1): 59 | assert zeros_array.shape[0] == len(sequence) 60 | # will mutate zeros_array 61 | for (i,char) in enumerate(sequence): 62 | if (char=="A" or char=="a"): 63 | char_idx = 0 64 | elif (char=="C" or char=="c"): 65 | char_idx = 1 66 | elif (char=="G" or char=="g"): 67 | char_idx = 2 68 | 
elif (char=="T" or char=="t"): 69 | char_idx = 3 70 | elif (char=="N" or char=="n"): 71 | continue #leave that pos as all 0's 72 | else: 73 | raise RuntimeError("Unsupported character: "+str(char)) 74 | if (one_hot_axis==0): 75 | zeros_array[char_idx,i] = 1 76 | elif (one_hot_axis==1): 77 | zeros_array[i,char_idx] = 1 78 | 79 | 80 | def prepare_input(file_seq): 81 | # Convert sequences to one-hot encoding matrix 82 | input_fasta_data_A = IOHelper.get_fastas_from_file(file_seq, uppercase=True) 83 | 84 | # get length of first sequence 85 | sequence_length = len(input_fasta_data_A.sequence.iloc[0]) 86 | 87 | # Convert sequence to one hot encoding matrix 88 | seq_matrix_A = SequenceHelper.do_one_hot_encoding(input_fasta_data_A.sequence, sequence_length, 89 | SequenceHelper.parse_alpha_to_seq) 90 | print(seq_matrix_A.shape) 91 | 92 | X = np.nan_to_num(seq_matrix_A) # Replace NaN with zero and infinity with large finite numbers 93 | X_reshaped = X.reshape((X.shape[0], X.shape[1], X.shape[2])) 94 | 95 | print(file_seq) 96 | 97 | return X_reshaped 98 | 99 | ### load model functions 100 | def load_model(model): 101 | import deeplift 102 | from keras.models import model_from_json 103 | keras_model_weights = model + '.h5' 104 | keras_model_json = model + '.json' 105 | keras_model = model_from_json(open(keras_model_json).read()) 106 | keras_model.load_weights(keras_model_weights) 107 | return keras_model, keras_model_weights, keras_model_json 108 | 109 | from deeplift.dinuc_shuffle import dinuc_shuffle 110 | import numpy as np 111 | 112 | ### deepExplainer functions 113 | def dinuc_shuffle_several_times(list_containing_input_modes_for_an_example, 114 | seed=1234): 115 | assert len(list_containing_input_modes_for_an_example)==1 116 | onehot_seq = list_containing_input_modes_for_an_example[0] 117 | rng = np.random.RandomState(seed) 118 | to_return = np.array([dinuc_shuffle(onehot_seq, rng=rng) for i in range(dinuc_shuffle_n)]) 119 | return [to_return] #wrap in list for compatibility with multiple modes 120 | 121 | # get hypothetical scores also 122 | def combine_mult_and_diffref(mult, orig_inp, bg_data): 123 | assert len(orig_inp)==1 124 | projected_hypothetical_contribs = np.zeros_like(bg_data[0]).astype("float") 125 | assert len(orig_inp[0].shape)==2 126 | #At each position in the input sequence, we iterate over the one-hot encoding 127 | # possibilities (eg: for genomic sequence, this is ACGT i.e. 128 | # 1000, 0100, 0010 and 0001) and compute the hypothetical 129 | # difference-from-reference in each case. We then multiply the hypothetical 130 | # differences-from-reference with the multipliers to get the hypothetical contributions. 131 | #For each of the one-hot encoding possibilities, 132 | # the hypothetical contributions are then summed across the ACGT axis to estimate 133 | # the total hypothetical contribution of each position. This per-position hypothetical 134 | # contribution is then assigned ("projected") onto whichever base was present in the 135 | # hypothetical sequence. 136 | #The reason this is a fast estimate of what the importance scores *would* look 137 | # like if different bases were present in the underlying sequence is that 138 | # the multipliers are computed once using the original sequence, and are not 139 | # computed again for each hypothetical sequence. 
140 | for i in range(orig_inp[0].shape[-1]): 141 | hypothetical_input = np.zeros_like(orig_inp[0]).astype("float") 142 | hypothetical_input[:,i] = 1.0 143 | hypothetical_difference_from_reference = (hypothetical_input[None,:,:]-bg_data[0]) 144 | hypothetical_contribs = hypothetical_difference_from_reference*mult[0] 145 | projected_hypothetical_contribs[:,:,i] = np.sum(hypothetical_contribs,axis=-1) 146 | return [np.mean(projected_hypothetical_contribs,axis=0)] 147 | 148 | 149 | def my_deepExplainer(model, one_hot, class_output): 150 | import shap # forked from https://github.com/AvantiShri/shap/blob/master/shap/explainers/deep/deep_tf.py 151 | import numpy as np 152 | 153 | # output layer: the last two layers are the dev and hk output heads 154 | if class_output=="dev": 155 | out_layer=-2 156 | if class_output=="hk": 157 | out_layer=-1 158 | 159 | explainer = shap.DeepExplainer((model.layers[0].input, model.layers[out_layer].output), 160 | data=dinuc_shuffle_several_times, 161 | combine_mult_and_diffref=combine_mult_and_diffref) 162 | 163 | # running on all sequences 164 | shap_values_hypothetical = explainer.shap_values(one_hot) 165 | 166 | # normalising contribution scores 167 | # sum the deeplift importance scores across the ACGT axis (different nucleotides at the same position) 168 | # and “project” that summed importance onto whichever base is actually present at that position 169 | shap_values_contribution=shap_values_hypothetical[0]*one_hot 170 | 171 | return shap_values_hypothetical[0], shap_values_contribution 172 | 173 | 174 | ### Loading sequences and calculating nucleotide contribution scores 175 | 176 | print("\nLoading sequences and model ...\n") 177 | 178 | X_all = prepare_input(sequence_set) 179 | keras_model, keras_model_weights, keras_model_json = load_model(model_ID) 180 | 181 | print("\nRunning DeepExplain ...\n") 182 | 183 | scores=my_deepExplainer(keras_model, X_all, class_output=class_output) 184 | 185 | print("\nSaving ...\n") 186 | 187 | import h5py 188 | import os 189 | model_ID_out=os.path.basename(model_ID) 190 | 191 | if (os.path.isfile(sequence_set+"_"+model_ID_out+"_"+class_output+"_contribution_scores.h5")): 192 | os.remove(str(sequence_set+"_"+model_ID_out+"_"+class_output+"_contribution_scores.h5")) 193 | f = h5py.File(sequence_set+"_"+model_ID_out+"_"+class_output+"_contribution_scores.h5", "w") # open explicitly in write mode 194 | 195 | g = f.create_group("contrib_scores") 196 | # save the actual contribution scores 197 | g.create_dataset(class_output, data=scores[1]) 198 | print("Done contr scores for " + class_output) 199 | 200 | g = f.create_group("hyp_contrib_scores") 201 | # save the hypothetical contribution scores 202 | g.create_dataset(class_output, data=scores[0]) 203 | print("Done hyp scores for " + class_output) 204 | 205 | f.close() 206 | -------------------------------------------------------------------------------- /DeepSTARR/DeepSTARR_pred_new_sequence.py: -------------------------------------------------------------------------------- 1 | 2 | ### Load arguments 3 | 4 | import sys, getopt 5 | 6 | def main(argv): 7 | new_seq = '' 8 | model_ID = '' 9 | try: 10 | opts, args = getopt.getopt(argv,"hs:m:",["seq=","model="]) 11 | except getopt.GetoptError: 12 | print('DeepSTARR_pred_new_sequence.py -s <FASTA file> -m <CNN model file>') 13 | sys.exit(2) 14 | for opt, arg in opts: 15 | if opt == '-h': 16 | print('DeepSTARR_pred_new_sequence.py -s <FASTA file> -m <CNN model file>') 17 | sys.exit() 18 | elif opt in ("-s", "--seq"): 19 | new_seq = arg 20 | elif opt in ("-m", "--model"): 21 | model_ID = arg 22 | if new_seq=='': sys.exit("fasta seq file not found") 23 | if model_ID=='': 
sys.exit("CNN model file not found") 24 | print('Input FASTA file is ', new_seq) 25 | print('Model file is ', model_ID) 26 | return new_seq, model_ID 27 | 28 | if __name__ == "__main__": 29 | new_seq, model_ID = main(sys.argv[1:]) 30 | 31 | 32 | ### Load libraries 33 | 34 | from keras.layers.convolutional import Conv1D, MaxPooling1D 35 | from keras.layers.core import Dropout, Reshape, Dense, Activation, Flatten 36 | from keras.layers import BatchNormalization, InputLayer, Input 37 | from keras.models import Sequential 38 | from keras.optimizers import Adam 39 | from keras.callbacks import EarlyStopping, History 40 | 41 | import pandas as pd 42 | import numpy as np 43 | 44 | import sys 45 | sys.path.append('Neural_Network_DNA_Demo/') 46 | from helper import IOHelper, SequenceHelper # from https://github.com/bernardo-de-almeida/Neural_Network_DNA_Demo.git 47 | 48 | ### Load sequences 49 | print("\nLoading sequences ...\n") 50 | input_fasta = IOHelper.get_fastas_from_file(new_seq, uppercase=True) 51 | print(input_fasta.shape) 52 | 53 | # length of first sequence 54 | sequence_length = len(input_fasta.sequence.iloc[0]) 55 | 56 | # Convert sequence to one hot encoding matrix 57 | seq_matrix = SequenceHelper.do_one_hot_encoding(input_fasta.sequence, sequence_length, 58 | SequenceHelper.parse_alpha_to_seq) 59 | 60 | ### load model 61 | def load_model(model_path): 62 | import deeplift 63 | from keras.models import model_from_json 64 | keras_model_weights = model_path + '.h5' 65 | keras_model_json = model_path + '.json' 66 | keras_model = model_from_json(open(keras_model_json).read()) 67 | keras_model.load_weights(keras_model_weights) 68 | #keras_model.summary() 69 | return keras_model, keras_model_weights, keras_model_json 70 | 71 | keras_model, keras_model_weights, keras_model_json = load_model(model_ID) 72 | 73 | ### predict dev and hk activity 74 | print("\nPredicting ...\n") 75 | pred=keras_model.predict(seq_matrix) 76 | out_prediction = input_fasta 77 | out_prediction['Predictions_dev'] = pred[0] 78 | out_prediction['Predictions_hk'] = pred[1] 79 | 80 | ### save file 81 | print("\nSaving file ...\n") 82 | import os.path 83 | model_ID_out=os.path.basename(model_ID) 84 | out_prediction.to_csv(new_seq + "_predictions_" + model_ID_out + ".txt", sep="\t", index=False) 85 | -------------------------------------------------------------------------------- /DeepSTARR/DeepSTARR_training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Notebook to train DeepSTARR" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#### Used packages and their version" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "#### GPU environment \n", 24 | "\n", 25 | "# conda create --name DeepSTARR python=3.7 tensorflow-gpu=1.14.0 keras-gpu=2.2.4\n", 26 | "# conda activate DeepSTARR\n", 27 | "# conda install numpy=1.16.2 pandas=0.25.3 matplotlib=3.1.1 ipykernel=5.4.3\n", 28 | "# pip install git+git://github.com/AvantiShri/shap.git@master\n", 29 | "# pip install 'h5py<3.0.0'\n", 30 | "# pip install deeplift==0.6.13.0\n", 31 | "# pip install keras-tuner==1.0.1\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import tensorflow as tf\n", 41 | "\n", 42 | "import 
keras\n", 43 | "import keras.layers as kl\n", 44 | "from keras.layers.convolutional import Conv1D, MaxPooling1D\n", 45 | "from keras.layers.core import Dropout, Reshape, Dense, Activation, Flatten\n", 46 | "from keras.layers import BatchNormalization, InputLayer, Input\n", 47 | "from keras import models\n", 48 | "from keras.models import Sequential, Model\n", 49 | "from keras.optimizers import Adam\n", 50 | "from keras.callbacks import EarlyStopping, History, ModelCheckpoint\n", 51 | " \n", 52 | "import pandas as pd\n", 53 | "import numpy as np\n", 54 | "\n", 55 | "import sys\n", 56 | "sys.path.append('Neural_Network_DNA_Demo/')\n", 57 | "from helper import IOHelper, SequenceHelper # from https://github.com/const-ae/Neural_Network_DNA_Demo\n", 58 | "\n", 59 | "import random\n", 60 | "random.seed(1234)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Download data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# FASTA files with DNA sequences of genomic regions from train/val/test sets\n", 77 | "!wget 'https://data.starklab.org/almeida/DeepSTARR/Data/Sequences_Train.fa'\n", 78 | "!wget 'https://data.starklab.org/almeida/DeepSTARR/Data/Sequences_Val.fa'\n", 79 | "!wget 'https://data.starklab.org/almeida/DeepSTARR/Data/Sequences_Test.fa'\n", 80 | "\n", 81 | "# Files with developmental and housekeeping activity of genomic regions from train/val/test sets\n", 82 | "!wget 'https://data.starklab.org/almeida/DeepSTARR/Data/Sequences_activity_Train.txt'\n", 83 | "!wget 'https://data.starklab.org/almeida/DeepSTARR/Data/Sequences_activity_Val.txt'\n", 84 | "!wget 'https://data.starklab.org/almeida/DeepSTARR/Data/Sequences_activity_Test.txt'" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Load data" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# function to load sequences and enhancer activity\n", 101 | "def prepare_input(set):\n", 102 | " # Convert sequences to one-hot encoding matrix\n", 103 | " file_seq = str(\"Sequences_\" + set + \".fa\")\n", 104 | " input_fasta_data_A = IOHelper.get_fastas_from_file(file_seq, uppercase=True)\n", 105 | "\n", 106 | " # get length of first sequence\n", 107 | " sequence_length = len(input_fasta_data_A.sequence.iloc[0])\n", 108 | "\n", 109 | " # Convert sequence to one hot encoding matrix\n", 110 | " seq_matrix_A = SequenceHelper.do_one_hot_encoding(input_fasta_data_A.sequence, sequence_length,\n", 111 | " SequenceHelper.parse_alpha_to_seq)\n", 112 | " print(seq_matrix_A.shape)\n", 113 | " \n", 114 | " X = np.nan_to_num(seq_matrix_A) # Replace NaN with zero and infinity with large finite numbers\n", 115 | " X_reshaped = X.reshape((X.shape[0], X.shape[1], X.shape[2]))\n", 116 | "\n", 117 | " Activity = pd.read_table(\"Sequences_activity_\" + set + \".txt\")\n", 118 | " Y_dev = Activity.Dev_log2_enrichment\n", 119 | " Y_hk = Activity.Hk_log2_enrichment\n", 120 | " Y = [Y_dev, Y_hk]\n", 121 | " \n", 122 | " print(set)\n", 123 | "\n", 124 | " return input_fasta_data_A.sequence, seq_matrix_A, X_reshaped, Y" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Data for train/val/test sets\n", 134 | "X_train_sequence, X_train_seq_matrix, X_train, Y_train = prepare_input(\"Train\")\n", 
135 | "X_valid_sequence, X_valid_seq_matrix, X_valid, Y_valid = prepare_input(\"Val\")\n", 136 | "X_test_sequence, X_test_seq_matrix, X_test, Y_test = prepare_input(\"Test\")" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Build DeepSTARR model" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "### Additional metrics\n", 153 | "from scipy.stats import spearmanr\n", 154 | "def Spearman(y_true, y_pred):\n", 155 | " return ( tf.py_function(spearmanr, [tf.cast(y_pred, tf.float32), \n", 156 | " tf.cast(y_true, tf.float32)], Tout = tf.float32) )\n" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "params = {'batch_size': 128,\n", 166 | " 'epochs': 100,\n", 167 | " 'early_stop': 10,\n", 168 | " 'kernel_size1': 7,\n", 169 | " 'kernel_size2': 3,\n", 170 | " 'kernel_size3': 5,\n", 171 | " 'kernel_size4': 3,\n", 172 | " 'lr': 0.002,\n", 173 | " 'num_filters': 256,\n", 174 | " 'num_filters2': 60,\n", 175 | " 'num_filters3': 60,\n", 176 | " 'num_filters4': 120,\n", 177 | " 'n_conv_layer': 4,\n", 178 | " 'n_add_layer': 2,\n", 179 | " 'dropout_prob': 0.4,\n", 180 | " 'dense_neurons1': 256,\n", 181 | " 'dense_neurons2': 256,\n", 182 | " 'pad':'same'}\n", 183 | "\n", 184 | "def DeepSTARR(params=params):\n", 185 | " \n", 186 | " lr = params['lr']\n", 187 | " dropout_prob = params['dropout_prob']\n", 188 | " n_conv_layer = params['n_conv_layer']\n", 189 | " n_add_layer = params['n_add_layer']\n", 190 | " \n", 191 | " # body\n", 192 | " input = kl.Input(shape=(249, 4))\n", 193 | " x = kl.Conv1D(params['num_filters'], kernel_size=params['kernel_size1'],\n", 194 | " padding=params['pad'],\n", 195 | " name='Conv1D_1st')(input)\n", 196 | " x = BatchNormalization()(x)\n", 197 | " x = Activation('relu')(x)\n", 198 | " x = MaxPooling1D(2)(x)\n", 199 | "\n", 200 | " for i in range(1, n_conv_layer):\n", 201 | " x = kl.Conv1D(params['num_filters'+str(i+1)],\n", 202 | " kernel_size=params['kernel_size'+str(i+1)],\n", 203 | " padding=params['pad'],\n", 204 | " name=str('Conv1D_'+str(i+1)))(x)\n", 205 | " x = BatchNormalization()(x)\n", 206 | " x = Activation('relu')(x)\n", 207 | " x = MaxPooling1D(2)(x)\n", 208 | " \n", 209 | " x = Flatten()(x)\n", 210 | " \n", 211 | " # dense layers\n", 212 | " for i in range(0, n_add_layer):\n", 213 | " x = kl.Dense(params['dense_neurons'+str(i+1)],\n", 214 | " name=str('Dense_'+str(i+1)))(x)\n", 215 | " x = BatchNormalization()(x)\n", 216 | " x = Activation('relu')(x)\n", 217 | " x = Dropout(dropout_prob)(x)\n", 218 | " bottleneck = x\n", 219 | " \n", 220 | " # heads per task (developmental and housekeeping enhancer activities)\n", 221 | " tasks = ['Dev', 'Hk']\n", 222 | " outputs = []\n", 223 | " for task in tasks:\n", 224 | " outputs.append(kl.Dense(1, activation='linear', name=str('Dense_' + task))(bottleneck))\n", 225 | "\n", 226 | " model = keras.models.Model([input], outputs)\n", 227 | " model.compile(keras.optimizers.Adam(lr=lr),\n", 228 | " loss=['mse', 'mse'], # loss\n", 229 | " loss_weights=[1, 1], # loss weigths to balance\n", 230 | " metrics=[Spearman]) # additional track metric\n", 231 | "\n", 232 | " return model, params\n", 233 | "\n", 234 | "DeepSTARR()[0].summary()\n", 235 | "DeepSTARR()[1] # dictionary" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "# 
Training DeepSTARR" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "def train(selected_model, X_train, Y_train, X_valid, Y_valid, params):\n", 252 | "\n", 253 | " my_history=selected_model.fit(X_train, Y_train,\n", 254 | " validation_data=(X_valid, Y_valid),\n", 255 | " batch_size=params['batch_size'], epochs=params['epochs'],\n", 256 | " callbacks=[EarlyStopping(patience=params['early_stop'], monitor=\"val_loss\", restore_best_weights=True),\n", 257 | " History()])\n", 258 | " \n", 259 | " return selected_model, my_history" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "main_model, main_params = DeepSTARR()\n", 269 | "main_model, my_history = train(main_model, X_train, Y_train, X_valid, Y_valid, main_params)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "# Evaluating the Model" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "### Model performance: mean squared error (MSE) and Pearson (PCC) and Spearman (SCC) correlation coefficients" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "from scipy import stats\n", 293 | "from sklearn.metrics import mean_squared_error\n", 294 | "\n", 295 | "# create functions\n", 296 | "def summary_statistics(X, Y, set, task):\n", 297 | " pred = main_model.predict(X, batch_size=main_params['batch_size'])\n", 298 | " if task ==\"Dev\":\n", 299 | " i=0\n", 300 | " if task ==\"Hk\":\n", 301 | " i=1\n", 302 | " print(set + ' MSE ' + task + ' = ' + str(\"{0:0.2f}\".format(mean_squared_error(Y, pred[i].squeeze()))))\n", 303 | " print(set + ' PCC ' + task + ' = ' + str(\"{0:0.2f}\".format(stats.pearsonr(Y, pred[i].squeeze())[0])))\n", 304 | " print(set + ' SCC ' + task + ' = ' + str(\"{0:0.2f}\".format(stats.spearmanr(Y, pred[i].squeeze())[0])))\n", 305 | " \n", 306 | "# run for each set and enhancer type\n", 307 | "summary_statistics(X_train, Y_train[0], \"train\", \"Dev\")\n", 308 | "summary_statistics(X_train, Y_train[1], \"train\", \"Hk\")\n", 309 | "summary_statistics(X_valid, Y_valid[0], \"validation\", \"Dev\")\n", 310 | "summary_statistics(X_valid, Y_valid[1], \"validation\", \"Hk\")\n", 311 | "summary_statistics(X_test, Y_test[0], \"test\", \"Dev\")\n", 312 | "summary_statistics(X_test, Y_test[1], \"test\", \"Hk\")" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "# Save model weights" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "model_name=\"DeepSTARR\"\n", 329 | "\n", 330 | "model_json = main_model.to_json()\n", 331 | "with open('Model_' + model_name + '.json', \"w\") as json_file:\n", 332 | " json_file.write(model_json)\n", 333 | "main_model.save_weights('Model_' + model_name + '.h5')" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [] 342 | } 343 | ], 344 | "metadata": { 345 | "kernelspec": { 346 | "display_name": "DeepLearning_conda_env_gpu", 347 | "language": "python", 348 | "name": "deeplearning_conda_env_gpu" 349 | }, 350 | "language_info": { 351 | "codemirror_mode": { 352 | "name": 
"ipython", 353 | "version": 3 354 | }, 355 | "file_extension": ".py", 356 | "mimetype": "text/x-python", 357 | "name": "python", 358 | "nbconvert_exporter": "python", 359 | "pygments_lexer": "ipython3", 360 | "version": "3.7.9" 361 | } 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 4 365 | } 366 | -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 const-ae 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/README.md: -------------------------------------------------------------------------------- 1 | # Neural Network for Sequence Data 2 | 3 | This project is an example how the recent advancements in Neural Networks 4 | can be applied to sequence data, namely DNA. 5 | 6 | We will take a collection of sequences plus information if they are 7 | bound by a transcription factor as input data and after training 8 | a convolutional neural network we will be able to make predictions 9 | for new sequences. In addition we will extract what the network learned 10 | and make a plot of the motif. 11 | 12 | The example are chosen such that it is not necessary to have a GPU and 13 | should learn just fine on a CPU. 14 | 15 | # Preparation 16 | 17 | To make sure everybody can play around with the example easily and you do not need to install the dependencies, please follow the instructions below. 18 | 19 | The code for this tutorial is written in Python and you will need a 2.7 or 3.5 installation (https://www.python.org/downloads/). If you have 20 | 21 | Neural Networks are a complex topic and there are quite a few packages you need to install to get going. The easiest way to install packages in Python is to use [Anaconda](https://www.continuum.io/downloads). In the following I will assume that you only have Anaconda installed. 
22 | 23 | ## Libraries 24 | 25 | To run the code in the tutorial you will need the following libraries: 26 | 27 | * Jupyter / IPython 28 | * Keras 29 | * Theano _or_ Tensorflow 30 | * Numpy 31 | 32 | ## Installation Steps 33 | 34 | After the successful installation of Anaconda we will create a new conda environment to avoid polluting the default environment: 35 | 36 | ```Shell 37 | $ conda create -n seqnn python=3.5 38 | ``` 39 | 40 | ```Shell 41 | # On Windows 42 | $ activate seqnn 43 | # On Mac / Linux 44 | $ source activate seqnn 45 | ``` 46 | 47 | Install Theano: 48 | 49 | ```Shell 50 | $ conda install theano pygpu 51 | ``` 52 | 53 | Install the other dependencies: 54 | 55 | ```Shell 56 | $ conda install scikit-learn keras numpy scipy matplotlib ipython jupyter pandas sympy nose nb_conda 57 | ``` 58 | 59 | Download the `Neural_Network_DNA_Demo` project either by cloning it with git 60 | 61 | ```Shell 62 | $ git clone https://github.com/const-ae/Neural_Network_DNA_Demo.git 63 | ``` 64 | 65 | __OR__ download this [zip file](https://github.com/Artjom-Metro/Neural_Network_DNA_Demo/archive/master.zip) and extract it somewhere. 66 | 67 | Using the command line, move to the `Neural_Network_DNA_Demo` folder: 68 | 69 | ```Shell 70 | $ cd /Neural_Network_DNA_Demo 71 | ``` 72 | 73 | and start the Jupyter notebook: 74 | 75 | ```Shell 76 | $ jupyter notebook 77 | ``` 78 | 79 | A new browser page should open where you can click on the `.ipynb` file 80 | and start experimenting. 81 | 82 | 83 | # Problems, Issues etc. 84 | 85 | If you have problems with the installation of ... 86 | 87 | * ... Theano --> check [this](http://deeplearning.net/software/theano/install.html#install) guide 88 | * ... Keras --> check [this](https://keras.io/#installation) guide 89 | 90 | or just open an issue [here](https://github.com/const-ae/Neural_Network_DNA_Demo/issues). 
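To quickly verify that the installation worked, you can try importing the main libraries from the new environment (a hypothetical check; the printed version will depend on what conda resolved):

```Shell
$ python -c "import keras, numpy; print(keras.__version__)"
```

If this prints a version number without an `ImportError`, the environment is ready for the notebook.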
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/data/ctcf.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/DeepSTARR/Neural_Network_DNA_Demo/data/ctcf.fa.gz -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/data/pu1.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/DeepSTARR/Neural_Network_DNA_Demo/data/pu1.fa.gz -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/helper/IOHelper.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import math 3 | import os.path 4 | from subprocess import Popen, PIPE, STDOUT 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | def get_fastas_from_file(fasta_path, as_dict=False, 11 | uppercase=False, stop_at=None): 12 | fastas = [] 13 | seq = None 14 | header = None 15 | for r in (gzip.open(fasta_path) if fasta_path.endswith(".gz") else open(fasta_path)): 16 | if type(r) is bytes: 17 | r = r.decode("utf-8") 18 | r = r.strip() 19 | if r.startswith(">"): 20 | if seq != None and header != None: 21 | fastas.append([header, seq]) 22 | if stop_at != None and len(fastas) >= stop_at: 23 | break 24 | seq = "" 25 | header = r[1:] 26 | else: 27 | if seq != None: 28 | seq += r.upper() if uppercase else r 29 | else: 30 | seq = r.upper() if uppercase else r 31 | # append last fasta read by method 32 | if stop_at != None and len(fastas) < stop_at: 33 | fastas.append([header, seq]) 34 | elif stop_at == None: 35 | fastas.append([header, seq]) 36 | if as_dict: 37 | return {h: s for h, s in fastas} 38 | 39 | return pd.DataFrame({'location': [e[0] for e in fastas], 'sequence': [e[1] for e in fastas]}) 40 | 41 | 42 | def get_shape_fastas_from_file(fasta_path, as_dict=False, 43 | uppercase=False, stop_at=None): 44 | fastas = [] 45 | seq = None 46 | header = None 47 | for r in (gzip.open(fasta_path) if fasta_path.endswith(".gz") else open(fasta_path)): 48 | if type(r) is bytes: 49 | r = r.decode("utf-8") 50 | r = r.strip() 51 | if r.startswith(">"): 52 | if seq != None and header != None: 53 | fastas.append([header, seq]) 54 | if stop_at != None and len(fastas) >= stop_at: 55 | break 56 | seq = None 57 | header = r[1:] 58 | else: 59 | if seq != None: 60 | seq += "," + (r.upper() if uppercase else r) 61 | else: 62 | seq = r.upper() if uppercase else r 63 | # append last fasta read by method 64 | if stop_at != None and len(fastas) < stop_at: 65 | fastas.append([header, seq]) 66 | elif stop_at == None: 67 | fastas.append([header, seq]) 68 | if as_dict: 69 | return {h: s for h, s in fastas} 70 | 71 | return pd.DataFrame({'location': [e[0] for e in fastas], 'sequence': [e[1] for e in fastas]}) 72 | 73 | 74 | def get_padded_sequences(fasta_file): 75 | fasta = get_fastas_from_file(fasta_file) 76 | max_length = max([len(x) for x in fasta.sequence]) 77 | padded_sequences = [] 78 | for seq in fasta.sequence: 79 | diff = max_length - len(seq) 80 | n_seq = (math.floor(diff/2) * 'N') + seq + (math.ceil(diff/2) * 'N') 81 | padded_sequences.append(n_seq) 82 | fasta.sequence = padded_sequences 83 | return fasta 84 | 85 | 86 | def 
convert_bed_to_fasta_hg19(bed_path, fasta_path, reference_genome_path, use_peak_max=False, 87 | bp_flanking=50): 88 | ''' 89 | Copied from Ignacio: /g/scb/zaugg/rio/EclipseProjects/zaugglab/lib/FastaAnalyzer.py 90 | :param bed_path: The path to our BED file 91 | :param fasta_path: The output fasta that will be created 92 | :param use_peak_max: If True, we will extract w.r.t. the peak position 93 | (see https://www.biostars.org/p/102710/ for the format description) 94 | :param bp_flanking: If use_peak_max is True, flanking regions of this many bp 95 | will be calculated around the peak 96 | :return: 97 | ''' 98 | 99 | args = ["/g/software/bin/bedtools", "getfasta", "-fi", reference_genome_path, 100 | "-fo", fasta_path] 101 | 102 | # create a new coordinates file with flanking sequences 103 | if use_peak_max: 104 | df = pd.read_csv(bed_path, sep='\t', index_col=False, 105 | names=['chrom', 'chromStart', 'chromEnd', 'name', 'score', 106 | 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']) 107 | df['startFromPeak'] = df['thickStart'] - bp_flanking 108 | df['endFromPeak'] = df['thickStart'] + bp_flanking 109 | df = df[['chrom', 'startFromPeak', 'endFromPeak']] 110 | tsv_string = df.to_csv(header=False, sep='\t', index=False) 111 | args = args + ['-bed', 'stdin'] 112 | 113 | p = Popen(args, stdout=PIPE, stdin=PIPE, stderr=STDOUT) 114 | x = p.communicate(input=tsv_string.encode(encoding='UTF-8')) 115 | x = x[0].decode('UTF-8') 116 | if x != '': 117 | print("ERROR: " + x) 118 | else: 119 | os.system(" ".join(args + ['-bed', bed_path])) 120 | 121 | 122 | def write_fasta_file(file, sequences, descr=None): 123 | """ 124 | Sequences has to be a list of strings. descr can be None, in which case a dummy header line is inserted, or a list of the 125 | same length as sequences. 
126 | """ 127 | with open(file, "w") as out: 128 | for idx, seq in enumerate(sequences): 129 | if descr is None: 130 | out.write(">Dummy_Line\n") 131 | else: 132 | out.write(">" + str(descr[idx]) + "\n") 133 | out.write("".join(seq) + "\n") 134 | 135 | 136 | def save_keras_model(model, model_path, overwrite=False): 137 | json_string = model.to_json() 138 | with open(model_path + '.json', 'w+') as f: 139 | f.write(json_string) 140 | model.save_weights(model_path + '.h5', overwrite=overwrite) 141 | 142 | 143 | def load_keras_model(path): 144 | from keras.models import model_from_json 145 | model = model_from_json(open(path + '.json').read()) 146 | model.load_weights(path + '.h5') 147 | return model 148 | 149 | 150 | def save_scoring_file(header, values, scores, labels, file): 151 | if len(scores) != len(labels): 152 | raise ValueError("The score and label length must match!") 153 | if len(header) != scores.shape[3] + values.shape[2]: 154 | raise ValueError("The value + score width and header length must match!") 155 | 156 | with open(file, 'w') as output: 157 | output.write("\t".join(["Index", "Label"] + header) + "\n") 158 | for line_idx in range(0, len(scores)): 159 | output.write("\t".join([str(line_idx), labels[line_idx]] 160 | + ["["+",".join(map(str, values[line_idx, :, c]))+"]" for c in range(0, values.shape[2])] 161 | + ["["+",".join(map(str, scores[line_idx, 0, :, c]))+"]" for c in range(0, scores.shape[3])])) 162 | output.write("\n") 163 | 164 | 165 | 166 | def read_importance_file(location): 167 | return pd.read_csv(location, sep="\t") 168 | 169 | 170 | def parse_importance_df(df, col_names): 171 | # Iterate over every entry 172 | parsed_cols = [] 173 | for name in col_names: 174 | col = df[name].as_matrix() 175 | parsed_col = np.apply_along_axis(lambda e: np.array([float(x) for x in e[0][1:-1].split(",")]), 1, col.reshape(len(col),1)) 176 | parsed_cols.append(parsed_col) 177 | return np.stack(parsed_cols, 2) 178 | 179 | 180 | def write_output_file(output_file, name, PositiveData, NegativeData, Training_Script, aucs, auprcs, importance_scores): 181 | with open(output_file, "w") as out: 182 | out.write("Name:" + str(name) + "\n") 183 | out.write("PositiveData:" + str(PositiveData) + "\n") 184 | out.write("NegativeData:" + str(NegativeData) + "\n") 185 | out.write("Training_Script:" + str(Training_Script) + "\n") 186 | out.write("AUCs:" + ",".join(map(str, aucs)) + "\n") 187 | out.write("AUPRCs:" + ",".join(map(str, auprcs)) + "\n") 188 | out.write("Importance_Scores:" + str(importance_scores) + "\n") -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/helper/SequenceHelper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def parse_alpha_to_seq(sequence): 5 | output = np.arange(len(sequence)) 6 | for i in range(0, len(sequence)): 7 | snippet = sequence[i] 8 | if snippet == 'A': 9 | output[i] = 0 10 | elif snippet == 'C': 11 | output[i] = 1 12 | elif snippet == 'G': 13 | output[i] = 2 14 | elif snippet == 'T': 15 | output[i] = 3 16 | elif snippet == 'N': 17 | output[i] = -1 18 | else: 19 | raise AssertionError("Cannot handle snippet: " + snippet) 20 | return output 21 | 22 | 23 | def parse_binary_seq(sequence): 24 | output = np.arange(len(sequence) / 2) 25 | for i in range(0, len(sequence), 2): 26 | snippet = sequence[i] + sequence[i + 1] 27 | if snippet == '00': 28 | output[int(i / 2)] = 0 29 | elif snippet == '01': 30 | output[int(i / 2)] = 1 31 | 
elif snippet == '10': 32 | output[int(i / 2)] = 2 33 | elif snippet == '11': 34 | output[int(i / 2)] = 3 35 | else: 36 | raise AssertionError("Cannot handle snippet: " + snippet) 37 | return output 38 | 39 | 40 | def parse_binary_seq_to_alpha(sequence): 41 | output = "" 42 | for i in range(0, len(sequence), 2): 43 | snippet = sequence[i] + sequence[i + 1] 44 | if snippet == '00': 45 | output += 'A' 46 | elif snippet == '01': 47 | output += 'C' 48 | elif snippet == '10': 49 | output += 'G' 50 | elif snippet == '11': 51 | output += 'T' 52 | else: 53 | raise AssertionError("Cannot handle snippet: " + snippet) 54 | return output 55 | 56 | 57 | def to_categorical(y, nb_classes=None): 58 | '''Convert class vector (integers from 0 to nb_classes) 59 | to binary class matrix, for use with categorical_crossentropy 60 | ''' 61 | y = np.asarray(y, dtype='int32') 62 | if not nb_classes: 63 | nb_classes = np.max(y) + 1 64 | Y = np.zeros((len(y), nb_classes)) 65 | for i in range(len(y)): 66 | if y[i] != -1: 67 | Y[i, y[i]] = 1. 68 | return Y 69 | 70 | 71 | def do_one_hot_encoding(sequence, seq_length, f=parse_alpha_to_seq): 72 | X = np.zeros((sequence.shape[0], seq_length, 4)) 73 | for idx in range(0, len(sequence)): 74 | X[idx] = to_categorical(f(sequence[idx]), 4) 75 | return X 76 | 77 | 78 | def do_dinucleotide_shuffling(X, size=1): 79 | x_shuffled = np.repeat(X, size, 0) 80 | 81 | for x in range(0, x_shuffled.shape[0]): 82 | random_index = np.arange(0, X.shape[1]/2) 83 | np.random.shuffle(random_index) 84 | for y in range(0, int(X.shape[1]/2)): 85 | x_shuffled[x,y*2, ] = X[x%X.shape[0],random_index[y]*2] 86 | x_shuffled[x,(y*2)+1, ] = X[x%X.shape[0],(random_index[y]*2)+1] 87 | 88 | return x_shuffled 89 | 90 | 91 | def generate_complementary_sequence(sequence): 92 | comp_seq = [] 93 | for b in sequence: 94 | if b == "A": 95 | comp_seq.append("T") 96 | elif b == "T": 97 | comp_seq.append("A") 98 | elif b == "C": 99 | comp_seq.append("G") 100 | elif b == "G": 101 | comp_seq.append("C") 102 | elif b == "N": 103 | comp_seq.append("N") 104 | else: 105 | raise ValueError("Cannot convert base {0} to complement base!".format(b)) 106 | return ''.join(comp_seq) -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/DeepSTARR/Neural_Network_DNA_Demo/helper/__init__.py -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/motif_plotter/__init__.py: -------------------------------------------------------------------------------- 1 | from matplotlib.font_manager import FontProperties 2 | from matplotlib.textpath import TextPath 3 | import matplotlib.patches as patches 4 | from matplotlib.transforms import Affine2D 5 | from motif_plotter.information_helper import * 6 | 7 | 8 | def make_text_elements(text, x=0.0, y=0.0, width=1.0, height=1.0, color='blue', edgecolor="black", 9 | font = FontProperties(family='monospace')): 10 | tp = TextPath((0.0, 0.0), text, size=1, prop=font) 11 | bbox = tp.get_extents() 12 | bwidth = bbox.x1 - bbox.x0 13 | bheight = bbox.y1 - bbox.y0 14 | trafo = Affine2D() 15 | trafo.translate(-bbox.x0, -bbox.y0) 16 | trafo.scale(1 / bwidth * width, 1 / bheight * height) 17 | trafo.translate(x,y) 18 | tp = tp.transformed(trafo) 19 | return patches.PathPatch(tp, 
facecolor=color, edgecolor=edgecolor) 20 | 21 | 22 | def make_bar_plot(axes, texts, heights, width=0.8, colors=None): 23 | """ 24 | Makes a bar plot but each bar is not just a rectangle but an element from the texts list 25 | :param axes: the axes that is modified 26 | :param texts: a list of strings, where each element is plotted as a "bar" 27 | :param heights: a list of the height of each texts element 28 | :param width: the width of the bar. Default: 0.8 29 | :param colors: A list of colors, a list with a single entry or None. Default: None, which is plotted as blue 30 | :return: None 31 | """ 32 | texts = list(texts) 33 | heights = list(heights) 34 | n_elem = len(texts) 35 | if n_elem != len(heights): 36 | raise ValueError("Texts and heights must be of the same length") 37 | if colors is None: 38 | colors = ['blue'] * n_elem 39 | elif len(colors) == 1: 40 | colors *= n_elem 41 | 42 | axes.set_ylim(min(0,min(heights)), max(0,max(heights))) 43 | axes.set_xlim(0, n_elem) 44 | for idx, (text, height, color) in enumerate(zip(texts, heights, colors)): 45 | text_shape = make_text_elements(text, x=idx+(1-width)/2, y=0, width=width, height=height, 46 | color=color, edgecolor=color) 47 | axes.add_patch(text_shape) 48 | 49 | 50 | def make_stacked_bar_plot(axes, texts, heights, width=0.8, colors=None): 51 | """ 52 | Makes a stacked bar plot but each bar is not just a rectangle but an element from the texts list 53 | :param axes: the axes that is modified 54 | :param texts: a list of lists of strings, where each element is plotted as a "bar" 55 | :param heights: a list of lists of the height of each texts element 56 | :param width: the width of the bar. Default: 0.8 57 | :param colors: a list of color lists matching texts, a list with a single entry, or None (plotted as blue) 58 | :return: None 59 | """ 60 | if colors is None: 61 | colors = [['blue'] * len(text) for text in texts] 62 | elif len(colors) == 1: 63 | colors = [colors * len(text) for text in texts] 64 | 65 | if len(texts) != len(heights): 66 | raise ValueError("Texts and heights must be of the same length") 67 | for idx, (text, height, color) in enumerate(zip(texts, heights, colors)): 68 | y_stack_pos = 0 69 | y_stack_neg = 0 70 | for jdx, (t, h, c) in enumerate(zip(text, height, color)): 71 | if h > 0: 72 | text_shape = make_text_elements(t, x=idx+(1-width)/2, y=y_stack_pos, width=width, height=h, 73 | color=c, edgecolor=c) 74 | y_stack_pos += h 75 | axes.add_patch(text_shape) 76 | elif h < 0: 77 | text_shape = make_text_elements(t, x=idx + (1 - width) / 2, y=y_stack_neg, width=width, height=h, 78 | color=c, edgecolor=c) 79 | y_stack_neg += h 80 | axes.add_patch(text_shape) 81 | 82 | axes.autoscale() 83 | axes.set_xlim(0, len(texts)) 84 | 85 | 86 | def make_single_sequence_spectrum(axis, row, row_scores, one_hot_decoding=None, colors=None): 87 | if one_hot_decoding is None: 88 | one_hot_decoding = ["A", "T", "C", "G"] 89 | if colors is None: 90 | colors = ['#008000', '#cc0000', '#0000cc', '#ffb300'] 91 | sequence = [np.array(one_hot_decoding)[x] for x in np.apply_along_axis(np.argmax, 1, row)] 92 | score_sequence = np.apply_along_axis(lambda e: np.max(e) if abs(np.min(e)) < np.max(e) else np.min(e), 1, row_scores) 93 | color_sequence = [np.array(colors)[x] for x in np.apply_along_axis(np.argmax, 1, row)] 94 | make_bar_plot(axis, sequence, score_sequence, colors=color_sequence) 95 | 96 | 97 | class ConsensusMotifPlotter: 98 | 99 | def __init__(self, elements, weights, colors=None): 100 | self.n_elem = len(elements) 101 | self.colors = colors 102 | self.elements = elements 103 | self.weights = weights 104 | 105 | @classmethod 106 
| def from_importance_scoring(cls, value): 107 | nucleotides = [['A', 'C', 'T', 'G']] * len(value.Sequence) 108 | scores = value.Scores 109 | colors = [['#008000', '#0000cc', '#cc0000', '#ffb300']] * len(value.Sequence) 110 | sorted_nucleotides = np.array(nucleotides) 111 | sorted_scores = np.array(scores) 112 | sorted_colors = np.array(colors) 113 | order = np.absolute(scores).argsort() 114 | for i, order in enumerate(order): 115 | sorted_scores[i, :] = sorted_scores[i, order] 116 | sorted_nucleotides[i, :] = sorted_nucleotides[i, order] 117 | sorted_colors[i, :] = sorted_colors[i, order] 118 | return cls(sorted_nucleotides, sorted_scores, sorted_colors) 119 | 120 | @classmethod 121 | def from_aligned_importance_scoring(cls, values, plot_width=30, start=None, end=None): 122 | 123 | n_seqs = values.shape[0] 124 | scores = values.sum(axis=0) / n_seqs 125 | if start is None and end is None: 126 | if plot_width < scores.shape[0]: 127 | total_scores = np.abs(scores).sum(axis=1) 128 | start = np.argmax([sum(total_scores[idx:(idx+plot_width)]) for idx in range(0, scores.shape[0]-plot_width)]) 129 | end = start + plot_width 130 | else: 131 | start = 0 132 | end = scores.shape[0] 133 | scores = scores[start:end, :] 134 | nucleotides = [["A", "T", "C", "G"]] * len(scores) 135 | colors = [['#008000', '#cc0000', '#0000cc', '#ffb300']] * len(scores) 136 | sorted_nucleotides = np.array(nucleotides) 137 | sorted_scores = np.array(scores) 138 | sorted_colors = np.array(colors) 139 | order = np.absolute(scores).argsort() 140 | for i, order in enumerate(order): 141 | sorted_scores[i, :] = sorted_scores[i, order] 142 | sorted_nucleotides[i, :] = sorted_nucleotides[i, order] 143 | sorted_colors[i, :] = sorted_colors[i, order] 144 | return cls(sorted_nucleotides, sorted_scores, sorted_colors), start, end 145 | 146 | @classmethod 147 | def from_weighted_sequence(cls, ws): 148 | colors_scheme = {'G': '#ffb300', 'A': '#008000', 'C': '#0000cc', 'T': '#cc0000', '_': '#333333'} 149 | return cls([[x] if x != '_' else "#" for x in ws.seq], [[x] for x in ws.scores], 150 | [[colors_scheme[x]] for x in ws.seq]) 151 | 152 | @classmethod 153 | def from_bio_motif(cls, motif, scale_info_content=True): 154 | n_elem = len(motif) 155 | colors_scheme = {'G': '#ffb300', 'A': '#008000', 'C': '#0000cc', 'T': '#cc0000'} 156 | bases = ['A', 'T', 'G', 'C'] 157 | if scale_info_content: 158 | rel_info = calc_relative_information(motif) 159 | else: 160 | rel_info = motif.counts 161 | 162 | basess = [] 163 | scoress = [] 164 | colorss = [] 165 | for i in range(0, n_elem): 166 | scores = [(b, rel_info[b][i], colors_scheme[b]) for b in bases] 167 | scores.sort(key=lambda t: t[1]) 168 | basess += [[x[0] for x in scores]] 169 | scoress += [[x[1] for x in scores]] 170 | colorss += [[x[2] for x in scores]] 171 | return cls(basess, scoress, colorss) 172 | 173 | 174 | def plot(self, axes): 175 | """ 176 | Add the motif to an axes 177 | :return: modifies the axes object with all the necessary characters 178 | """ 179 | make_stacked_bar_plot(axes, self.elements, self.weights, width=1, colors=self.colors) -------------------------------------------------------------------------------- /DeepSTARR/Neural_Network_DNA_Demo/motif_plotter/information_helper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def approximate_error(motif): 5 | """Calculate approximate error""" 6 | pwm = motif.pwm 7 | bases = list(pwm.keys()) 8 | n = sum(motif.counts[bases[0]]) 9 | approx_error = 
(len(bases)-1)/(2 * np.log(2) * n) 10 | return approx_error 11 | 12 | 13 | def exact_error(motif): 14 | """Calculate exact error, using multinomial(na,nc,ng,nt)""" 15 | ## Super Slow. O(n^3) 16 | pwm = motif.pwm 17 | bases = pwm.keys() 18 | na = sum(motif.counts['A']) 19 | n = na 20 | nc = 0 21 | ng = 0 22 | nt = 0 23 | done = False 24 | exact_error = 0 25 | while not done: 26 | print (na,nc,ng,nt) 27 | exact_error += sum([-p*np.log2(p) for p in [na/n, nc/n, ng/n, nt/n]]) 28 | if nt<=0: 29 | ## iterate inner loop 30 | if ng > 0: 31 | ## g => t 32 | ng = ng - 1 33 | nt = nt + 1 34 | elif nc > 0: 35 | ## c -> g 36 | nc = nc - 1; 37 | ng = ng + 1; 38 | else: 39 | ## a->c 40 | na = na - 1 41 | nc = nc + 1 42 | else: 43 | if ng > 0: 44 | ## g => t 45 | ng = ng - 1 46 | nt = nt + 1 47 | elif nc>0: 48 | ## c => g; all t -> g 49 | nc = nc - 1 50 | ng = nt + 1 51 | nt = 0 52 | elif na>0: 53 | ## a => c; all g,t -> c 54 | nc = nt + 1 55 | na = na - 1 56 | nt = 0 57 | else: 58 | done = True 59 | return exact_error 60 | 61 | 62 | def calc_info_matrix(motif, correction_type='approx'): 63 | """Calculate information matrix with small sample correction""" 64 | pwm = motif.pwm 65 | bases = pwm.keys() 66 | if correction_type=='approx': 67 | error = approximate_error(motif) 68 | else: 69 | error = exact_error(motif) 70 | info_matrix = [2-error+sum([pwm[b][l]*np.nan_to_num(np.log2(pwm[b][l])) for b in bases]) for l in range(0, len(motif))] 71 | return info_matrix 72 | 73 | def calc_relative_information(motif, correction_type='approx'): 74 | """Calculate relative information matrix""" 75 | pwm = motif.pwm 76 | bases = pwm.keys() 77 | if correction_type=='approx': 78 | info_matrix = calc_info_matrix(motif) 79 | else: 80 | info_matrix = calc_info_matrix(motif, 'exact') 81 | relative_info = {base: [prob*info for prob,info in zip(pwm[base], info_matrix)] for base in bases} 82 | return relative_info 83 | 84 | -------------------------------------------------------------------------------- /DeepSTARR/README.md: -------------------------------------------------------------------------------- 1 | # Scripts for training and interpreting DeepSTARR 2 | 3 | ## Training DeepSTARR 4 | 5 |

![DeepSTARR model architecture](img/DeepSTARR.png) 6 | 7 | ![Examples of DeepSTARR predictions](img/DeepSTARR_predictions.png) 8 |

9 | 10 | Code to train DeepSTARR is in the notebook [DeepSTARR_training](DeepSTARR_training.ipynb). 11 | Data used to train and evaluate the DeepSTARR model as well as the final trained model are available on zenodo at https://doi.org/10.5281/zenodo.5502060. 12 | DeepSTARR is also deposited in [Kipoi](http://kipoi.org/models/DeepSTARR/). 13 | 14 | ### Tutorial 15 | An end-to-end example to train DeepSTARR, compute nucleotide contribution scores, and derive TF motifs with TF-MoDISco is contained in the following Colab notebook: https://colab.research.google.com/drive/1Xgak40TuxWWLh5P5ARf0-4Xo0BcRn0Gd. You can run this notebook yourself to experiment with DeepSTARR. 16 | 17 | ### Predict developmental and housekeeping enhancer activity of new DNA sequences 18 | To predict the developmental and housekeeping enhancer activity in *Drosophila melanogaster* S2 cells for new DNA sequences, please run: 19 | ``` 20 | # Clone this repository 21 | git clone https://github.com/bernardo-de-almeida/DeepSTARR.git 22 | cd DeepSTARR/DeepSTARR 23 | 24 | # download the trained DeepSTARR model from zenodo (https://doi.org/10.5281/zenodo.5502060) 25 | 26 | # create 'DeepSTARR' conda environment by running the following: 27 | conda create --name DeepSTARR python=3.7 tensorflow=1.14.0 keras=2.2.4 # or tensorflow-gpu/keras-gpu if you are using a GPU 28 | source activate DeepSTARR 29 | pip install git+https://github.com/AvantiShri/shap.git@master 30 | pip install 'h5py<3.0.0' 31 | pip install deeplift==0.6.13.0 32 | 33 | # Run prediction script 34 | python DeepSTARR_pred_new_sequence.py -s Sequences_example.fa -m DeepSTARR.model 35 | ``` 36 | Where: 37 | * -s FASTA file with input DNA sequences 38 | * -m trained DeepSTARR model (path prefix of its `.json` architecture and `.h5` weights files) 39 | ## Interpreting DeepSTARR with nucleotide contribution scores 40 | To compute nucleotide contribution scores for new DNA sequences with respect to developmental or housekeeping enhancer activity, please download the trained DeepSTARR model from [zenodo](https://doi.org/10.5281/zenodo.5502060) and run: 41 | ``` 42 | # Create and activate the conda environment as above, and then run: 43 | python DeepSTARR_nucl_contr_scores.py -m DeepSTARR.model -s Sequences_example.fa -c dev # Contribution scores for developmental enhancer activity 44 | python DeepSTARR_nucl_contr_scores.py -m DeepSTARR.model -s Sequences_example.fa -c hk # Contribution scores for housekeeping enhancer activity 45 | ``` 46 | Where: 47 | * -s FASTA file with input DNA sequences 48 | * -c Enhancer type for which contribution scores should be derived 49 | 50 | #### Note 51 | Neural_Network_DNA_Demo was forked from https://github.com/const-ae/Neural_Network_DNA_Demo 52 | 53 | ## Questions 54 | If you have any questions/requests/comments please contact me at [bernardo.almeida94@gmail.com](mailto:bernardo.almeida94@gmail.com). 
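### Programmatic use
As an alternative to the command-line scripts above, the trained model can also be loaded directly in Python. The following is a minimal sketch (not part of the original scripts) mirroring the model-loading code in `DeepSTARR_pred_new_sequence.py`; it assumes the `DeepSTARR.model.json` and `DeepSTARR.model.h5` files from zenodo are in the working directory and uses a toy sequence in place of a real 249-bp enhancer:
```
import numpy as np
from keras.models import model_from_json

# load architecture + weights, as in DeepSTARR_pred_new_sequence.py
model = model_from_json(open('DeepSTARR.model.json').read())
model.load_weights('DeepSTARR.model.h5')

# one-hot encode a single 249-bp sequence (columns ordered A, C, G, T)
seq = 'ACGT' * 62 + 'A'  # toy 249-bp sequence
onehot = np.zeros((1, len(seq), 4), dtype=np.float32)
for i, base in enumerate(seq):
    onehot[0, i, 'ACGT'.index(base)] = 1

# the model has two output heads: developmental and housekeeping activity (log2)
pred_dev, pred_hk = model.predict(onehot)
print(pred_dev[0, 0], pred_hk[0, 0])
```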
55 | -------------------------------------------------------------------------------- /DeepSTARR/Sequences_example.fa: -------------------------------------------------------------------------------- 1 | >chr2R_17033215_17033463_dev 2 | TTGACTAAAAAATATTCAAAATATAAACCAAACCAAACAACAAGCTCTTTGTTTTTGTTTACTCTAGGCGAAACAGAACTTTGAACTGCGTATGATAAATAGTTAATCACGCCACTTCTCTAGCATCACGCCACTTTTTTATGACGCAATTTCTGCGTAGATGATGGGACAGTTTTAGGTGTAAGCATATGAGTCATTTTGTTTTTATTAACCAAATATATATATATTACAAAAAGTGAGTTGAACTGA 3 | >chr3R_2902399_2902647_dev 4 | GAGAAACCGTCGATAAGGTGGCAGAAGGGCAGAACAAATGGGTCTCTCGTTTTCACTCCACTTCGCCAAGATTTGCCAGTGGTGGGGTGATGTCGGATGACGGGCGTCATTGTGCCGGGCGTCACATCCGCAAGTGGCGTGACTGAATTTGTAGTCTAGGGTACTCGTATTTCCGACCGCATCTGATTCACTTTTCATTGTTATGCATTGCAGTCACTGAAACCACAACCAGCAGCGACAAAATGACAT 5 | >chr3L_8520651_8520899_dev 6 | ATACCATGTATGTACATATCATGTGAAATTTACTCATCAAGTTTTATCTATTGAACTTTGAGCGATCGCGTGTTCAATTTTAGGCTTTTTGTGCGGATGTCATGCGAAATTTGATTGTGCGATCGAAGCATAAAATGTGGAATGGGTTCCAGGAATTTTGAGTATATGAAATTGAGATTTAATGTGAAATTTCATGATGTTGACTCATCGAAAAGTAATCAGAACGTTTTGGAATTTCACAATTTGCGA 7 | >chr2R_20148204_20148452_dev 8 | GAATTCAATATCCAAACACTTAATCTAAACTATTTTGTTGTTTAATTCCTGTTTACCCACTACATATTTACTTTGCGTGTTCAACGCGTTATTTAGTTCACACTGATAAAGCCATTTCCTCTGAAAACAACTGCATCTTGATAGCACAGTGCCAACAAAAAATAAAAAATCTGAGCAAATAGTGTACAAACTACATCTGGTCAACAGATGCTGATAAAGCGGCTTGATAAGGCCATCTCGGAAATCAGT 9 | >chr2R_12017449_12017697_dev 10 | CCGCCAGCCACTCTCAGTCCACAATCGCAATGCAGTACACACACACACGCACACAGACTTATCGCGTGGCGTCCCCCAACCGTTACGTCACTAGGCAAGGCAAACAAACACGCTGATTGGTCGAAAAACATCGTACGATAAATTTCTCAAATCGAAATGAGCGAAAAGACCCAGAAAATGGAATAATTAAAAAGCAAAGGCTGATTCACTGCAACTTGGCTGCTTCGTGCCACGTTTCGCTTCGGTTCG 11 | >chr2L_4913339_4913587_dev 12 | GATTGAAGCTACTTTCGTGAATAGAAGTCAGTTCTTAAGAATGGTTAAACTTTTGATATGCTTCTGAATTAAAGATGAATCACCCAAGAATATAATCCAAAGCAACGTTGCCCACGTCACGTAGTCCCACAGATAACTAAGAATGATGGCAATTCTTATTCTAATAATGCACACAAACCAGACGAAAGTCATGCCATCTTCAATACAGATTTGTAAATCACTGCCAAGAATCTGAGAAGCCGAGCTCAA 13 | >chr3R_15514106_15514354_dev 14 | AATTTGCCACGTCCTTGTTGAATTTTTTTTCTCGCACTATTTATGAATTCTTGTTTTGCGGTTAACGTTTTGCGTTATTCAGTGATTCATATCCGGGGACTCGTAAAATCATAAAAAAATGTGATAATAAAATATTTGTCAACATGTATCGCACCACCAGCACGACTCACCCACACACATGAAGTTAAAATTTGCAATCAAAGTGGCCGACTACTCGGATTGTGTGTGTGTAATTCAATCCTGATCAGT 15 | >chr3L_1546594_1546842_dev 16 | TGCAGCCGGCAAGACGACGCCATTAATTGGCCGTCGCCACGTCATGCTTGTCGTATGATTTTTCGTGTCCGAACTTGCAGTAGGTGAGCACCCGCCCCACGCCTCCGCTCTCTCAACATCTCGCTCGATGACGCGTCATTTTGGTGGCGTACTTTACCCTGCGATCGTACGATTTCCCCAGCGGCAGCGTCATTCCAATGGGGCAAGTGTTAGTCACGCTACGTGCTCCACTTTCCCAAGCGCCTTTTT 17 | >chr3R_16882712_16882960_dev 18 | TCGAGGCACCCCCACTTTTTTCCATTGTAATTATCTCCAAAAACCACCATCGGTAATTGTACAATTCTGCATTGTATAATCTAATTGATAAGACTGGCATTACACATCCCATCTGGAATTGTTAGTCATGTCCGGCAGCATCATGTGCCTGGCAAACAAAAGATGAAGTATCCAGCACCCACGCTCAGAACCCCATTTAGTGTGCGAGTGAAGCTTATCTAATCATCTCCGTCTGGGGCAGAAAGCTCA 19 | >chr2L_10204670_10204918_dev 20 | CTAGCTTGAATTCAATGACGTGCTAAAATGACATCATTTTATTACCATTAAAGACCCTAAAAAGATATTTGCTAAAAACGCAGTTCGAAACATTTTTGTCATTTATTTGGAAGTGAGTCAACAGATAAGGGAATGTACATTCATAGTTCATTGTTTAGTTTCTTTTTCTTTTTTGCTTGAAATATCCGGTTGGACTCGTTCCGGATGCCATAGTCATCAGTACATTTATTTTTGTAAAATGCATAACAT 21 | -------------------------------------------------------------------------------- /Figures/Fig1.R: -------------------------------------------------------------------------------- 1 | 2 | library(BSgenome.Dmelanogaster.UCSC.dm3) 3 | library(GenomicRanges) 4 | library(ggplot2) 5 | theme_set(theme_light() + theme(axis.text = element_text(colour = "black"))) 6 | library(ggpointdensity) 7 | 8 | ######## 9 | # Fig 1D 10 | ######## 11 | 12 | df <- 
read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/STARRseq_data_DeepSTARR_predictions.txt") 13 | 14 | # Prediction performance per set 15 | scater_list_test <- lapply(c(dev="dev", 16 | hk="hk"), function(class){ 17 | scater_list2 <- lapply(unique(df$set), function(set){ 18 | tmp <- df[df$set %in% set,] 19 | 20 | if(class=="dev"){ 21 | tmp$obs <- tmp$Dev_log2_enrichment 22 | tmp$pred <- tmp$Predictions_dev 23 | }else{ 24 | tmp$obs <- tmp$Hk_log2_enrichment 25 | tmp$pred <- tmp$Predictions_hk 26 | } 27 | 28 | gg <- ggplot(tmp, aes(obs, pred)) + 29 | geom_pointdensity(size=0.3) + 30 | scale_color_gradient(low = "grey70", high = "grey20") + 31 | guides(col=F) + 32 | scale_x_continuous(paste0(class, " enhancer activity [log2]"), 33 | breaks = seq(-12,20,2)) + 34 | scale_y_continuous(paste0("Predicted ", class, " activity [log2]"), 35 | breaks = seq(-12,20,2)) + 36 | geom_abline(slope = 1, intercept = 0, linetype="dashed", col="grey30") + 37 | theme_bw() + 38 | theme(panel.background = element_rect(fill="white",colour="white"), panel.grid = element_blank(), axis.line=element_line(colour="black"), 39 | axis.text=element_text(size=14, colour="black"), 40 | axis.title=element_text(size=16, colour="black"), 41 | plot.title = element_text(size=18, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5)) + 42 | annotate("text", x=min(tmp$obs, na.rm=T), y = max(tmp$pred, na.rm=T), 43 | label = paste0("PCC: ", round(cor(tmp$obs, tmp$pred),2)), 44 | vjust=1, hjust=0, size=5) 45 | 46 | return(gg) 47 | }) 48 | return(scater_list2) 49 | }) 50 | 51 | # plot test set 52 | ggsave("Fig1D_dev.png", scater_list_test$dev[[3]], height = 4, width = 4.2, type="cairo") 53 | ggsave("Fig1D_hk.png", scater_list_test$hk[[3]], height = 4, width = 4.2, type="cairo") 54 | 55 | 56 | ######## 57 | # Fig 1E 58 | ######## 59 | 60 | ## test set with actual peak summits 61 | Peaks <- list(dev=read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/DSCP_200bp_gw.UMI_cut_merged.peaks.txt", header = F), 62 | hk=read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/RpS12_200bp_gw.UMI_cut_merged.peaks.txt", header = F)) 63 | 64 | Peaks <- lapply(Peaks, function(x){ 65 | gr <- makeGRangesFromDataFrame(x, seqnames.field = "V1", start.field = "V2", end.field = "V2", seqinfo = Dmelanogaster@seqinfo, keep.extra.columns = T) 66 | mcols(gr) <- mcols(gr)[,5:7] 67 | names(mcols(gr)) <- c("Enrch.", "Corr_enrch", "p_value") 68 | gr <- gr[gr$Corr_enrch>3] 69 | return(gr) 70 | }) 71 | 72 | # resize to 249bp (move 1nt because of bed file 0-based) 73 | Peaks <- lapply(Peaks, function(x) resize(shift(x,-1), fix="center", 249)) 74 | 75 | # only test set 76 | set <- "Test" 77 | tmp <- df[df$set %in% "Test" & df$class %in% c("positive_peaks"),] 78 | 79 | # get enhancers 80 | tmp <- tmp[paste0(tmp$seqnames, tmp$start) %in% c(paste0(Peaks$dev@seqnames, Peaks$dev@ranges@start), 81 | paste0(Peaks$hk@seqnames, Peaks$hk@ranges@start)),] 82 | 83 | # calculate log2 FC 84 | tmp$obs_log2FC <- tmp$Dev_log2_enrichment-tmp$Hk_log2_enrichment 85 | tmp$pred_log2FC <- tmp$Predictions_dev-tmp$Predictions_hk 86 | 87 | scater_list2_test <- ggplot(tmp, aes(obs_log2FC, pred_log2FC, col=obs_log2FC)) + 88 | geom_point(size=0.6) + 89 | scale_color_gradient2("log2FC [obs]", low = "#0C89CA", mid = "grey70", midpoint = 0, high = "#EF4A24") + 90 | scale_x_continuous("log2FC dev vs hk [observed]", 91 | # limits = c(min(c(tmp$obs,0)),max(tmp$obs)), 92 | breaks = seq(-12,20,2)) + 93 | scale_y_continuous("log2FC 
dev vs hk [predicted]", 94 | # limits = c(min(c(tmp$pred,0)),max(tmp$pred)), 95 | breaks = seq(-12,20,2)) + 96 | geom_abline(slope = 1, intercept = 0, linetype="dashed", col="grey30") + 97 | theme_bw() + 98 | theme(panel.background = element_rect(fill="white",colour="white"), panel.grid = element_blank(), axis.line=element_line(colour="black"), 99 | axis.text=element_text(size=14, colour="black"), axis.title=element_text(size=16, colour="black"), 100 | plot.title = element_text(size=18, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5)) 101 | 102 | ggsave("Fig1E.png", scater_list2_test, height = 4, width = 5.4, type="cairo") 103 | -------------------------------------------------------------------------------- /Figures/Fig2.R: -------------------------------------------------------------------------------- 1 | 2 | library(BSgenome.Dmelanogaster.UCSC.dm3) 3 | library(GenomicRanges) 4 | library(ggplot2) 5 | theme_set(theme_light() + theme(axis.text = element_text(colour = "black"))) 6 | library(ggpointdensity) 7 | 8 | ######## 9 | # Fig 2A 10 | ######## 11 | 12 | # function to plot logos of enhancers 13 | library(ggseqlogo) 14 | my.logo <- function(x, cutoff=NULL){ 15 | p <- ggseqlogo(x, method='custom', seq_type='dna', ncol=1) + 16 | scale_x_continuous(breaks=seq(0,249,25), expand=c(0,0)) + 17 | scale_y_continuous(expand=c(0,0)) + 18 | theme(panel.border = element_rect(colour="black", fill=NA), 19 | axis.ticks = element_line(colour="black")) 20 | if(!is.null(cutoff)) p <- p + geom_hline(yintercept = cutoff, lty="dashed") 21 | p 22 | } 23 | 24 | # nucleotide contribution scores for oligos 25 | twist_contr_scores <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/DeepSTARR_contr_scores_oligo_library.rds")) 26 | 27 | # plots 28 | pdf("Fig2A.pdf", height = 2.1, width = 13) 29 | # dev 30 | i="chr3L_3310914_3311162_-_wt_dCP" 31 | my.logo(twist_contr_scores$dev[[grep(i, names(twist_contr_scores$dev), fixed = T)]]) + ggtitle(paste0(i, " - dev scores")) 32 | 33 | # hk 34 | i="chrX_4580794_4581042_-_wt_hkCP" 35 | my.logo(twist_contr_scores$hk[[grep(i, names(twist_contr_scores$hk), fixed = T)]]) + ggtitle(paste0(i, " - dev scores")) 36 | 37 | dev.off() 38 | 39 | 40 | ######## 41 | # Fig 2C 42 | ######## 43 | 44 | df <- read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Drosophila_mutation_all_instances_results.txt") 45 | # only mutant version 1 46 | df <- df[df$Mutant_version %in% "s1",] 47 | 48 | # per enhancer type 49 | boxplot_list <- list() 50 | for(CP in c("dev", "hk")){ 51 | 52 | # only strong and specific dev or hk enhancers 53 | if(CP=="dev") df_CP <- df[complete.cases(df$dev_mut) & df$enhancer_group %in% "dev",] 54 | if(CP=="hk") df_CP <- df[complete.cases(df$hk_mut) & df$enhancer_group %in% "hk",] 55 | 56 | # rm GATA (in hk because it is similar to Dref) or GATA_no_DRE (in dev) 57 | if(CP=="dev"){ 58 | df_CP <- df_CP[!df_CP$Motif_mutated %in% c("GATA (no Dref)"),] 59 | } 60 | if(CP=="hk"){ 61 | df_CP <- df_CP[!df_CP$Motif_mutated %in% c("GATA"),] 62 | } 63 | 64 | ### rm motifs with only one enhancer 65 | df_CP <- df_CP[!df_CP$Motif_mutated %in% levels(df_CP$Motif_mutated)[table(df_CP$Motif_mutated)<2],] 66 | df_CP$Motif_mutated <- droplevels(df_CP$Motif_mutated) 67 | 68 | df_CP$Motif_mutated2 <- as.character(df_CP$Motif_mutated) 69 | df_CP$Motif_mutated2[grep("ctrl", df_CP$Motif_mutated)] <- "3 controls" 70 | df_CP$Motif_mutated2 <- factor(df_CP$Motif_mutated2, 71 | levels = c("3 controls", "AP-1", "GATA", "GATA (no 
Dref)", "SREBP", "CREB", "twist", "ETS", "STAT", "Trl", "Dref", "Ohler1", "Ohler7", "Ohler6")) 72 | 73 | df_CP$Motif_mutated3 <- as.character(df_CP$Motif_mutated) 74 | df_CP$Motif_mutated3[-grep("ctrl", df_CP$Motif_mutated)] <- "Motif" 75 | df_CP$Motif_mutated3 <- factor(df_CP$Motif_mutated3, 76 | levels = c("ctrl_TAGG", "ctrl_CCTTA", "ctrl_GGGCT", "Motif")) 77 | 78 | ### boxplot all motifs 79 | 80 | # x labels counts per boxplot 81 | xlabels <- sapply(as.character(levels(df_CP$Motif_mutated2)), function(x){ 82 | paste0(gsub("_", " ", x)," (", table(df_CP$Motif_mutated2)[x], ")") 83 | }) 84 | 85 | motif_colours <- sapply(names(xlabels), function(i){ 86 | if(length(grep("3 controls", i))>0) return("grey60") 87 | if(i %in% c("GATA", "GATA (no Dref)", "AP-1", "GAGA", "Trl", "twist", "SREBP", "ETS", "STAT", "CREB")) return("orangered") 88 | if(i %in% c("CAGCTG", "Dref", "Ohler1", "Ohler6", "Ohler7")) return("dodgerblue") 89 | }) 90 | 91 | # if less than 5 points, plot points instead of boxplot 92 | df_CP$FC <- df_CP[,paste0(CP, "_log2FC_wt_mut")] 93 | boxplot <- ggplot(df_CP, aes(Motif_mutated2, FC, fill=Motif_mutated2, alpha=Motif_mutated3)) + 94 | geom_point(size=-1) + 95 | geom_boxplot(data=df_CP[!df_CP$Motif_mutated2 %in% levels(df_CP$Motif_mutated2)[table(df_CP$Motif_mutated2)<=5],], aes(Motif_mutated2, FC, fill=Motif_mutated2, alpha=Motif_mutated3), 96 | outlier.size = 0.6) + 97 | geom_boxplot(data=df_CP[df_CP$Motif_mutated2 %in% levels(df_CP$Motif_mutated2)[table(df_CP$Motif_mutated2)<=5],], aes(Motif_mutated2, FC, fill=Motif_mutated2, alpha=Motif_mutated3), 98 | outlier.size = 0.6, alpha=0.2) + 99 | geom_point(data=df_CP[df_CP$Motif_mutated2 %in% levels(df_CP$Motif_mutated2)[table(df_CP$Motif_mutated2)<=5],], aes(Motif_mutated2, FC, fill=Motif_mutated2, alpha=Motif_mutated3), 100 | size=2.2, shape=21, position = position_jitter(width = 0.15)) + 101 | ylab(paste0("log2 FC ", CP, " enhancer activity")) + 102 | scale_x_discrete("Mutated motifs", breaks= names(xlabels), labels = xlabels) + 103 | geom_hline(yintercept = 0, linetype="dashed", col="grey40") + 104 | scale_fill_manual(values=motif_colours) + 105 | scale_alpha_manual(values=rep(1,5)) + 106 | guides(fill=F, alpha=F) + 107 | theme_bw(base_size = 14) + 108 | theme(axis.text.x = element_text(colour="black", size=15, angle=45, hjust=1), 109 | axis.text.y = element_text(colour="black", size=14), 110 | axis.title = element_text(colour="black", size=16)) + 111 | scale_y_continuous(breaks = seq(-10,20,2), limits = c(-8, 3)) 112 | 113 | boxplot_list[[CP]] <- boxplot 114 | 115 | } 116 | 117 | pdf("Fig2C.pdf", width = 6.5, height = 5.5) 118 | print(boxplot_list$dev) 119 | print(boxplot_list$hk) 120 | dev.off() 121 | 122 | 123 | ######## 124 | # Fig 2D 125 | ######## 126 | 127 | library(ggrepel) 128 | library(dplyr) 129 | 130 | high_col_list_tmp <- c(GATA=rgb(116,159,242, maxColorValue = 255), 131 | "AP-1"=rgb(177,53,115, maxColorValue = 255), 132 | AP1=rgb(177,53,115, maxColorValue = 255), 133 | Trl=rgb(212,147,91, maxColorValue = 255), 134 | twist=rgb(121,170,109, maxColorValue = 255), 135 | ETS=rgb(255,102,102, maxColorValue = 255), 136 | SREBP=rgb(102,102,0, maxColorValue = 255), 137 | CREB3="#CCCC00", 138 | MAF="#E41A1C", 139 | Dref="#51A9FF", 140 | Ohler1="#1E65AB", 141 | Ohler5="#4C0099", 142 | "Ebox/Ohler5"="#4C0099", 143 | Ohler6="#24ACAC", 144 | Ohler7="#B266FF") 145 | 146 | df <- read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/DeepSTARR_motif_imp_and_motif_enrichment.txt") 147 | 148 | # highlight specific 
groups 149 | df$motif_group2 <- NA 150 | df$motif_group2[grep("^AP1", df$Motif_cluster_name, ignore.case = T)] <- "AP-1" 151 | df$motif_group2[grep("^CREB3", df$Motif_cluster_name, ignore.case = T)] <- "CREB3" 152 | df$motif_group2[grep("^GATA", df$Motif_cluster_name, ignore.case = T)] <- "GATA" 153 | df$motif_group2[grep("^Trl|GAGA-repeat", df$Motif_cluster_name, ignore.case = T)] <- "Trl" 154 | df$motif_group2[grep("ETS/4|ETS/5|ETS/1|ETS/7|ETS/2|^ETS$", df$Motif_cluster_name, ignore.case = T)] <- "ETS" 155 | df$motif_group2[grep("SREBP", df$Motif_cluster_name, ignore.case = T)] <- "SREBP" 156 | df$motif_group2[grep("^twi", df$motif_description2, ignore.case = T)] <- "twist" 157 | df$motif_group2[grep("^Ebox/CATCTG$", df$motif_description2, ignore.case = T)] <- "twist" 158 | 159 | df$motif_group2[grep("DRE", df$Motif_cluster_name, ignore.case = T)] <- "Dref" 160 | df$motif_group2[grep("Ohler1", df$Motif_cluster_name, ignore.case = T)] <- "Ohler1" 161 | df$motif_group2[grep("Ohler6", df$Motif_cluster_name, ignore.case = T)] <- "Ohler6" 162 | df$motif_group2[grep("^Ohler7", df$Motif_cluster_name, ignore.case = T)] <- "Ohler7" 163 | 164 | df$motif_group2[grep("dev_new", df$Motif)][!complete.cases(df$motif_group2[grep("dev_new", df$Motif)])] <- "Others dev" 165 | df$motif_group2[grep("hk_new", df$Motif)][!complete.cases(df$motif_group2[grep("hk_new", df$Motif)])] <- "Others hk" 166 | df$motif_group2[grep("DRE_Ohler7", df$motif_description2)] <- "Others hk" 167 | 168 | theme_set(theme_classic(base_size=14) + theme(axis.text = element_text(colour = "black"))) 169 | 170 | pdf("Fig2D.pdf", width = 7.5, height = 5.5) 171 | for(class in c("dev", "hk")){ 172 | 173 | if(class=="dev"){ 174 | df$X <- df$Enrichment_dev_enhancers_log2OR 175 | df$Y <- df$log2FC_Dev 176 | tmp <- df[complete.cases(df$motif_group2) & !df$motif_group2 %in% c("Dref", "Ohler1", "Ohler6", "Ohler7", "Ebox/Ohler5", "Others hk"),] 177 | }else if(class=="hk"){ 178 | df$X <- df$Enrichment_hk_enhancers_log2OR 179 | df$Y <- df$log2FC_Hk 180 | tmp <- df[complete.cases(df$motif_group2) & df$motif_group2 %in% c("Dref", "Ohler1", "Ohler6", "Ohler7", "Ebox/Ohler5", "Others hk"),] 181 | } 182 | 183 | # Cluster representatives (two best by motif enrichment and DeepSTARR prediction) 184 | tmp2 <- rbind(df %>% 185 | group_by(Motif_cluster) %>% 186 | top_n(1, X), 187 | df %>% 188 | group_by(Motif_cluster) %>% 189 | top_n(1, Y)) 190 | tmp2 <- tmp2[!duplicated(tmp2$Motif),] 191 | 192 | gg_type <- ggplot(tmp2, aes(X, Y)) + 193 | geom_point(col="grey70", size=0.8) + 194 | scale_x_continuous("Motif enrichment (log2 odds ratio)", breaks = seq(-10,10,1)) + 195 | scale_y_continuous("DeepSTARR importance (log2 FC)", breaks = seq(-10,10,0.5)) + 196 | geom_hline(yintercept = 0, linetype="dashed", col="grey60") + 197 | geom_vline(xintercept = 0, linetype="dashed", col="grey60") + 198 | geom_point(data=tmp2[tmp2$motif_group2 %in% tmp$motif_group2,], aes(X, Y, col=motif_group2), 199 | size=2.2) + 200 | scale_colour_manual("Motif group", values = c(high_col_list_tmp, "Others dev"="black", "Others hk"="black")) + 201 | theme(axis.title=element_text(size=18), 202 | axis.text=element_text(size=16), 203 | legend.title=element_text(size=18), 204 | legend.text=element_text(size=16)) 205 | 206 | print(gg_type) 207 | 208 | } 209 | dev.off() 210 | 211 | 212 | -------------------------------------------------------------------------------- /Figures/Fig3.R: -------------------------------------------------------------------------------- 1 | 2 | 
library(BSgenome.Dmelanogaster.UCSC.dm3) 3 | library(GenomicRanges) 4 | library(ggplot2) 5 | theme_set(theme_light() + theme(axis.text = element_text(colour = "black"))) 6 | library(ggpointdensity) 7 | 8 | motif_colours <- c(GATA=rgb(116,159,242, maxColorValue = 255), 9 | GATAA=rgb(116,159,242, maxColorValue = 255), 10 | TGA.TCA=rgb(177,53,115, maxColorValue = 255), 11 | AP1=rgb(177,53,115, maxColorValue = 255), 12 | "AP-1"=rgb(177,53,115, maxColorValue = 255), 13 | GAGA=rgb(212,147,91, maxColorValue = 255), 14 | Trl=rgb(212,147,91, maxColorValue = 255), 15 | twist=rgb(121,170,109, maxColorValue = 255), 16 | ATCGAT=rgb(0,128,255, maxColorValue = 255), 17 | Dref=rgb(0,128,255, maxColorValue = 255), 18 | Random="grey60", 19 | "3 controls"="grey60", 20 | ctrl_TAGG="grey60", 21 | ctrl_GGGCT="grey60", 22 | ctrl_CCTTA="grey60", 23 | GGGCT="grey60" 24 | ) 25 | 26 | ######## 27 | # Fig 3A 28 | ######## 29 | 30 | # function to plot logos of enhancers 31 | library(ggseqlogo) 32 | my.logo <- function(x, cutoff=NULL){ 33 | p <- ggseqlogo(x, method='custom', seq_type='dna', ncol=1) + 34 | scale_x_continuous(breaks=seq(0,249,25), expand=c(0,0)) + 35 | scale_y_continuous(expand=c(0,0)) + 36 | theme(panel.border = element_rect(colour="black", fill=NA), 37 | axis.ticks = element_line(colour="black")) 38 | if(!is.null(cutoff)) p <- p + geom_hline(yintercept = cutoff, lty="dashed") 39 | p 40 | } 41 | 42 | # nucleotide contribution scores for oligos 43 | twist_contr_scores <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/DeepSTARR_contr_scores_oligo_library.rds")) 44 | 45 | # plots 46 | pdf("Fig3A.pdf", height = 2.1, width = 13) 47 | 48 | i="chr3L_13015084_13015332_-_wt_dCP" 49 | my.logo(twist_contr_scores$dev[[grep(i, names(twist_contr_scores$dev), fixed = T)]]) + ggtitle(paste0(i, " - dev scores")) 50 | 51 | dev.off() 52 | 53 | 54 | ######## 55 | # Fig 3B 56 | ######## 57 | 58 | mutation_data_and_DeepSTARR <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Drosophila_mutation_data_3_and_DeepSTARR.rds")) 59 | mutation_data_and_DeepSTARR <- mutation_data_and_DeepSTARR[,c(1:14,60:69,75,80,78,73,79,85,86)] 60 | names(mutation_data_and_DeepSTARR)[30:31] <- c("DeepSTARR_dev", "DeepSTARR_hk") 61 | 62 | Motifs <- list(GATAA=data.frame(Motif="GATAA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507"), 63 | TGA.TCA=data.frame(Motif="TGA.TCA", ID="jaspar__MA0476.1"), 64 | twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413"), 65 | Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263"), 66 | ATCGAT=data.frame(Motif="ATCGAT", ID="homer__AVYTATCGATAD_DREF")) 67 | Motifs <- do.call(rbind, Motifs) 68 | 69 | pdf("Fig3B.pdf", width = 6.7, height = 4.7) 70 | library(patchwork) 71 | summary_statistics <- data.frame() 72 | for(motif in unique(Motifs$Motif)){ 73 | 74 | class <- ifelse(motif=="ATCGAT", "hk", "dev") 75 | high_col <- motif_colours[motif] 76 | 77 | if(motif=="twist"){ 78 | tmp <- rbind(mutation_data_and_DeepSTARR[mutation_data_and_DeepSTARR$enhancer_group %in% class & mutation_data_and_DeepSTARR$Motif_mutated %in% "CA..TG" & mutation_data_and_DeepSTARR$wt_instance %in% c("CATCTG", "CAGATG", "CATATG"),], 79 | mutation_data_and_DeepSTARR[mutation_data_and_DeepSTARR$enhancer_group %in% class & mutation_data_and_DeepSTARR$Motif_mutated %in% "twist",]) 80 | }else if(motif=="Trl"){ 81 | tmp <- mutation_data_and_DeepSTARR[mutation_data_and_DeepSTARR$enhancer_group %in% class & mutation_data_and_DeepSTARR$Motif_mutated %in% motif,] 82 
| tmp <- tmp[c(intersect(grep("GAGAG", tmp$instance_sequence_extended), grep("+", tmp$instance_strand)), 83 | intersect(grep("CTCTC", tmp$instance_sequence_extended), grep("-", tmp$instance_strand))),] 84 | }else{ 85 | tmp <- mutation_data_and_DeepSTARR[mutation_data_and_DeepSTARR$enhancer_group %in% class & mutation_data_and_DeepSTARR$Motif_mutated %in% motif,] 86 | } 87 | 88 | if(motif=="ATCGAT"){model_list <- c(names(tmp)[31], as.character(Motifs$ID[Motifs$Motif %in% motif]))}else{model_list <- c(names(tmp)[30], as.character(Motifs$ID[Motifs$Motif %in% motif]))} 89 | 90 | for(model in model_list){ 91 | 92 | if(class=="dev") tmp$var <- tmp$dev_log2FC_wt_mut 93 | if(class=="hk") tmp$var <- tmp$hk_log2FC_wt_mut 94 | tmp <- tmp[complete.cases(tmp$var),] 95 | 96 | g1 <- ggplot(tmp, aes("1", var)) + 97 | geom_violin(alpha=0.7, fill=high_col) + 98 | geom_boxplot(outlier.size = -1, color="black", width=0.18, size=0.8, fill=NA) + 99 | scale_y_continuous(paste0("log2 FC - ", motif, " mutant [observed]"), breaks = seq(-10,4,2)) + 100 | xlab(paste0("All instances\n(n=", nrow(tmp[complete.cases(tmp$var),]), ")")) + 101 | geom_hline(yintercept = 0, linetype="dashed", col="grey40") + 102 | theme(axis.ticks.x = element_blank(), 103 | axis.text.x = element_blank()) 104 | 105 | pc <- cor.test(tmp[,model], tmp$var, 106 | method = "pearson", use="complete.obs") 107 | 108 | g2 <- ggplot(tmp, aes(tmp[,model], var)) + 109 | geom_pointdensity() + 110 | scale_color_gradient(low = "grey70", high = high_col) + 111 | guides(col=F) + 112 | scale_y_continuous(paste0("log2 FC - ", motif, " mutant"), breaks = seq(-10,4,2)) + 113 | scale_x_continuous(paste0("Predicted motif importance")) + 114 | ggtitle(paste0(motif, " mutations - ", model)) + 115 | geom_hline(yintercept = 0, linetype="dashed", col="grey40") + 116 | theme(axis.title.y = element_blank()) + 117 | annotate("text", x=min(tmp[,model], na.rm=T), y = max(tmp$var, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5) 118 | 119 | print(g1 + g2 + plot_layout(widths = c(1,5))) 120 | 121 | summary_statistics <- rbind(summary_statistics, data.frame(motif=motif, 122 | model=model, 123 | PCC=pc$estimate, 124 | pvalue=pc$p.value)) 125 | } 126 | } 127 | 128 | dev.off() 129 | 130 | 131 | ######## 132 | # Fig 3C 133 | ######## 134 | 135 | # read table treated with PWM confident instances 136 | Motif_instances_df <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Drosophila_Motif_instances_df_treated_PWM_info_2.rds")) 137 | 138 | table(Motif_instances_df$Motif, Motif_instances_df$`Motif_p5e-04`>0) 139 | table(Motif_instances_df$Motif_2, Motif_instances_df$`Motif_p5e-04`>0) 140 | 141 | # Only enhancers that require the motifs (log2FC <= -1) and have a confident motif instance 142 | # Motif_instances_df <- Motif_instances_df[complete.cases(Motif_instances_df$log2FC_all_instances) & Motif_instances_df$log2FC_all_instances <= -1 & Motif_instances_df$`Motif_p5e-04`>0,] 143 | # don't subset on PWM score and include neg control motifs 144 | Motif_instances_df <- Motif_instances_df[(complete.cases(Motif_instances_df$log2FC_all_instances) & Motif_instances_df$log2FC_all_instances <= -1) | Motif_instances_df$Motif_2 %in% "3 controls",] 145 | table(Motif_instances_df$Motif_2, Motif_instances_df$`Motif_p5e-04`>0) 146 | 147 | # calculate delta log2FC 148 | df_delta <- Motif_instances_df %>% 149 | group_by(Sequence_ID, Motif) %>% 150 | summarise(Motif_2=unique(Motif_2), 151 | enhancer_group=unique(enhancer_group), 152 | 
Delta=max(log2FC)-min(log2FC), 153 | motif_counts=n()) 154 | 155 | table(df_delta$Motif, df_delta$motif_counts) 156 | table(df_delta$Motif_2, df_delta$motif_counts) 157 | 158 | my_theme <- theme_bw(base_size = 14) + 159 | theme(axis.text.x = element_text(colour="black", size=15, angle=45, hjust=1), 160 | axis.text.y = element_text(colour="black", size=14), 161 | axis.title = element_text(colour="black", size=16), 162 | strip.text = element_text(colour="black", size=14)) 163 | 164 | #### compare delta in FC to wt 165 | 166 | tmp <- df_delta[df_delta$Motif_2 %in% c("3 controls", "AP-1", "GATA", "twist", "Trl", "Dref") & df_delta$motif_counts>1,] 167 | tmp$Motif_2 <- droplevels(tmp$Motif_2) 168 | 169 | # x labels counts per boxplot 170 | xlabels <- sapply(as.character(levels(tmp$Motif_2)), function(x){ 171 | paste0(gsub("_", " ", x)," (", table(tmp$Motif_2)[x], ")") 172 | }) 173 | 174 | # test difference to 3 controls 175 | wilcox.test(tmp$Delta[tmp$Motif_2 == "AP-1" & tmp$enhancer_group=="dev"], tmp$Delta[tmp$Motif_2 == "3 controls" & tmp$enhancer_group=="dev"]) # n.s. 176 | wilcox.test(tmp$Delta[tmp$Motif_2 == "GATA" & tmp$enhancer_group=="dev"], tmp$Delta[tmp$Motif_2 == "3 controls" & tmp$enhancer_group=="dev"]) # *** 177 | wilcox.test(tmp$Delta[tmp$Motif_2 == "twist" & tmp$enhancer_group=="dev"], tmp$Delta[tmp$Motif_2 == "3 controls" & tmp$enhancer_group=="dev"]) # * 178 | wilcox.test(tmp$Delta[tmp$Motif_2 == "Trl" & tmp$enhancer_group=="dev"], tmp$Delta[tmp$Motif_2 == "3 controls" & tmp$enhancer_group=="dev"]) # *** 179 | wilcox.test(tmp$Delta[tmp$Motif_2 == "Dref" & tmp$enhancer_group=="hk"], tmp$Delta[tmp$Motif_2 == "3 controls" & tmp$enhancer_group=="hk"]) # **** 180 | 181 | gg <- ggplot(tmp, aes(Motif_2, Delta, fill=Motif_2)) + 182 | geom_violin(alpha=0.7) + 183 | geom_boxplot(outlier.size = -1, color="black", width=0.15, size=0.8, fill=NA) + 184 | facet_grid(~enhancer_group, scales = "free_x", space="free_x") + 185 | scale_fill_manual(values=motif_colours) + 186 | guides(fill=F) + 187 | scale_x_discrete("Mutated motifs", labels=xlabels) + 188 | scale_y_continuous(paste0("log2 FC between instances\nin the same enhancer"), breaks = seq(-10,10,1), expand=c(0,0), limits = c(0,5.9)) + 189 | my_theme 190 | 191 | pdf(paste0("Fig3C_left.pdf"), height = 5, width = 7) 192 | print(gg + geom_hline(yintercept = 1, col="black", linetype="dashed", size=0.7)) 193 | dev.off() 194 | 195 | ### barplots 196 | tmp_melt <- reshape::melt(table(tmp$Motif_2, tmp$Delta >= log2(2))) 197 | tmp_melt <- tmp_melt[tmp_melt$Var.1 %in% c("3 controls", "AP-1", "GATA", "twist", "Trl", "Dref"),] 198 | tmp_melt$Var.1 <- factor(tmp_melt$Var.1, levels=c("3 controls", "AP-1", "GATA", "twist", "Trl", "Dref")) 199 | 200 | gg_bar <- ggplot(tmp_melt, aes(Var.1, value)) + 201 | xlab("Mutated motifs") + 202 | scale_x_discrete("Mutated motifs", labels=xlabels) + 203 | scale_y_continuous(expand=c(0,0)) + 204 | theme_bw(base_size = 14) + 205 | theme(axis.text.x = element_text(colour=motif_colours[levels(tmp_melt$Var.1)], size=15, angle=45, hjust=1), 206 | axis.text.y = element_text(colour="black", size=14), 207 | axis.title = element_text(colour="black", size=16), 208 | strip.text = element_text(colour="black", size=14)) + 209 | scale_fill_manual("> 2-fold diff", values=c("grey90", "grey50")) + 210 | scale_colour_manual("log2 FC < -1", values=c("grey90", "red")) + 211 | guides(colour=F) 212 | 213 | # barplots - with line on average (excluding neg regions) 214 | av <- round(mean(prop.table(table(tmp$Motif_2, tmp$Delta >= log2(2)), 
margin = 1)[-1,2]),3) 215 | ggsave("Fig3C_right.pdf", 216 | gg_bar + geom_bar(stat = "identity", position="fill", aes(fill = factor(Var.2)), width=0.8, colour="black", size=0.3) + geom_hline(yintercept = av, linetype="dashed", col="black") + scale_y_continuous("Proportion of enhancers [%]", expand=c(0,0), breaks = c(seq(0,1,0.25),av), labels = c(seq(0,1,0.25),av)*100), 217 | width = 6, height = 5) 218 | 219 | 220 | ######## 221 | # Fig 3D 222 | ######## 223 | 224 | final_statistics <- summary_statistics 225 | final_statistics$motif <- factor(final_statistics$motif, levels = c("GATAA", "TGA.TCA", "twist", "Trl", "ATCGAT"), 226 | labels = c("GATA", "AP-1", "twist", "Trl", "Dref")) 227 | 228 | final_statistics$model2 <- "PWM" 229 | final_statistics$model2[grep("DeepSTARR_dev", final_statistics$model)] <- "DeepSTARR log2FC (dev)" 230 | final_statistics$model2[grep("DeepSTARR_hk", final_statistics$model)] <- "DeepSTARR log2FC (hk)" 231 | final_statistics$model2 <- factor(final_statistics$model2, levels = c("PWM", "DeepSTARR log2FC (dev)", "DeepSTARR log2FC (hk)")) 232 | 233 | # correct PCC from log2FC comparisons 234 | final_statistics$PCC[-grep("DeepSTARR", final_statistics$model)] <- -final_statistics$PCC[-grep("DeepSTARR", final_statistics$model)] 235 | 236 | gg <- ggplot(final_statistics, aes(x=motif, y=PCC, fill=model2)) + 237 | geom_bar(stat="identity", position=position_dodge(), width=0.7, colour="black", size=0.2) + 238 | scale_fill_manual("Model", values=c("grey60", "orangered", "dodgerblue")) + 239 | scale_y_continuous("PCC", breaks = seq(-10,4,0.1), limits = c(0,0.55), expand = c(0,0)) + 240 | scale_x_discrete("Motif mutated", labels=c("GATA", "AP-1", "twist", "Trl", "Dref")) 241 | 242 | ggsave("Fig3D.pdf", gg, width = 7.1, height = 4) 243 | -------------------------------------------------------------------------------- /Figures/Fig5.R: -------------------------------------------------------------------------------- 1 | 2 | library(BSgenome.Dmelanogaster.UCSC.dm3) 3 | library(GenomicRanges) 4 | library(ggplot2) 5 | theme_set(theme_light() + theme(axis.text = element_text(colour = "black"))) 6 | library(patchwork) 7 | library(dplyr) 8 | 9 | high_col_list <- c(GATA=rgb(116,159,242, maxColorValue = 255), 10 | GATAA=rgb(116,159,242, maxColorValue = 255), 11 | TGA.TCA=rgb(177,53,115, maxColorValue = 255), 12 | GAGA=rgb(212,147,91, maxColorValue = 255), 13 | CATCTG=rgb(121,170,109, maxColorValue = 255), 14 | CA..TG=rgb(121,170,109, maxColorValue = 255), 15 | ATCGAT=rgb(0,128,255, maxColorValue = 255), 16 | "AP-1"=rgb(177,53,115, maxColorValue = 255), 17 | AP1=rgb(177,53,115, maxColorValue = 255), 18 | Trl=rgb(212,147,91, maxColorValue = 255), 19 | twist=rgb(121,170,109, maxColorValue = 255), 20 | ETS=rgb(255,102,102, maxColorValue = 255), 21 | SREBP=rgb(102,102,0, maxColorValue = 255), 22 | Dref=rgb(0,128,255, maxColorValue = 255), 23 | Ohler1=rgb(0,76,153, maxColorValue = 255), 24 | Ohler5="#4C0099", 25 | "Ebox/Ohler5"="#4C0099", 26 | Ohler6=rgb(0,153,153, maxColorValue = 255), 27 | Ohler7="#C899F7", 28 | Random="grey60", 29 | GGGCT="grey60") 30 | 31 | ######## 32 | # Fig 5B 33 | ######## 34 | 35 | ### load motif pair co-occurence results 36 | fisher_results <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Drosophila_motif_pair_distance_preferences_genomic_enhancers_Fisher_enrichment.rds")) 37 | fisher_results <- fisher_results %>% group_by(type) %>% mutate(fisher_FDR=p.adjust(fisher_p, method="fdr"), 38 | fisher_FDR_label=ifelse(p.adjust(fisher_p, 
method="fdr")<0.05, "*", "")) 39 | fisher_results <- fisher_results[fisher_results$type %in% "All_pairs",] 40 | fisher_results$Distance <- factor(fisher_results$Distance, levels = c("0-25", "25-50", "50-75", "75-100", "100-125", "125-150", "150-250")) 41 | fisher_results$m1 <- gsub("AP1", "AP-1", fisher_results$m1) 42 | fisher_results$m2 <- gsub("AP1", "AP-1", fisher_results$m2) 43 | 44 | 45 | ### load linear regression association with enhancer activity 46 | lm_dist <- read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Drosophila_validation_activity_motif_distance_genomic_sequences_all_per_bin.txt", stringsAsFactors = F) 47 | lm_dist <- lm_dist %>% mutate(FDR=p.adjust(pvalue, method="fdr"), 48 | FDR_label=ifelse(p.adjust(pvalue, method="fdr")<0.05, "*", "")) 49 | 50 | lm_dist$cutoff <- factor(lm_dist$cutoff, levels = c("0-25", "25-50", "50-75", "75-100", "100-125", "125-150", "150-250")) 51 | lm_dist$m1 <- gsub("AP1", "AP-1", lm_dist$m1) 52 | lm_dist$m2 <- gsub("AP1", "AP-1", lm_dist$m2) 53 | 54 | 55 | ### load in silico predictions 56 | df_summary_main <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/DeepSTARR_motif_pair_distance_synth_sequences_df_summary_main.rds")) 57 | 58 | candidate_motifs <- c(GATA="AGATAAGA", 59 | "AP-1"="ATGACTCAT", 60 | twist="ACATCTGT", 61 | Trl="AGAGAGA", 62 | ETS="ACCGGAAG", 63 | SREBP="ATCACGCCA", 64 | Dref="TATCGATA", 65 | Ohler1="GTGTGACC", 66 | Ohler6="AAAATACCA") 67 | 68 | ### plots per motifA at the center 69 | pdf("Fig5B.pdf", width = 7, height = 7) 70 | for(motif1 in names(candidate_motifs)){ 71 | 72 | if(motif1 %in% c("Dref", "Ohler1", "Ohler6")){type="hk"}else{type="dev"} 73 | 74 | m1_strand <- "+" 75 | m2_strand <- "+" 76 | 77 | # table to use 78 | tmp_main <- df_summary_main[df_summary_main$Motif1 %in% motif1 & df_summary_main$Motif1_strand %in% m1_strand & df_summary_main$Motif2_strand %in% m2_strand,] 79 | 80 | # enhancer type 81 | if(type=="dev"){ 82 | tmp_main <- tmp_main[!tmp_main$Motif2 %in% c("Dref", "Ohler1", "Ohler6"),] 83 | tmp_main$Backbone_motif2_act <- tmp_main$Backbone_motif2_act_dev 84 | tmp_main$Fold_change_BPNet_style <- tmp_main$Fold_change_BPNet_style_dev 85 | tmp_main$Residuals_FC <- tmp_main$Residuals_FC_dev 86 | tmp_main$Backbone_motif1_motif2_act <- tmp_main$Backbone_motif1_motif2_act_dev 87 | } 88 | if(type=="hk"){ 89 | tmp_main <- tmp_main[tmp_main$Motif2 %in% c("Dref", "Ohler1", "Ohler6", "GGGCT"),] 90 | tmp_main$Backbone_motif2_act <- tmp_main$Backbone_motif2_act_hk 91 | tmp_main$Fold_change_BPNet_style <- tmp_main$Fold_change_BPNet_style_hk 92 | tmp_main$Residuals_FC <- tmp_main$Residuals_FC_hk 93 | tmp_main$Backbone_motif1_motif2_act <- tmp_main$Backbone_motif1_motif2_act_hk 94 | } 95 | 96 | # final summarised table 97 | tmp <- tmp_main %>% 98 | group_by(Distance_Motif2_Motif1, Motif2) %>% 99 | summarise(Backbone_motif2_act=median(Backbone_motif2_act), 100 | Fold_change_BPNet_style=median(Fold_change_BPNet_style), 101 | Residuals_FC=median(Residuals_FC), 102 | Backbone_motif1_motif2_act=median(Backbone_motif1_motif2_act)) 103 | tmp$side <- factor(ifelse(tmp$Distance_Motif2_Motif1>0, "down", "up"), levels = c("up", "down")) 104 | 105 | # select pair motif 106 | if(type=="dev") m2_list <- c("GATA", "AP-1", "SREBP", "twist","ETS", "Trl") 107 | if(type=="hk") m2_list <- c("Dref", "Ohler1", "Ohler6") 108 | 109 | for(motif2 in m2_list){ 110 | tmp2 <- tmp[tmp$Motif2 %in% motif2,] 111 | 112 | # Interaction residuals FC 113 | g_smooth_res <- ggplot(tmp2, aes(abs(Distance_Motif2_Motif1), 
Residuals_FC, col=Motif2, group=Motif2)) + 
114 | geom_point(alpha=0.3)+ 
115 | geom_smooth(span = 0.1, se=F)+ 
116 | ggtitle(paste0(motif1, " / ", motif2)) + 
117 | theme(plot.title = element_text(hjust=0.5)) + 
118 | scale_y_continuous("DeepSTARR predicted cooperativity") + 
119 | scale_x_continuous("Motif pair distance (bp)", breaks = seq(-200,200,25), limits = c(0,125)) + 
120 | geom_hline(yintercept = 1, linetype="dashed", col="grey60") + 
121 | scale_color_manual("", values=high_col_list, drop=T) + 
122 | guides(col=F) + 
123 | theme(axis.title.x = element_blank(), 
124 | axis.text = element_text(colour="black", size=12)) 
125 | 
126 | ### add barplot of motif pair co-occurrence 
127 | fisher_results_tmp <- fisher_results[fisher_results$m1==motif1 & fisher_results$m2==motif2 & fisher_results$Distance %in% c("0-25", "25-50", "50-75", "75-100", "100-125"),] 
128 | 
129 | g_occurence <- ggplot(fisher_results_tmp, aes(Distance, log2(fisher_OR), fill=log2(fisher_OR))) + 
130 | geom_bar(stat="identity") + 
131 | geom_hline(yintercept = 0, linetype="dashed", col="grey60") + 
132 | scale_fill_gradient2("Odds ratio", low = RColorBrewer::brewer.pal(7, "PRGn")[1], mid = "grey70", high = RColorBrewer::brewer.pal(7, "PRGn")[7], midpoint = 0) + 
133 | # scale_fill_brewer("Odds ratio", palette="PRGn") + 
134 | guides(fill=F) + 
135 | ylab("Occurrence\n[log2 OR]") + 
136 | xlab("Motif pair distance (bp)") + 
137 | geom_text(aes(y=0, label=fisher_FDR_label), color="black", size=10) + 
138 | theme(axis.text = element_text(colour="black", size=12), 
139 | axis.title.x = element_text(colour="black", size=15)) 
140 | 
141 | ### add barplot of association with enhancer activity (lm coefficients) 
142 | lm_dist_tmp <- lm_dist[lm_dist$m1==motif1 & lm_dist$m2==motif2 & lm_dist$cutoff %in% c("0-25", "25-50", "50-75", "75-100", "100-125"),] 
143 | 
144 | g_association <- ggplot(lm_dist_tmp, aes(cutoff, Estimate_lower_than, fill=Estimate_lower_than)) + 
145 | geom_bar(stat="identity") + 
146 | geom_hline(yintercept = 0, linetype="dashed", col="grey60") + 
147 | scale_fill_gradient2("lm coef", low = "#2166AC", mid = "grey70", high = "#B2182B", midpoint = 0) + 
148 | guides(fill=F) + 
149 | scale_y_continuous("Enh activity\n[lm coef]") + 
150 | xlab("Motif pair distance (bp)") + 
151 | geom_text(aes(y=0, label=FDR_label), color="black", size=10) + 
152 | theme(axis.title.x = element_blank(), 
153 | axis.text = element_text(colour="black", size=12)) 
154 | 
155 | print(g_smooth_res + g_association + g_occurence + plot_layout(ncol=1, heights = c(2.5,1,1))) 
156 | 
157 | } 
158 | 
159 | print(motif1) 
160 | } 
161 | 
162 | dev.off() 
163 | 
164 | 
165 | ######## 
166 | # Fig 5C 
167 | ######## 
168 | 
169 | ### plots per motifA at the center 
170 | for(motif1 in names(candidate_motifs)){ 
171 | 
172 | func="median" 
173 | if(motif1 %in% c("Dref", "Ohler1", "Ohler6")){type="hk"}else{type="dev"} 
174 | 
175 | pdf(paste0("Fig5C_", as.character(motif1), ".pdf"), width = 7, height = 5.5) 
176 | 
177 | for(m1_strand in c("+", "-")){ 
178 | for(m2_strand in c("+", "-")){ 
179 | 
180 | # table to use 
181 | tmp_main <- df_summary_main[df_summary_main$Motif1 %in% motif1 & df_summary_main$Motif1_strand %in% m1_strand & df_summary_main$Motif2_strand %in% m2_strand,] 
182 | 
183 | # enhancer type 
184 | if(type=="dev"){ 
185 | tmp_main$Backbone_motif2_act <- tmp_main$Backbone_motif2_act_dev 
186 | tmp_main$Fold_change_BPNet_style <- tmp_main$Fold_change_BPNet_style_dev 
187 | tmp_main$Residuals_FC <- tmp_main$Residuals_FC_dev 
188 | tmp_main$Backbone_motif1_motif2_act <- tmp_main$Backbone_motif1_motif2_act_dev 
189 | } 
190 | if(type=="hk"){ 
191 | 
tmp_main$Backbone_motif2_act <- tmp_main$Backbone_motif2_act_hk 
192 | tmp_main$Fold_change_BPNet_style <- tmp_main$Fold_change_BPNet_style_hk 
193 | tmp_main$Residuals_FC <- tmp_main$Residuals_FC_hk 
194 | tmp_main$Backbone_motif1_motif2_act <- tmp_main$Backbone_motif1_motif2_act_hk 
195 | } 
196 | 
197 | # final summarised table 
198 | tmp <- tmp_main %>% 
199 | group_by(Distance_Motif2_Motif1, Motif2) %>% 
200 | summarise(Backbone_motif2_act=median(Backbone_motif2_act), 
201 | Fold_change_BPNet_style=median(Fold_change_BPNet_style), 
202 | Residuals_FC=median(Residuals_FC), 
203 | Backbone_motif1_motif2_act=median(Backbone_motif1_motif2_act)) 
204 | tmp$side <- factor(ifelse(tmp$Distance_Motif2_Motif1>0, "down", "up"), levels = c("up", "down")) 
205 | 
206 | # Interaction residuals FC 
207 | g_smooth_res <- ggplot(tmp, aes(abs(Distance_Motif2_Motif1), Residuals_FC, col=Motif2, group=Motif2)) + 
208 | geom_point(alpha=0.3)+ 
209 | geom_smooth(span = 0.1, se=F)+ 
210 | ggtitle(paste0(motif1, m1_strand, "/motif2", m2_strand, " : ", func, " across ", length(unique(df_summary_main$Backbone)), " backbones")) + 
211 | theme(plot.title = element_text(hjust=0.5)) + 
212 | scale_y_continuous("DeepSTARR predicted cooperativity") + 
213 | scale_x_continuous("Motif pair distance (bp)", breaks = c(-10,10,seq(-200,200,25))) + 
214 | geom_hline(yintercept = 1, linetype="dashed", col="grey60") + 
215 | scale_color_manual("", values=high_col_list, drop=T) 
216 | 
217 | print(g_smooth_res) 
218 | 
219 | } 
220 | } 
221 | 
222 | dev.off() 
223 | 
224 | print(motif1) 
225 | } 
226 | 
227 | 
228 | ######## 
229 | # Fig 5D,E 
230 | ######## 
231 | 
232 | ## Get confident motif positions 
233 | 
234 | library(motifmatchr) 
235 | library(TFBSTools) 
236 | library(seqinr) 
237 | 
238 | # Motifs 
239 | load(url("https://data.starklab.org/almeida/Motif_clustering/TF_clusters_PWMs.RData")) 
240 | View(TF_clusters_PWMs$metadata) 
241 | load(url("https://data.starklab.org/almeida/Drosophila_enhancers_motif_enrichment/Motif_enrichment_all.RData")) 
242 | View(Results_list_all$dev_vs_ctrl) 
243 | View(Results_list_all$hk_vs_ctrl) 
244 | 
245 | TF_motifs <- list(srp=data.frame(Motif="GATA", ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507", core="GATAA"), 
246 | kay_Jra=data.frame(Motif="AP1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291", core="TGA.TCA"), 
247 | twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413", core="CAGATG"), 
248 | Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263", core="GAGA"), 
249 | ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510", core="CCGGAA"), 
250 | SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234", core="TCACGCGA"), 
251 | Dref=data.frame(Motif="Dref", ID="homer__AVYTATCGATAD_DREF", core="TATCGATA"), 
252 | Ohler1=data.frame(Motif="Ohler1", ID="homer__MYGGTCACACTG_Unknown1", core="GGTCACACT"), 
253 | Ohler6=data.frame(Motif="Ohler6", ID="homer__AAAAATACCRMA_Unknown4", core="AAATACCA")) 
254 | TF_motifs <- do.call(rbind, TF_motifs) 
255 | 
256 | # enhancers 
257 | twist_enhancers <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Drosophila_mutation_individual_instances.rds")) 
258 | Final_enhancers_selected <- GRanges(paste0(sapply(strsplit(twist_enhancers$Sequence_ID,"_"), `[`, 1), 
259 | ":", sapply(strsplit(twist_enhancers$Sequence_ID,"_"), `[`, 2), 
260 | "-", sapply(strsplit(twist_enhancers$Sequence_ID,"_"), `[`, 3)), 
261 | strand = twist_enhancers$Strand, 
262 | Sequence_ID=twist_enhancers$Sequence_ID, 
263 | 
Sequence=twist_enhancers$Sequence, 264 | enhancer_group=twist_enhancers$enhancer_group, 265 | seqinfo = Dmelanogaster@seqinfo) 266 | Final_enhancers_selected <- unique(Final_enhancers_selected) 267 | 268 | # motif positions in oligo 269 | motif_ix_pos <- matchMotifs(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID], 270 | Final_enhancers_selected$Sequence, 271 | genome = "BSgenome.Dmelanogaster.UCSC.dm3", p.cutoff = 5e-4, bg="genome", out = "positions") 272 | names(motif_ix_pos) <- name(TF_clusters_PWMs$All_pwms_log_odds[name(TF_clusters_PWMs$All_pwms_log_odds) %in% TF_motifs$ID]) 273 | 274 | motif_ix_pos2 <- lapply(motif_ix_pos, function(motif_x){ 275 | names(motif_x) <- Final_enhancers_selected$Sequence_ID 276 | motif_x <- motif_x[sapply(motif_x, length)>0] # remove sequences without motif 277 | # join all IRanges in same GRanges 278 | motif_x <- GRanges(names(unlist(motif_x)), #Use sequence ID as seqnames 279 | IRanges(start(unlist(motif_x)), end(unlist(motif_x))), 280 | strand = mcols(unlist(motif_x))$strand, 281 | score=mcols(unlist(motif_x))$score) 282 | ### reduce - keep strand information 283 | # motif_x <- my_reduce(motif_x, min.frac.ov=0.7) 284 | ## instead, use plyranges::reduce_ranges to get motif score back - but here there is no min overlap 285 | # motif_x <- reduce_ranges(motif_x, max_score = max(score), sum_score = sum(score), Number=n()) # bad because one bp overlapping is too stringent, use the adapted version below 286 | motif_x <- my_reduce_with_score(motif_x, min.frac.ov=0.5) 287 | # add wt sequence 288 | motif_x$enh_strand <- twist_enhancers$Strand[match(as.character(motif_x@seqnames), twist_enhancers$Sequence_ID)] 289 | motif_x$seq <- substr(as.character(getSeq(Dmelanogaster, GRanges(paste0(sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 1), 290 | ":", sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 2), 291 | "-", sapply(strsplit(as.character(motif_x@seqnames),"_"), `[`, 3)), 292 | strand = motif_x$enh_strand))), 293 | motif_x@ranges@start, 294 | motif_x@ranges@start+motif_x@ranges@width-1) 295 | return(motif_x) 296 | }) 297 | lapply(motif_ix_pos2, function(i) table(width(i))) 298 | 299 | motif_ix_pos3 <- lapply(names(motif_ix_pos2), function(i){ 300 | x <- data.frame(motif_ix_pos2[[i]], stringsAsFactors = F) 301 | names(x)[1] <- "Sequence" 302 | x$Sequence <- as.character(x$Sequence) 303 | x$strand <- as.character(x$strand) 304 | x$Motif <- TF_motifs$Motif[TF_motifs$ID %in% i] 305 | return(x) 306 | }) 307 | names(motif_ix_pos3) <- names(motif_ix_pos2) 308 | 309 | sapply(motif_ix_pos2, length) 310 | sapply(motif_ix_pos3, nrow) 311 | 312 | saveRDS(do.call(rbind, motif_ix_pos3), file = "Motif_oligo_positions_list.rds") 313 | 314 | 315 | 316 | ### motif pairs - associations with distance 317 | 318 | # mutation data 319 | mutation_data_2 <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Drosophila_mutation_individual_instances_extended_info.rds")) 320 | # mutation_data_2 <- mutation_data_2[-grep("all_mutant",mutation_data_2$Oligo_ID),] 321 | mutation_data_2 <- mutation_data_2[complete.cases(mutation_data_2$enhancer_group),] 322 | 323 | # confident TF motif positions 324 | Motif_oligo_positions <- readRDS(url("Motif_oligo_positions_list.rds")) 325 | Motif_oligo_positions$instance_center <- Motif_oligo_positions$start+(Motif_oligo_positions$end-Motif_oligo_positions$start)/2 326 | table(Motif_oligo_positions$Motif) 327 | 328 | TF_motifs <- list(srp=data.frame(Motif="GATA", 
ID="flyfactorsurvey__srp_SANGER_5_FBgn0003507", core="GATAA"), 329 | kay_Jra=data.frame(Motif="AP1", ID="flyfactorsurvey__kay_Jra_SANGER_5_FBgn0001291", core="TGA.TCA"), 330 | twist=data.frame(Motif="twist", ID="flyfactorsurvey__twi_da_SANGER_5_FBgn0000413", core="CAGATG"), 331 | Trl=data.frame(Motif="Trl", ID="flyfactorsurvey__Trl_FlyReg_FBgn0013263", core="GAGA"), 332 | ETS=data.frame(Motif="ETS", ID="flyfactorsurvey__Ets97D_SANGER_10_FBgn0004510", core="CCGGAA"), 333 | SREBP=data.frame(Motif="SREBP", ID="flyfactorsurvey__HLH106_SANGER_10_FBgn0015234", core="TCACGCGA"), 334 | Dref=data.frame(Motif="Dref", ID="homer__AVYTATCGATAD_DREF", core="TATCGATA"), 335 | Ohler1=data.frame(Motif="Ohler1", ID="homer__MYGGTCACACTG_Unknown1", core="GGTCACACT"), 336 | Ohler6=data.frame(Motif="Ohler6", ID="homer__AAAAATACCRMA_Unknown4", core="AAATACCA")) 337 | TF_motifs <- do.call(rbind, TF_motifs) 338 | 339 | for(m1 in TF_motifs$Motif[c(1:4,7)]){ 340 | 341 | pdf(paste0("Fig5D_5E_", m1, ".pdf"), width = 3, height = 4.5) 342 | 343 | if(m1=="GATA") m1_twist <- "GATAA" 344 | if(m1=="AP1") m1_twist <- "TGA.TCA" 345 | if(m1=="Trl") m1_twist <- "GAGA" 346 | if(m1=="twist") m1_twist <- "CA..TG" 347 | if(m1=="Dref") m1_twist <- "ATCGAT" 348 | 349 | tmp <- Motif_oligo_positions[Motif_oligo_positions$Motif %in% m1 & Motif_oligo_positions$Sequence %in% mutation_data_2$Sequence_ID[mutation_data_2$Motif_mutated %in% m1_twist],] 350 | 351 | # mutations 352 | class <- ifelse(m1=="Dref", "hk", "dev") 353 | mutation_data_2_tmp <- mutation_data_2[mutation_data_2$Motif_mutated == m1_twist & mutation_data_2$enhancer_group %in% class,] 354 | 355 | tmp$log2FC <- sapply(1:nrow(tmp), function(i){ 356 | if(m1!="Dref") out <- mutation_data_2_tmp$dev_log2FC_wt_mut[subjectHits(findOverlaps(GRanges(tmp$Sequence[i], IRanges(tmp$start[i], tmp$end[i])), 357 | GRanges(mutation_data_2_tmp$Sequence_ID, IRanges(mutation_data_2_tmp$instance_start, mutation_data_2_tmp$instance_end)), 358 | minoverlap = unique(nchar(mutation_data_2_tmp$wt_instance))))] 359 | if(m1=="Dref") out <- mutation_data_2_tmp$hk_log2FC_wt_mut[subjectHits(findOverlaps(GRanges(tmp$Sequence[i], IRanges(tmp$start[i], tmp$end[i])), 360 | GRanges(mutation_data_2_tmp$Sequence_ID, IRanges(mutation_data_2_tmp$instance_start, mutation_data_2_tmp$instance_end)), 361 | minoverlap = unique(nchar(mutation_data_2_tmp$wt_instance))))] 362 | if(length(out)>0){return(mean(out))}else{return(NA)} 363 | }) 364 | tmp <- tmp[complete.cases(tmp$log2FC),] 365 | 366 | # get number of instances and their distance to each partner motif 367 | add_info <- lapply(TF_motifs$Motif, function(m2){ 368 | Number <- sapply(1:nrow(tmp), function(s){ 369 | m2_df <- Motif_oligo_positions[Motif_oligo_positions$Sequence %in% tmp$Sequence[s] & Motif_oligo_positions$Motif %in% m2,] 370 | if(nrow(m2_df)>0){return(nrow(m2_df))}else{return(0)} 371 | }) 372 | Score <- sapply(1:nrow(tmp), function(s){ 373 | m2_df <- Motif_oligo_positions[Motif_oligo_positions$Sequence %in% tmp$Sequence[s] & Motif_oligo_positions$Motif %in% m2,] 374 | if(nrow(m2_df)>0){return(sum(m2_df$max_score))}else{return(0)} 375 | }) 376 | Distance <- sapply(1:nrow(tmp), function(s){ 377 | m2_df <- Motif_oligo_positions[Motif_oligo_positions$Sequence %in% tmp$Sequence[s] & Motif_oligo_positions$Motif %in% m2 & !rownames(Motif_oligo_positions) %in% rownames(tmp)[s],] 378 | if(nrow(m2_df)>0){ 379 | dist <- m2_df$instance_center-tmp$instance_center[s] 380 | return(dist[abs(dist)==min(abs(dist))][1]) 381 | }else{return(NA)} 382 | }) 383 | # double 
check 384 | out <- data.frame(Number, Score, Distance) 385 | names(out) <- paste0(m2, c("_n", "_score", "_dist")) 386 | return(out) 387 | }) 388 | 389 | tmp2 <- cbind(tmp, do.call(cbind, add_info)) 390 | 391 | # plots 392 | # select pair motif 393 | if(class=="dev") m2_list <- c("GATA", "AP1", "SREBP", "twist","ETS", "Trl") 394 | if(class=="hk") m2_list <- c("Dref", "Ohler1", "Ohler6") 395 | 396 | for(m2 in m2_list){ 397 | 398 | gg_df <- tmp2[complete.cases(tmp2[,paste0(m2, "_dist")]),] # restricting by complete distances means it has at least 2 when is homotypic pair, and at least one of each when heterotypic. Should I limit homotypic to 2 instances 399 | 400 | # remove overlapping instances (distance between centers > motif length) 401 | gg_df <- gg_df[abs(gg_df[,paste0(m2, "_dist")]) > median(gg_df$width),] 402 | 403 | gg_df$class <- NA 404 | gg_df$class[abs(gg_df[,paste0(m2, "_dist")])<25] <- "<25" 405 | gg_df$class[abs(gg_df[,paste0(m2, "_dist")])>50] <- ">50" 406 | table(gg_df$class) 407 | 408 | g2 <- ggplot(gg_df[complete.cases(gg_df$class),], aes(class, log2FC, colour=class)) + 409 | geom_boxplot(fill=NA, size=1) + 410 | scale_colour_manual(values=c("<25"="#2166AC", ">50"="#B2182B")) + 411 | guides(col=F) + 412 | scale_y_continuous(paste0("log2 FC enhancer activity [",m1,"]"), breaks = seq(-10,10,2)) + 413 | scale_x_discrete(paste0(m1, "-", m2," distance (bp)"), 414 | labels=c(paste0("<25\n(n=", length(which(gg_df$class=="<25")),")"), 415 | paste0(">50\n(n=", length(which(gg_df$class==">50")),")"))) + 416 | geom_hline(yintercept = 0, linetype="dashed", col="grey60") 417 | 418 | wilcox.test(gg_df$log2FC~gg_df$class) 419 | 420 | print(g2) 421 | 422 | } 423 | 424 | dev.off() 425 | print(m1) 426 | 427 | } 428 | 429 | -------------------------------------------------------------------------------- /Figures/Fig7.R: -------------------------------------------------------------------------------- 1 | 2 | library(GenomicRanges) 3 | library(ggplot2) 4 | theme_set(theme_light() + theme(axis.text = element_text(colour = "black"))) 5 | library(cowplot) 6 | library(dplyr) 7 | 8 | ######## 9 | # Fig 7A 10 | ######## 11 | 12 | df_twist <- read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Twist_mutagenesis_Drosophila_final_table.txt", stringsAsFactors = F)[,c(1:5,7,8,27,31)] 13 | df_twist <- df_twist[complete.cases(df_twist$dev_log2FoldChange) & complete.cases(df_twist$hk_log2FoldChange),] 14 | 15 | tmp <- read.delim("https://data.starklab.org/almeida/DeepSTARR/Figures_data/Table_synthetic_enhancers.txt") 16 | tmp$obs <- tmp$dev_log2FoldChange 17 | tmp$pred <- tmp$Predictions_dev 18 | 19 | ### add wt enhancers - just to keep same axis limits 20 | tmp_wt <- df_twist[df_twist$Enhancer_type %in% c("dev") & df_twist$dev_log2FoldChange>3.149511 & df_twist$dev_log2FoldChange>df_twist$hk_log2FoldChange,] 21 | tmp_wt$obs <- tmp_wt$dev_log2FoldChange 22 | 23 | tmp$pred_bins <- cut(tmp$pred, breaks = c(floor(min(tmp$pred)), 2,3,4,5, ceiling(max(tmp$pred))), include.lowest = T) 24 | 25 | # names 26 | bins_labels <- sapply(levels(tmp$pred_bins), function(x){ 27 | paste0(x, "\n(n=", table(tmp$pred_bins)[x], ")") 28 | }) 29 | 30 | gg_boxplot <- ggplot() + 31 | geom_boxplot(data=tmp_wt, aes("Native", obs), 32 | fill="orangered") + 33 | geom_boxplot(data=tmp, aes(pred_bins, obs, alpha=pred_bins), 34 | fill="grey30") + 35 | scale_alpha_manual(values=seq(0.3,1,length.out = 6)) + 36 | guides(col=F, alpha=F) + 37 | scale_y_continuous(paste0("STARR-seq enhancer activity [log2]"), 38 | # limits = lim, 
39 | breaks = seq(-12,20,2)) + 40 | scale_x_discrete(paste0("DeepSTARR predicted activity [log2]"), limits=c("Native", levels(tmp$pred_bins)), 41 | labels=c("Native", bins_labels)) + 42 | ggtitle(paste0("Synthetic enhancers (n=", nrow(tmp), ")")) + 43 | theme_bw() + 44 | theme(panel.background = element_rect(fill="white",colour="white"), panel.grid = element_blank(), axis.line=element_line(colour="black"), 45 | axis.text=element_text(size=13, colour="black"), axis.title=element_text(size=16, colour="black"), 46 | axis.title.x=element_text(margin = margin(5,0,0,0)), axis.title.y=element_text(margin = margin(0,5,0,0)), 47 | plot.title = element_text(size=18, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5)) 48 | 49 | gg_scater <- ggplot(tmp, aes(pred, obs)) + 50 | geom_point(size=1, col="grey30", aes(alpha=pred_bins)) + 51 | geom_point(data=tmp_wt, aes(1, obs), 52 | col=NA) + 53 | scale_alpha_manual(values=seq(0.4,1,length.out = 6)) + 54 | guides(col=F, alpha=F) + 55 | scale_y_continuous(paste0("STARR-seq enhancer activity [log2]"), 56 | # limits = lim, 57 | breaks = seq(-12,20,2)) + 58 | scale_x_continuous(paste0("DeepSTARR predicted activity [log2]"), 59 | # limits = lim, 60 | breaks = seq(-12,20,2)) + 61 | ggtitle(paste0("Synthetic enhancers (n=", nrow(tmp), ")")) + 62 | theme_bw() + 63 | theme(panel.background = element_rect(fill="white",colour="white"), panel.grid = element_blank(), axis.line=element_line(colour="black"), 64 | axis.text=element_text(size=13, colour="black"), axis.title=element_text(size=16, colour="black"), 65 | axis.title.x=element_text(margin = margin(5,0,0,0)), axis.title.y=element_text(margin = margin(0,5,0,0)), 66 | plot.title = element_text(size=18, hjust = 0.5, colour="black"), plot.subtitle = element_text(size=14, hjust = 0.5)) + 67 | annotate("text", x=min(tmp$pred, na.rm=T), y = max(tmp_wt$obs, na.rm=T), 68 | label = paste0("PCC: ", round(cor(tmp$obs, tmp$pred),2)), 69 | vjust=1, hjust=0, size=5) 70 | 71 | 72 | pdf("Fig7A.pdf", height = 5.5, width = 10.5) 73 | plot_grid(plotlist = list(gg_boxplot,gg_scater), nrow = 1, rel_widths = c(1,1.2), align = "h") 74 | dev.off() 75 | 76 | 77 | 78 | ######## 79 | # Fig 7B 80 | ######## 81 | 82 | # function to plot logos of enhancers 83 | library(ggseqlogo) 84 | my.logo <- function(x, cutoff=NULL){ 85 | p <- ggseqlogo(x, method='custom', seq_type='dna', ncol=1) + 86 | scale_x_continuous(breaks=seq(0,249,25), expand=c(0,0)) + 87 | scale_y_continuous(expand=c(0,0)) + 88 | theme(panel.border = element_rect(colour="black", fill=NA), 89 | axis.ticks = element_line(colour="black")) 90 | if(!is.null(cutoff)) p <- p + geom_hline(yintercept = cutoff, lty="dashed") 91 | p 92 | } 93 | 94 | # load scores 95 | twist_contr_scores <- readRDS(url("https://data.starklab.org/almeida/DeepSTARR/Figures_data/DeepSTARR_contr_scores_oligo_library.rds")) 96 | 97 | pdf("Fig7B.pdf", height = 2.1, width = 13) 98 | for(i in c("Synth_enh_dev_45", "Synth_enh_dev_53", "Synth_enh_dev_89")){ 99 | p_dev <- my.logo(twist_contr_scores$dev[[grep(paste0(i,"_"), names(twist_contr_scores$dev))]]) + ggtitle(i) + scale_y_continuous("Dev scores") 100 | print(p_dev) 101 | } 102 | dev.off() 103 | -------------------------------------------------------------------------------- /Figures/README.md: -------------------------------------------------------------------------------- 1 | # Code for Figures 2 | Scripts to reproduce each main figure can be found here and the respective processed data 
[here](https://data.starklab.org/almeida/DeepSTARR/Figures_data/). 3 | 4 | ## Questions 5 | If you have any questions or are missing any data please contact me at [bernardo.almeida94@gmail.com](mailto:bernardo.almeida94@gmail.com). 6 | -------------------------------------------------------------------------------- /GenomeWide_UMISTARRseq/README.md: -------------------------------------------------------------------------------- 1 | # Scripts for processing genome-wide UMI-STARR-seq data 2 | 3 | Main script: [UMISTARRseq_pipeline.sh](UMISTARRseq_pipeline.sh) 4 | 5 | Steps: 6 | - Map reads with bowtie and collapse by UMIs 7 | - Select reads with specific lengths - 150-250bp 8 | - Create coverage BigWig files 9 | - Call STARR-seq peaks 10 | 11 | The raw sequencing and processed data are available from GEO under accession number [GSE183939](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE183939). 12 | Genome browser tracks are available at https://genome.ucsc.edu/s/bernardo.almeida/DeepSTARR_manuscript. 13 | 14 | ## Questions 15 | If you have any questions/requests/comments please contact me at [bernardo.almeida94@gmail.com](mailto:bernardo.almeida94@gmail.com). 16 | -------------------------------------------------------------------------------- /GenomeWide_UMISTARRseq/STARRseq_UMI_collapsing.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=FALSE) 4 | options("scipen"=100, "digits"=4) 5 | 6 | ################## 7 | # OPTION PARSING 8 | ################## 9 | 10 | suppressPackageStartupMessages(library("optparse")) 11 | 12 | option_list <- list( 13 | make_option(c("-i", "--input"), action = "store", type="character", default=NULL, 14 | help="$TMP/collapsed_frags.bed", metavar="character"), 15 | make_option(c("-m", "--MM"), action = "store", type="numeric", default=1, 16 | help="$UMI_MM", metavar="character"), 17 | make_option(c("-c", "--core"), action = "store", type="numeric", default=1, 18 | help="number of cores", metavar="character"), 19 | make_option(c("-o", "--out"), action = "store", type="character", default="average.txt", 20 | help="$TMP/reads.filtered.3.bed", metavar="character") 21 | ) 22 | 23 | opt_parser <- OptionParser( 24 | usage = "%prog [options]", 25 | option_list=option_list, 26 | description = "UMI collapsing" 27 | ) 28 | arguments <- parse_args(opt_parser, positional_arguments = TRUE) 29 | opt <- arguments$options 30 | 31 | #------------ 32 | # LIBRARIES 33 | #------------ 34 | 35 | suppressPackageStartupMessages(library("rtracklayer")) 36 | suppressPackageStartupMessages(library("parallel")) 37 | suppressPackageStartupMessages(library("stringdist")) 38 | 39 | # print options 40 | cat("\nRunning UMI collapsing\n") 41 | 42 | opt 43 | 44 | #------------ 45 | # Prepare data 46 | #------------ 47 | 48 | test_big <- import.bed(opt$input) 49 | test_big$ID <- paste(test_big@seqnames, test_big@ranges@start, test_big@ranges@start+test_big@ranges@width-1, test_big@strand, sep="_") 50 | lvl <- names(sort(table(test_big$ID))) 51 | test_big$ID <- factor(test_big$ID, levels = rev(lvl)) 52 | test_big_sorted <- test_big[order(test_big$ID)] 53 | 54 | f3 = function(bar,c=1){ 55 | keep <- vector("numeric",length(bar)) 56 | names(keep) = names(bar) 57 | while (length(bar)>0) { 58 | rmv = which(stringdist(bar[1],bar,method="hamming",nthread =c)<=opt$MM) 59 | keep[names(bar)[1]] = length(rmv) 60 | bar = bar[-rmv] 61 | } 62 | return(keep) 63 | } 64 | 65 | #------------ 66 | # Run 67 | 
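# 
# A tiny illustration of f3 (hypothetical input; with opt$MM == 1): 
#   f3(c("1"="AAAA", "2"="AAAT", "3"="CCCC")) 
# returns c("1"=2, "2"=0, "3"=1): barcodes are clustered greedily by Hamming 
# distance, each kept barcode absorbing all barcodes within opt$MM mismatches; 
# zero-count entries are dropped further below (test_big_sorted$counts>0). 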
#------------ 
68 | 
69 | # Calculate the number of cores 
70 | no_cores <- opt$core 
71 | # Initiate cluster 
72 | cl <- makeCluster(no_cores) 
73 | 
74 | clusterExport(cl, "stringdist") 
75 | clusterExport(cl, "f3") 
76 | clusterExport(cl, "opt") 
77 | 
78 | results <- unlist(parLapply(cl, split(test_big_sorted$name,test_big_sorted$ID), 
79 | function(x){ 
80 | names(x) <- 1:length(x) 
81 | f3(x) 
82 | }) 
83 | ) 
84 | 
85 | # Finish 
86 | stopCluster(cl) 
87 | 
88 | test_big_sorted$counts <- as.numeric(results) 
89 | # print to inspect the top entries 
90 | test_big_sorted 
91 | test_big_sorted <- test_big_sorted[test_big_sorted$counts>0] 
92 | test_big_sorted 
93 | 
94 | out <- as.data.frame(test_big_sorted) 
95 | out$start <- out$start-1 
96 | out$name <- paste(out$name, out$score, sep="_") 
97 | 
98 | write.table(out[,c(1:3,6,9,5)], opt$out, sep="\t", row.names = F, col.names = F, quote = F) 
99 | 
100 | sessionInfo() 
101 | -------------------------------------------------------------------------------- /GenomeWide_UMISTARRseq/UMISTARRseq_pipeline.sh: -------------------------------------------------------------------------------- 
1 | ####################### 
2 | ## Process genome-wide STARR-seq sequencing data 
3 | ####################### 
4 | 
5 | folder=GenomeWide_UMISTARRseq 
6 | cd $folder 
7 | 
8 | # folder to write results 
9 | dataFolder=$folder/data 
10 | mkdir -p $dataFolder 
11 | 
12 | # wrapper to submit jobs to the cluster 
13 | bsub=bsub_gridengine 
14 | 
15 | ## NOTE ## 
16 | # The original sequencing data at our institute is provided as a BAM file containing the reads from a whole lane together with their i5 and i7 indexes. 
17 | # The script below takes this BAM file as input and requires barcodes to demultiplex the reads and map them with bowtie. 
18 | # However, we can only provide FASTQ files of the demultiplexed reads for each experiment. 
19 | # For samples with unique molecular identifiers (UMIs) at the i7 index, the UMI information is included in the read name and can be used to collapse reads with identical UMIs. 
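# To make the column extraction below concrete, a hypothetical experiment.txt 
# line could look like (only columns $2, $9, $10 and $11 are used below; the 
# remaining fields are site-specific metadata): 
#   <sample_id> <lane.bam> ... <i7_barcode_or_"UMI"> <output_name> <i5_barcode> 
#                  ($2)              ($9)               ($10)         ($11) 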
20 | 
21 | #######################
22 | ## paired-end mapping with bowtie
23 | #######################
24 | 
25 | ## make an experiment file with all sample information and respective barcodes for demultiplexing from main sequencing BAM file --> $dataFolder/experiment.txt
26 | 
27 | head $dataFolder/experiment.txt
28 | experimentFile=$dataFolder/experiment.txt
29 | 
30 | GENOME="dm3" #download genome fasta file (no_chrU_chrUextra_chrM)
31 | mkdir -p log_mapping
32 | 
33 | ALIGNER=bowtie_pe.sh
34 | 
35 | grep -v -E "^#" $experimentFile | \
36 | while read line; do
37 | INFILE=$( echo $line | awk '{print $2}' )
38 | outdir=$dataFolder
39 | BARCODES14=$( echo $line | awk '{print $9}' )
40 | BARCODES12=$( echo $line | awk '{print $11}' )
41 | OUTFILE=$( echo $line | awk '{print $10}')
42 | BARCODE14_LEN=$( echo $BARCODES14 | awk '{print length($1)}' )
43 | BARCODE12_LEN=$( echo $BARCODES12 | awk '{print length($1)}' )
44 | 
45 | # with UMIs
46 | if [ "$BARCODES14" == "UMI" ]; then
47 | BARCODE14_LEN=10 # length of UMI
48 | $bsub -o log_mapping -C 10 -T '2-00:00:00' -n "${OUTFILE}_mapping" "${ALIGNER} -i $INFILE -o ${outdir}/${OUTFILE}.bb -B $BARCODES12 -L $BARCODE12_LEN -l $BARCODE14_LEN --umi -f A -g $GENOME " > log_mapping/msg.$OUTFILE.tmp
49 | else
50 | # no UMIs
51 | $bsub -o log_mapping -C 10 -n "${OUTFILE}_mapping" "${ALIGNER} -i $INFILE -o ${outdir}/${OUTFILE}.bb -B $BARCODES12 -L $BARCODE12_LEN -b $BARCODES14 -l $BARCODE14_LEN -f A -g $GENOME " > log_mapping/msg.$OUTFILE.tmp
52 | fi
53 | 
54 | # get job IDs to wait for mapping to finish
55 | ID=$(paste log_mapping/msg.$OUTFILE.tmp|grep Submitted|awk '{print $4}')
56 | if [ "$OS_id" == "Debian" ]; then
57 | ID="${OUTFILE}_mapping"
58 | fi
59 | 
60 | # bigwig files
61 | $bsub -o log_bigwig -C 5 -d "$ID" "bigBedToBigWig.sh -i ${outdir}/${OUTFILE}.all.bb -t libSize -g $GENOME" #all mapped reads
62 | $bsub -o log_bigwig -C 5 -d "$ID" "bigBedToBigWig.sh -i ${outdir}/${OUTFILE}.UMI.bb -t libSize -g $GENOME" #UMI-collapsed reads
63 | 
64 | done
65 | 
66 | #######################
67 | ## select reads with specific lengths - 150-250bp
68 | #######################
69 | 
70 | # bigBedToBed from kent_tools
71 | 
72 | SIZES=dm3.chrom.sizes
73 | 
74 | for f in `ls ${dataFolder}/*200*bb`
75 | do echo $f
76 | bigBedToBed $f stdout | awk '{if($3-$2 >= 150 && $3-$2 <= 250)print $0}' > test.bed
77 | bedToBigBed test.bed $SIZES ${f%.bb}_cut.bb
78 | ${prog_dir}/bigBedToBigWig.sh -i ${f%.bb}_cut.bb -t libSize -g $GENOME
79 | rm test.bed
80 | done
81 | 
82 | #######################
83 | ## call STARR-seq peaks
84 | #######################
85 | 
86 | ### join replicates to call peaks
87 | for f in DSCP_200bp_gw RpS12_200bp_gw
88 | do echo $f
89 | bigBedToBed ${dataFolder}/${f}_Rep1.UMI_cut.bb rep1
90 | bigBedToBed ${dataFolder}/${f}_Rep2.UMI_cut.bb rep2
91 | cat rep1 rep2 | sort -k1,1 -k2,2n > sorted.bed
92 | bedToBigBed sorted.bed $SIZES ${dataFolder}/${f}.UMI_cut_merged.bb
93 | ${prog_dir}/bigBedToBigWig.sh -i ${dataFolder}/${f}.UMI_cut_merged.bb -t libSize -g $GENOME
94 | rm rep1
95 | rm rep2
96 | rm sorted.bed
97 | done
98 | 
99 | ### Get median fragment size
100 | rm -f ${dataFolder}/fragment_size_median.txt
101 | for f in `ls ${dataFolder}/*_cut*.bb`
102 | do echo $f
103 | size=$(bigBedToBed $f stdout | awk '{ print $3-$2+1 }' | median.R -i - )
104 | echo -e "$(basename $f)\t$size" >> ${dataFolder}/fragment_size_median.txt
105 | done
106 | 
107 | ### call peaks
108 | for f in DSCP_200bp_gw.UMI RpS12_200bp_gw.UMI
109 | do echo $f
110 | if [ "$f" == "DSCP_200bp_gw.UMI" ]; then 
Input=input_DSCP_200bp.all_cut.bb; fi
111 | if [ "$f" == "RpS12_200bp_gw.UMI" ]; then Input=input_RPS12_200bp.all_cut.bb; fi
112 | echo $Input
113 | $bsub -o log_peaks -C 5 "module load python/2.7.13-foss-2017a; module load rpy2/2.8.6-foss-2017a-python-2.7.13; \
114 | call_peaks.sh -e ${dataFolder}/${f}_cut_merged.bb -b ${dataFolder}/$Input -g ${GENOME} -M 3 -W 1000 -Z 1.67 -P 0.001"
115 | done
116 | 
-------------------------------------------------------------------------------- /GenomeWide_UMISTARRseq/bigBedToBigWig.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -o errexit
3 | set -o pipefail
4 | 
5 | # Stark Lab in-house pipeline. Adapted by Bernardo Almeida 2020
6 | 
7 | ################################################################################
8 | # Requirements
9 | ################################################################################
10 | 
11 | # Programs:
12 | # * Jim Kent's utils (http://hgwdev.cse.ucsc.edu/~kent/src/)
13 | # * bedtools (http://code.google.com/p/bedtools/)
14 | 
15 | module load stark.grp/0.0.2
16 | 
17 | ################################################################################
18 | # Set default values
19 | ################################################################################
20 | 
21 | ASSEMBLY="dm3" # Genome assembly
22 | OUTFILE="" # Outfile, if not supplied ${input%.bb}.bw
23 | EXT="0" # Fragment length of the experiment
24 | EXT2="0" # Additional extension
25 | NORM="1" # Normalize read coverage to 1 million mapped reads
26 | TYPE="libSize" # Normalization type
27 | SCALING_FILE="" # Normalize to the precalculated scaling factor
28 | STRAND="0" #make strand-specific tracks
29 | add="none"
30 | 
31 | ################################################################################
32 | # Help
33 | ################################################################################
34 | 
35 | if [ $# -eq 0 ]; then
36 | echo >&2 "
37 | $(basename $0) - Compute a BigWig file from a BigBed file of mapped reads
38 | 
39 | USAGE: $(basename $0) -i <input.bb> [OPTIONS]
40 | -i Input file (BigBed) [ required ]
41 | -o Output file (BigWig) [default: input.bw ]
42 | -g Genome assembly (e.g. dm3, hg19) [default: $ASSEMBLY]
43 | -e Extend mapped read coordinates to total length of n (off: 0) [default: $EXT ]
44 | Set to 0 for paired-end data
45 | Set to fragment size (e.g. 600) for single-end data to extend fragment coordinates
46 | -E Additional extension [default: $EXT2 ]
47 | Enables the artificial extension of reads.
48 | -t Type of normalization: library size [libSize], pre-calculated scaling factor [scaling], or
49 | none [default: $TYPE ]
50 | -n Normalize the coverage to 1 million mapped reads (0/1) [default: $NORM]
51 | -F File with precalculated scaling factor. [default: none ]
52 | File should be tab-separated and contain two columns: experimentId and its
53 | scaling factor. Experiment ID should be exactly the same as in experiment file,
54 | usually it is a field Outfile
55 | -s Make strand-specific bigWig files [0/1] [default: 0 ]
56 | -a 'Add' to the fragment start. Used for PRO-seq, where we need only start position of [default: $add]
57 | the fragment ( -a 1)
58 | 
59 | NOTES:
60 | The program requires a file with the chromosome sizes of your assembly stored
61 | here /groups/stark/genomes/chrom/ASSEMBLY.chrom.sizes.
62 | 
63 | The program uses up to 4 CPU cores, and about 2GB of RAM for a 100MB BigBed file. 
64 | 
65 | The script also requires bedtools and Jim Kent's utils (see Requirements above).
66 | "
67 | exit 1
68 | fi
69 | 
70 | ################################################################################
71 | # Parse input
72 | ################################################################################
73 | 
74 | while getopts "i:o:g:e:n:E:F:t:s:a:" o
75 | do
76 | case "$o" in
77 | i) INFILE="$OPTARG";;
78 | o) OUTFILE="$OPTARG";;
79 | g) ASSEMBLY="$OPTARG";;
80 | e) EXT="$OPTARG";;
81 | E) EXT2="$OPTARG";;
82 | t) TYPE="$OPTARG";;
83 | F) SCALING_FILE="$OPTARG";;
84 | n) NORM="$OPTARG";;
85 | s) STRAND="$OPTARG";;
86 | a) add="$OPTARG";;
87 | \?) exit 1;;
88 | esac
89 | done
90 | 
91 | ################################################################################
92 | # Set chromosome size file
93 | ################################################################################
94 | 
95 | # Set $INDEX and $SIZES
96 | if [ "$ASSEMBLY" = "dm3" ]; then
97 | INDEX=dm3_no_chrU_chrUextra_chrM/dm3
98 | SIZES=dm3.chrom.sizes
99 | else
100 | INDEX=${ASSEMBLY}/${ASSEMBLY}
101 | SIZES=${ASSEMBLY}.chrom.sizes
102 | fi
103 | 
104 | # Throw error message if chromosome sizes file does not exist
105 | if [ ! -e "$SIZES" ]; then
106 | echo >&2 "ERROR: No chromosome size file found for genome assembly ${ASSEMBLY}!"
107 | exit 1
108 | fi
109 | 
110 | 
111 | #some patch to be able to run this script for old data sets with old parameters
112 | #TYPE parameter has priority over NORM
113 | if [ "$TYPE" == "none" ]; then
114 | NORM="0"
115 | fi
116 | 
117 | #but if TYPE is not set, then NORM takes priority
118 | if [ -z "$TYPE" ]; then
119 | if [ "$NORM" = "1" ]; then
120 | TYPE="libSize"
121 | else
122 | TYPE="none"
123 | fi
124 | fi
125 | 
126 | 
127 | 
128 | ################################################################################
129 | # Run main program
130 | ################################################################################
131 | 
132 | # Get number of total fragments
133 | 
134 | N=$(bigBedInfo $INFILE | awk '(/^itemCount/){gsub(/,/,"",$2);print $2}')
135 | 
136 | #Get experiment ID from the file name
137 | 
138 | expId=$(echo $INFILE | perl -ne 'chomp($_); ($id)=$_=~/\/*([^\/.]+)\.[^\/]*bb/; print $id;')
139 | 
140 | if [ "$expId" == "" ]; then
141 | echo >&2 "Experiment ID not found!"
142 | exit 1
143 | fi
144 | 
145 | #get scaling factor
146 | if [ "$TYPE" = "scaling" ]; then
147 | if [ -z "$SCALING_FILE" ]; then
148 | echo "Scaling file is missing! "
149 | exit 1;
150 | fi
151 | 
152 | SFline=$(grep -w $expId $SCALING_FILE)
153 | if [ "$SFline" == "" ]; then
154 | echo "can't find scaling factor for sample $expId!"
155 | exit 1;
156 | fi
157 | 
158 | SF=$(grep -w $expId $SCALING_FILE| awk '{print $2}')
159 | 
160 | fi
161 | 
162 | echo "$SF"
163 | echo "normalization..."
164 | 
165 | # Get mapped read length and compute extension parameter E
166 | # Set to zero if EXT should be zero
167 | E=$(bigBedToBed $INFILE stdout -maxItems=1 | awk -v E=$EXT 'NR==1{if(E==0){print "0"}else{print E-($3-$2)}}')
168 | 
169 | # Error if extension lower than 0
170 | if [ "$E" -lt "0" ]; then
171 | echo >&2 "ERROR: Total fragment length should be larger than read length!" 
172 | exit 1
173 | fi
174 | 
175 | # Set outfile to infile.bw or to the user given name
176 | if [ "$OUTFILE" = "" ]; then
177 | OUTFILE=${INFILE%.bb}.bw
178 | else
179 | OUTFILE=$OUTFILE
180 | fi
181 | 
182 | 
183 | # Compute coverage and get BedGraph file
184 | # Convert BedGraph file to BigWig
185 | # If required extend the reads prior to computing the coverage
186 | # EXT2 is an additional extension parameter
187 | 
188 | #update 21.01.19 - sometimes we need strand-specific tracks
189 | 
190 | if [ "$STRAND" = "1" ]; then
191 | strandList=("+" "-")
192 | else
193 | strandList="none"
194 | fi
195 | 
196 | for strand in ${strandList[*]}; do
197 | echo $strand
198 | 
199 | if [ "$strand" = "+" ]; then
200 | OUTFILE=${OUTFILE%.bw}_ps.bw
201 | if [ $add != "none" ]; then
202 | bigBedToBed $INFILE stdout | awk -vOFS="\t" -vadd=$add '($6=="+"){$3=$2+add; print}' > ${OUTFILE}.bed
203 | else
204 | bigBedToBed $INFILE stdout | awk -vOFS="\t" '($6=="+"){print}' > ${OUTFILE}.bed
205 | fi
206 | elif [ "$strand" = "-" ]; then
207 | OUTFILE=${OUTFILE%_ps.bw}_ns.bw
208 | if [ $add != "none" ]; then
209 | bigBedToBed $INFILE stdout | awk -vOFS="\t" -vadd=$add '($6=="-"){$2=$3-add; print}' > ${OUTFILE}.bed
210 | else
211 | bigBedToBed $INFILE stdout | awk -vOFS="\t" '($6=="-"){ print}' > ${OUTFILE}.bed
212 | fi
213 | else
214 | bigBedToBed $INFILE ${OUTFILE}.bed
215 | fi
216 | 
217 | echo $OUTFILE
218 | cat ${OUTFILE}.bed |\
219 | if [ "$E" = "0" -a "$EXT2" = "0" ]; then
220 | cat
221 | else
222 | awk -vE=$E -vE2=$EXT2 -vC=$SIZES -vOFS="\t" '
223 | BEGIN {while(getline<C){chr[$1]=$2}}
224 | {
225 | if($6=="+"){$3=$3+E+E2}
226 | else{$2=$2-E-E2; if($2<0){$2=0}}
227 | if($3>chr[$1]){$3=chr[$1]}
228 | print $0
229 | }'
230 | fi |\
231 | genomeCoverageBed -i stdin -bg -g $SIZES |\
232 | if [ "$TYPE" = "libSize" ]; then
233 | awk -vN=$N -vOFS="\t" '{print $1,$2,$3,1e6*$4/N}'
234 | elif [ "$TYPE" = "scaling" ]; then #scaling normalization
235 | awk -vsf=$SF -vOFS="\t" '{print $1,$2,$3,$4*sf}'
236 | else #no normalization
237 | awk -vN=$N -vOFS="\t" '{print $1,$2,$3,$4}'
238 | fi | \
239 | if [ "$strand" = "-" ]; then
240 | awk -vOFS="\t" '{print $1,$2,$3,-1*$4}'
241 | else
242 | awk -vOFS="\t" '{print $1,$2,$3,$4}'
243 | fi | \
244 | 
245 | wigToBigWig stdin $SIZES $OUTFILE
246 | rm ${OUTFILE}.bed
247 | done
248 | 
249 | 
250 | # Exit
251 | exit 0
252 | 
-------------------------------------------------------------------------------- /GenomeWide_UMISTARRseq/bsub_gridengine: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ###############################################
4 | ## bsub
5 | ## mimics bsub of the LSF system for the SGE
6 | ## WARNING: options are different
7 | ##
8 | ###############################################
9 | 
10 | #TODO - add message and email parameter in case the job dies/exits unexpectedly, e.g. due to a not-enough-memory problem
11 | 
12 | #update 21/06/2018
13 | #now we have two environments - one using the gridengine submission system and another - slurm
14 | #and they are running different OS - Debian vs CentOS.
15 | #this is how I will get where I am...
16 | 
17 | #update 2019/06/17 in the new cluster (clip) this command does not work any more. 
18 | #OS_id=`lsb_release -idrc|grep Description|perl -ne 'chomp($_); ($name)=$_=~/\:\s+(\S+)\s+/; print $name."\n";'`
19 | 
20 | #update 2019/06/17
21 | # Instead Seren Uemit recommended using the $LMOD_SYSHOST system variable
22 | # So, for the moment I have 'CLIP' , 'IMPIMBA-2', or '' (empty for old cluster, including nodes stark-1 stark-2) values
23 | OS_id=$(echo $LMOD_SYSHOST)
24 | 
25 | #Debian and old cluster specific defaults
26 | #if [ "$OS_id" == "Debian" ]; then ## command was for lsb_release
27 | if [ "$OS_id" == "" ]; then
28 | # update 6.07.18 It is not safe to load this module multiple times (when the script is run from loops, for example),
29 | # so it's better to load this module separately, only once, before running any command
30 | # module load gridengine/2011.11
31 | QUEUE=public.q
32 | # centOS and new cluster specific defaults
33 | #else
34 | 
35 | fi
36 | 
37 | ## defaults
38 | SHELL=/bin/bash #/bin/sh does not inherit environment, use /bin/bash!
39 | QUEUE2=
40 | LOG=/dev/null
41 | ERRORLOG=/dev/null
42 | TMPDIR=/tmp/
43 | SYNC=no
44 | HELP=0
45 | MEMORY=0
46 | SPACE=0
47 | NAME=
48 | DEPENDS=
49 | HOSTS=""
50 | CORES=""
51 | NODES=""
52 | ARRAY=0
53 | RESTRICT=0
54 | EMAIL=""
55 | TIME=
56 | 
57 | ## parse command line parameters (-K and -h are flags and take no argument)
58 | while getopts s:q:p:o:e:m:n:t:d:H:A:R:C:KhM:N:T: o
59 | do case "$o" in
60 | s) SHELL="$OPTARG";;
61 | q) QUEUE="$OPTARG";;
62 | o) LOG="$OPTARG"; TMPDIR="$OPTARG";;
63 | e) ERRORLOG="$OPTARG";;
64 | m) MEMORY="$OPTARG";;
65 | n) NAME="$OPTARG";;
66 | t) SPACE="$OPTARG";;
67 | d) DEPENDS="$OPTARG";;
68 | C) CORES="$OPTARG";;
69 | N) NODES="$OPTARG";;
70 | H) HOSTS="$OPTARG";;
71 | A) ARRAY="$OPTARG";;
72 | R) RESTRICT="$OPTARG";;
73 | K) SYNC=yes;;
74 | h) HELP=1;;
75 | M) EMAIL="$OPTARG";;
76 | T) TIME="$OPTARG";;
77 | [?]) echo >&2 "ERROR: command line parameter not recognized."; HELP=1;;
78 | esac
79 | done
80 | 
81 | shift $(($OPTIND-1))
82 | 
83 | if [ $HELP -eq 1 ]; then
84 | echo >&2 "USAGE: $0 [OPTIONS]
85 | -n name of the job [optional]
86 | -s shell [default: $SHELL]
87 | -q queue (for gridengine only) [default: $QUEUE]
88 | -o stdout directory, also the directory where the script for submission by qsub will be written [default: $LOG]
89 | -e stderr directory [default: $ERRORLOG]
90 | -m memory requirements in gigabytes (0 for off) [default: $MEMORY]
91 | -t tmp-space requirements in gigabytes (0 for off) [default: $SPACE]
92 | -C number of cpus (cores) per task required on host (number (2) or range (2-6)) [default: off]
93 | -N number of nodes per task (number). For software that allows cross-talk between nodes.
94 | Note, that total number of CPUs will be calculated as N*C. For slurm only [default: off]
95 | -H use only indicated hosts (e.g. 
\"compute-3-*\" or \"compute-3-*|compute-4-*\") [default: off] 96 | -A this is an array job with N tasks numbered 1-N and accessible by \$SGE_TASK_ID(<2 for off) [optional] 97 | -R restrict array job such that no more than R jobs run in parallel (0 for off) [default: $RESTRICT] 98 | -K wait for the job to complete [optional] 99 | -d name of the dependent job [optional] 100 | -M email to get internal log information about job finishing 101 | -T add time parameter, for slurm in the format hh:mm:ss Example 2 days:'2-00:00:00' [default: $TIME] 102 | -h print this help 103 | " 104 | exit 1 105 | fi 106 | 107 | 108 | ## set ERRORLOG to LOG if not defined otherwise 109 | if [ $ERRORLOG = "/dev/null" ]; then 110 | ERRORLOG=$LOG 111 | fi 112 | 113 | ## test output directories 114 | if [ $LOG != "/dev/null" ]; then 115 | if [ ! -e $LOG ]; then 116 | mkdir -p $LOG 117 | else 118 | if [ ! -d $LOG ]; then 119 | echo >&2 "ERROR: $LOG exists but is not a directory" 120 | exit 0 121 | fi 122 | fi 123 | fi 124 | 125 | if [ $ERRORLOG != "/dev/null" ]; then 126 | if [ ! -e $ERRORLOG ]; then 127 | mkdir -p $ERRORLOG 128 | else 129 | if [ ! -d $ERRORLOG ]; then 130 | echo >&2 "ERROR: $ERRORLOG exists but is not a directory" 131 | exit 0 132 | fi 133 | fi 134 | fi 135 | 136 | ## get unique save temp file 137 | TMPFILE=$(mktemp -p $TMPDIR) 138 | 139 | ## write header to temp file 140 | if [ "$OS_id" == "Debian" ]; then 141 | echo "#!/bin/sh" >> $TMPFILE 142 | echo "#$ -S $SHELL" >> $TMPFILE 143 | echo "#$ -q $QUEUE" >> $TMPFILE 144 | if [ ! -z "$QUEUE2" ]; then 145 | echo "#$ -q $QUEUE2" >> $TMPFILE 146 | fi 147 | echo "#$ -cwd" >> $TMPFILE 148 | 149 | ## add job name if set 150 | if [ ! -z "$NAME" ]; then 151 | echo "#$ -N $NAME" >> $TMPFILE 152 | fi 153 | 154 | ## add memory requirements if set 155 | if [ $MEMORY != 0 ]; then 156 | echo "#$ -l vf=${MEMORY}G" >> $TMPFILE 157 | fi 158 | 159 | ## add tmp-space requirements if set 160 | if [ $SPACE != 0 ]; then 161 | echo "#$ -l tf=${SPACE}G" >> $TMPFILE 162 | fi 163 | 164 | ## add numbers of cores if set 165 | if [[ -n $CORES ]]; then 166 | echo "#$ -pe smp ${CORES}" >> $TMPFILE 167 | fi 168 | 169 | ## add host requirements if set 170 | if [[ -n $HOSTS ]]; then 171 | echo "#$ -l hostname=\"${HOSTS}\"" >> $TMPFILE 172 | fi 173 | 174 | ## add info about array job 175 | if [ $ARRAY -gt 1 ]; then 176 | echo "#$ -t 1-${ARRAY}" >> $TMPFILE 177 | fi 178 | 179 | ## add info about array job restriction 180 | if [ $RESTRICT -gt 0 ]; then 181 | echo "#$ -tc ${RESTRICT}" >> $TMPFILE 182 | fi 183 | 184 | if [ ! -z "$DEPENDS" ]; then 185 | echo "#$ -hold_jid ${DEPENDS}" >> $TMPFILE 186 | fi 187 | 188 | if [ ! -z "$TIME" ]; then 189 | echo "#$ -l walltime=${TIME}" >> $TMPFILE 190 | fi 191 | 192 | #send email at the end of the job, this includes abort 193 | if [ ! 
-z "$EMAIL" ]; then 194 | echo "#$ -M $EMAIL" >> $TMPFILE 195 | echo "#$ -m e" >> $TMPFILE 196 | fi 197 | 198 | ## write command to temp file (stdin or rest of command line) 199 | if [ $# -eq 0 ]; then 200 | cat >> $TMPFILE 201 | else 202 | echo $@ >> $TMPFILE 203 | fi 204 | 205 | ## add submit command as comment 206 | echo -e "\n# submit-command is: qsub -sync $SYNC -e $ERRORLOG -o $LOG $TMPFILE" >> $TMPFILE 207 | 208 | ## submit 209 | qsub -sync $SYNC -e $ERRORLOG -o $LOG $TMPFILE 210 | 211 | else #for slurm submission system syntax is different 212 | currDir=`pwd` 213 | echo "#!/usr/bin/env bash" >> $TMPFILE 214 | echo "#SBATCH --chdir $currDir" >> $TMPFILE 215 | # echo "#SBATCH --nodes=1" >> $TMPFILE 216 | # echo "#SBATCH --ntasks=1" >> $TMPFILE 217 | # echo "#SBATCH --cpus-per-task=1" >> $TMPFILE 218 | # echo "#SBATCH --mem-per-cpu=1G" >> $TMPFILE 219 | 220 | # echo "#SBATCH -p $QUEUE" >> $TMPFILE 221 | ## add job name if set 222 | if [ ! -z "$NAME" ]; then 223 | echo "#SBATCH --job-name $NAME" >> $TMPFILE 224 | fi 225 | 226 | ## add memory requirements if set 227 | if [ $MEMORY != 0 ]; then 228 | echo "#SBATCH --mem=${MEMORY}G" >> $TMPFILE 229 | fi 230 | 231 | ## add tmp-space requirements if set -- dont need it in the slurm system, the resources are unlimited 232 | # if [ $SPACE != 0 ]; then 233 | # echo "#SBATCH --gres tf=${SPACE}G" >> $TMPFILE 234 | # fi 235 | 236 | ## add numbers of cores if set 237 | 238 | if [[ -n $CORES ]]; then 239 | echo "#SBATCH --cpus-per-task=${CORES}" >> $TMPFILE 240 | fi 241 | 242 | if [[ -n $NODES ]]; then 243 | echo "#SBATCH --nodes=${NODES}" >> $TMPFILE 244 | fi 245 | 246 | ## add host requirements if set 247 | if [[ -n $HOSTS ]]; then 248 | echo "#SBATCH --nodelist=\"${HOSTS}\"" >> $TMPFILE 249 | fi 250 | 251 | ## add info about array job 252 | if [ $ARRAY -gt 1 ]; then 253 | addArray="" 254 | if [ $RESTRICT -gt 0 ]; then 255 | addArray="%${RESTRICT}" 256 | fi 257 | echo "#SBATCH --array=1-${ARRAY}${addArray}" >> $TMPFILE 258 | fi 259 | 260 | if [ ! -z "$DEPENDS" ]; then 261 | echo "#SBATCH --dependency=afterok:${DEPENDS}" >> $TMPFILE 262 | fi 263 | 264 | if [ ! -z "$TIME" ]; then 265 | echo "#SBATCH --time=${TIME}" >> $TMPFILE 266 | #after 4h we need medium queue, system does not resolve it automatically 267 | queueID=$(echo $TIME |perl -ne 'chomp($_); @d=split(":",$_); if($d[0]=~/(\d+)\-(\d+)/){ 268 | $days=$1; 269 | $hours=$2; 270 | $hours=~s/^0//; 271 | $h=$days*24+$hours; 272 | }else{ 273 | $h=$d[0]; 274 | $h=~s/^0//; 275 | } 276 | $qId="short"; 277 | if($h>48){ 278 | $qId='long'; 279 | }elsif($h>4){ 280 | $qId="medium"; 281 | } 282 | print $qId; 283 | ') 284 | if [ "$queueID" != "short" ]; then 285 | echo "#SBATCH --qos=$queueID" >> $TMPFILE 286 | fi 287 | fi 288 | 289 | #send email at the end of the job, this includes abort 290 | if [ ! 
-z "$EMAIL" ]; then 291 | echo "#SBATCH --mail-user=$EMAIL" >> $TMPFILE 292 | echo "#SBATCH --mail-type=FAIL" >> $TMPFILE 293 | fi 294 | 295 | # wait for a job to finish 296 | if [ "$SYNC" == "yes" ]; then 297 | echo "#SBATCH --wait" >> $TMPFILE 298 | fi 299 | 300 | ## write command to temp file (stdin or rest of command line) 301 | if [ $# -eq 0 ]; then 302 | cat >> $TMPFILE 303 | else 304 | echo $@ >> $TMPFILE 305 | fi 306 | 307 | ## add submit command as comment 308 | echo -e "\n# submit-command is: sbatch -e ${TMPFILE}.err -o ${TMPFILE}.out $TMPFILE" >> $TMPFILE 309 | 310 | ## submit 311 | echo "sbatch -e ${TMPFILE}.err -o ${TMPFILE}.out $TMPFILE" 312 | sbatch -e ${TMPFILE}.err -o ${TMPFILE}.out $TMPFILE 313 | 314 | fi #slurm 315 | 316 | ## remove tmpfile for submission (if in /tmp/ otherwise keep) 317 | if [ $TMPDIR = "/tmp/" ]; then 318 | rm $TMPFILE 319 | fi 320 | -------------------------------------------------------------------------------- /GenomeWide_UMISTARRseq/call_peaks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stark Lab in house pipeline. Adapted by Bernardo Almeida 2020 4 | 5 | ################################################################################ 6 | # Requirements 7 | ################################################################################ 8 | 9 | # Programs: 10 | # * Jim Kent's utils 11 | # * bedtools 12 | # * grep-overlap 13 | # * score-overlap 14 | # * median.R 15 | # * hyper.R 16 | # * precomp_pvals.R 17 | 18 | #update 2020/03/25 We have only one cluster now, clip, therefore we dont need other configurations 19 | module load stark.grp/0.0.2 20 | Rexec="singularity run --app Rscript /groups/stark/software-all/singularity_img/singularity.R.with_pckg.simg " 21 | 22 | 23 | # Files: 24 | # * /groups/stark/genomes/chrom/${ASSEMBLY}.chrom.sizes 25 | 26 | ################################################################################ 27 | # Set default values 28 | ################################################################################ 29 | 30 | # params to precompute min. starrs-seq count such that a sign. enrichment can be 31 | # reached given the input count and the overall library sizes 32 | # be cautious when increasing these params - this increases with (nearly) MS*MI 33 | MS=500 # precompute for starr-seq fragment count up to MS 34 | MI=200 # precompute for input fragment count up to MI 35 | 36 | WIN=500 # window to evaluate local background 37 | DST=0 # min. 
distance between peak-summits (=peak width)
38 | EXT=0 # extend fragments to that length
39 | Z=1.67 # z-score for conf-ratio
40 | PVL=0.001 # p-value cutoff for bed file
41 | LPVL=0 # loose p-value cutoff during peak calling
42 | ASSEMBLY=dm3 # assembly/chromosome sizes
43 | STRICT=0 # strict peak calling taking only the lowest abundant strand
44 | 
45 | MINENR=1 # fold enrichment cutoff for peak calling (candidates must be > MINENR enriched)
46 | 
47 | PRUNEP=1 # prune peaks with Daniel's code
48 | memory=4G
49 | #tempDir="/scratch-cbe/users/stark/pipelines/"
50 | tempDir="/tmp/"
51 | ################################################################################
52 | # Help
53 | ################################################################################
54 | 
55 | if [ $# -eq 0 ]; then
56 | echo >&2 "
57 | $(basename $0) - Call peaks
58 | 
59 | USAGE: $(basename $0) -e <experiment.bb> -b <input.bb> [OPTIONS]
60 | -e Input file for experiment (BigBed) [required]
61 | -b Input file for background/input (BigBed) [required]
62 | -o Prefix for output files (.txt and .bed) [default: prefix for experiment]
63 | -P P-value cutoff [default: $PVL]
64 | -M Min. fold enrichment cutoff [default: $MINENR]
65 | -W Window to evaluate local background [default: $WIN]
66 | -D Min. distance between peak summits (=peak width) [default $DST]
67 | 0: use median of all input fragments (e.g. for paired-end data)
68 | int: use user defined summit-distance/peak-width (e.g. for single-end data; e.g. 600)
69 | -E Extend fragments to fixed fragment length E [default: $EXT]
70 | 0: use fragment size itself (e.g. for paired-end data)
71 | int: use constant fragment length int (e.g. for single-end data)
72 | -Z Z-score for correcting enrichments [default: $Z]
73 | -g Genome assembly (e.g. dm3, hg19) [default: $ASSEMBLY]
74 | -L Loose P-value cutoff during peak-calling (OFF: 0) [default: $LPVL]
75 | -S Strict peak calling taking the lowest strand coverage [default: $STRICT]
76 | -R Prune peaks (remove peaks that are on flanks of other peaks) (0/1) [default: $PRUNEP]
77 | -m Memory restrictions for shell sort [default: $memory]
78 | 
79 | NOTES:
80 | A file with chromosome sizes must be stored here /groups/stark/genomes/chrom/ASSEMBLY.chrom.sizes.
81 | Uses 4 cores for sorting.
82 | "
83 | exit 1
84 | fi
85 | 
86 | ################################################################################
87 | # Parse input
88 | ################################################################################
89 | 
90 | while getopts "e:b:o:P:M:W:D:E:Z:g:C:L:S:R:m:" o
91 | do
92 | case "$o" in
93 | e) EF="$OPTARG";;
94 | b) BF="$OPTARG";;
95 | o) OF="$OPTARG";;
96 | P) PVL="$OPTARG";;
97 | M) MINENR="$OPTARG";;
98 | W) WIN="$OPTARG";;
99 | D) DST="$OPTARG";;
100 | E) EXT="$OPTARG";;
101 | Z) Z="$OPTARG";;
102 | g) ASSEMBLY="$OPTARG";;
103 | L) LPVL="$OPTARG";;
104 | S) STRICT="$OPTARG";;
105 | R) PRUNEP="$OPTARG";;
106 | m) memory="$OPTARG";;
107 | \?) 
exit 1;;
108 | esac
109 | done
110 | 
111 | if [ -z "$EF" -o -z "$BF" ]; then
112 | echo >&2 "ERROR: -e -b are required!"; exit 1
113 | fi
114 | 
115 | if [ ! -e "$EF" -o ! -e "$BF" ]; then
116 | echo >&2 "ERROR: input files $EF or $BF not found"; exit 1
117 | fi
118 | 
119 | # sanity check: the loose P-value cutoff (-L) must not be stricter than the P-value cutoff (-P)
120 | if [ $(awk -vP=$PVL -vL=$LPVL 'BEGIN{if(L>0 && L<P){print 1}else{print 0}}') -eq 1 ]; then
121 | echo >&2 "ERROR: loose P-value cutoff -L must not be smaller than the P-value cutoff -P!"; exit 1
122 | fi
123 | 
124 | 
125 | ## enforce a min. fold enrichment cutoff of at least 1 (disabled)
126 | 
127 | #if [ $(awk -vM=$MINENR 'BEGIN{if(M<1){print 1}else{print 0}}') -eq 1 ]; then
128 | # MINENR=1
129 | #fi
130 | 
131 | 
132 | ################################################################################
133 | # Set chromosome sizes
134 | ################################################################################
135 | 
136 | # Set $INDEX and $SIZES
137 | if [ "$ASSEMBLY" = "dm3" ]; then
138 | INDEX=/groups/stark/indices/bowtie/dm3_no_chrU_chrUextra_chrM/dm3
139 | CS=/groups/stark/genomes/chrom/dm3.chrom.sizes
140 | elif [ "$ASSEMBLY" = "ecoli" ]; then
141 | INDEX=/groups/stark/gerlach/data/ecoli/ecoli
142 | CS=/groups/stark/gerlach/data/ecoli/ecoli.chrom.sizes
143 | else
144 | INDEX=/groups/stark/indices/bowtie/${ASSEMBLY}/${ASSEMBLY}
145 | CS=/groups/stark/genomes/chrom/${ASSEMBLY}.chrom.sizes
146 | fi
147 | 
148 | # Throw error message if chromosome sizes file does not exist
149 | [ -e "$CS" ] || { echo >&2 "ERROR: No chromosome size file found for genome assembly ${ASSEMBLY}!"; exit 1; }
150 | 
151 | 
152 | ################################################################################
153 | # Set output file name, TMP directory, compute number of fragments, p-vals
154 | ################################################################################
155 | 
156 | ## Path in which the script is located
157 | DIR="$(cd "$(dirname "$0")" && pwd)"
158 | 
159 | ## If output prefix is not set, use prefix of EF
160 | if [ -z "$OF" ]; then
161 | OF=${EF%.bb}
162 | else
163 | if [ ! -d "$(dirname ${OF})" ]; then
164 | mkdir -p "$(dirname ${OF})"
165 | fi
166 | fi
167 | 
168 | ## Temporary directory
169 | TMP=$(mktemp -d -p "${tempDir}")
170 | trap "rm -rf $TMP" EXIT
171 | 
172 | ## Get total number of fragments in each file
173 | E=$(bigBedInfo $EF | awk '(/^itemCount/){gsub(/,/,"",$2);print $2}')
174 | B=$(bigBedInfo $BF | awk '(/^itemCount/){gsub(/,/,"",$2);print $2}')
175 | 
176 | ## Pre-compute cutoff values for sign. enrichment given STARR-seq coverage values 0 to MS
177 | ## (only try input coverage values 0 to MI)
178 | ## Format is STARR, Input, .., ..
179 | precomp_pvals.R -MS $MS -MI $MI -LS $E -LI $B -p $LPVL -e $MINENR -FU H > $TMP/pvalues
180 | 
181 | ## MINCOV : min. 
STARR-seq coverage for peak calling (candidates require >= MINCOV fragments)
182 | MINCOV=$(awk '{if($2==1){M=$1}} END{if(M<1){M=1} print M}' $TMP/pvalues)
183 | 
184 | ################################################################################
185 | # Extend reads if necessary
186 | ################################################################################
187 | 
188 | if [ "$EXT" = "0" ]; then
189 | # convert to bed format, do NOT extend
190 | bigBedToBed $EF stdout > $TMP/EF.bed
191 | bigBedToBed $BF stdout > $TMP/BF.bed
192 | if [ "$DST" = "0" ]; then
193 | # get median fragment length to use as distance D between summits
194 | DST=$(awk '{print $3-$2}' $TMP/BF.bed | median.R -i - | awk '{print int(($1/10)+0.5)*10}')
195 | fi
196 | else
197 | # do extend fragment size to a total size of EXT
198 | # use median fragment length (here: EXT) as distance D between summits
199 | if [ "$DST" = "0" ]; then
200 | DST=$EXT
201 | fi
202 | # redefine EXT to be the extension
203 | EXT=$(bigBedToBed $EF stdout | head -n 1 | awk -v E=$EXT '{print E-($3-$2)}')
204 | # convert to bed format, extend (clip extended fragments at the chromosome ends)
205 | bigBedToBed $EF stdout | \
206 | awk -v OFS="\t" -v E=$EXT -v C=$CS 'BEGIN{ while(getline<C){chr[$1]=$2} } { if($6=="+"){$3=$3+E; if($3>chr[$1]){$3=chr[$1]}}else{$2=$2-E;if($2<0){$2=0}} print $0 }' | \
207 | sort -k1,1 -k2,2n -k6,6 -k5,5n -k4,4 -S $memory --parallel 4 > $TMP/EF.bed
208 | bigBedToBed $BF stdout | \
209 | awk -v OFS="\t" -v E=$EXT -v C=$CS 'BEGIN{ while(getline<C){chr[$1]=$2} } { if($6=="+"){$3=$3+E; if($3>chr[$1]){$3=chr[$1]}}else{$2=$2-E;if($2<0){$2=0}} print $0 }' | \
210 | sort -k1,1 -k2,2n -k6,6 -k5,5n -k4,4 -S $memory --parallel 4 > $TMP/BF.bed
211 | fi
212 | 
213 | 
214 | 
215 | ################################################################################
216 | # Get coverage for Input & summit candidates for STARR-seq (from coverage)
217 | ################################################################################
218 | 
219 | if [ "$STRICT" = "1" ]; then
220 | ## strict counting - STARR-seq counts are corrected UP
221 | ## to the level of the strand with the least depletion
222 | 
223 | ## get strand-specific coverage files for EF and non-strand-specific for BF (bedgraph format)
224 | PIDS=""
225 | for s in "+" "-"; do
226 | cat $TMP/EF.bed | genomeCoverageBed -i stdin -bg -g $CS -strand "$s" > $TMP/EF.coverage.$s &
227 | PIDS="$PIDS $! "
228 | done
229 | cat $TMP/BF.bed | genomeCoverageBed -i stdin -bg -g $CS > $TMP/BF.coverage.b &
230 | PIDS="$PIDS $! "
231 | 
232 | wait $PIDS
233 | 
234 | ## merge strand-specific coverage files to one overall file
235 | ## -> record only region centers as summit candidates (if they pass a coverage cutoff)
236 | ## (still missing: merge touching regions with identical fragment count)
237 | bedtools unionbedg -i $TMP/EF.coverage.+ $TMP/EF.coverage.- | \
238 | awk -v OFS="\t" '{p=int(($2+1+$3)/2); S=($4<=$5 ? $4*2 : $5*2); print $1,p,p,S}' > $TMP/summit-candidates
239 | ## clean up strand-specific coverage files
240 | for s in "+" "-"; do
241 | rm $TMP/EF.coverage.$s
242 | done
243 | 
244 | else
245 | ## normal counting - sum of both strands is used, we obtain this directly...
246 | ## for STARR-seq, only region centers as summit candidates are recorded (if they pass a coverage cutoff)
247 | 
248 | PIDS=""
249 | cat $TMP/BF.bed | genomeCoverageBed -i stdin -bg -g $CS > $TMP/BF.coverage.b &
250 | PIDS="$PIDS $! "
251 | cat $TMP/EF.bed | genomeCoverageBed -i stdin -bg -g $CS | awk -vM=$MINCOV '($4>=M){p=int(($2+1+$3)/2); print $1,p,p,$4}' > $TMP/summit-candidates &
252 | PIDS="$PIDS $! "
253 | 
254 | # the coverage jobs above run in the background; 'wait' blocks until both are done
255 | wait $PIDS
256 | 
257 | fi
258 | 
259 | 
260 | ################################################################################
261 | # Filter summit candidates based on their enrichment over input
262 | ################################################################################
263 | 
264 | ## this first intersects the 1 nt long summit candidates with the input coverage as it is much faster
265 | ## filtering is based on whether the enrichment and pvalue (precomp.) cutoffs can be reached
266 | ## in a 2nd step, summit regions are expanded to +/- DST and a non-overlapping set of candidates is computed using score-overlap
267 | 
268 | awk -vCS=$CS 'BEGIN{while((getline<CS)>0){C[$1]=$2}} {if($1!=OC){OC=$1; print $1,1,C[$1],0} print $1,$2+1,$3,$4}' $TMP/BF.coverage.b | \
269 | grep-overlap -s ' ' -c 0 $TMP/summit-candidates - | awk '{k=($1" "$2); if(k!=o){ if(o!=""){if(b<1){b=1} print o,e,b } o=k } e=$4; b=$8 } END{if(b<1){b=1} print o,e,b }' | \
270 | awk -vPVS=$TMP/pvalues -vME=$MINENR -vE=$E -vB=$B -vD=$DST -vW=$WIN -vCS=$CS 'BEGIN{R=E/B; DE=1/ME; while((getline<PVS)>0){MINSTARR[$2]=$1} MAX=$2; while((getline<CS)>0){C[$1]=$2}}
271 | { P=0; if($4 in MINSTARR){ if($3>=MINSTARR[$4]){P=1} }else{ if(($3>=MINSTARR[MAX]) && (R*$4/$3<DE)){P=1} } if(P==1){ s=$2-int(D/2); if(s<1){s=1} b=$2-W; if(b<1){b=1} e=$2+W; if(e>C[$1]){e=C[$1]} print $3,$1,s,$2+int(D/2),$1,b,e,$2,$3 } }' | \
272 | score-overlap > $TMP/summits.b
273 | 
274 | 
275 | ################################################################################
276 | # Get fragment counts for summit positions
277 | ################################################################################
278 | 
279 | # Below, I use as input the max(local input at position OR average of a W window)
280 | # We don't use a pseudocount but set input to 1 if it is 0
281 | # Future: use rpy to calculate binom. p-value inside python -> should be faster
282 | 
283 | cat $TMP/BF.coverage.b | \
284 | awk -vCS=$CS 'BEGIN{while((getline<CS)>0){C[$1]=$2}} {if($1!=OC){OC=$1; print $1,1,C[$1],0} print $1,$2+1,$3,$4}' | \
285 | grep-overlap -s ' ' -c 0 $TMP/summits.b - | \
286 | awk '{k=($1" "$4);if(k!=o){if(o!=""){print o,e,b,int(s/l+0.5);s=0}o=k;l=$3-$2+1} if($9>0){s+=$12*$9} if($4>=$7 && $4<=$8){e=$5;b=$9}} END{print o,e,b,int(s/l+0.5)}' | \
287 | awk -vE=$E -vB=$B -vME=$MINENR 'BEGIN{R=E/B} {b=$4>$5?$4:$5; if(b<1){b=1} if(($3/b)/R>ME){print $3,E,b,B,$3,$3+b,E,E+B,$1,$2,$3,b,E,B,($3/b)/R}}' | \
288 | ${DIR}/conf_ratio.py -z $Z | hyper.R -i - -m 5 -n 6 -M 7 -N 8 -o TRUE -u FALSE | \
289 | cut -f 9-1000 | awk -vP=$LPVL '$9<=P' | \
290 | sort -k1,1 -k2,2n -S $memory --parallel 4 > $TMP/peaks.b
291 | 
292 | 
293 | ################################################################################
294 | # Peak pruning
295 | # Remove peaks which are on the flanks of a primary peak, i.e. whose highest
296 | # fragment count does not overlap with the summit position but is
297 | # shifted towards the ends.
298 | ################################################################################
299 | 
300 | # Print all full peaks regions, intersect with fragments, count height for each position
301 | # in the peak, report difference of original summit and new highest point,
302 | # sort by highest point and then by difference to summit position,
303 | # unique peak chr and old summit, and report the difference in bp, in %, and the new height
304 | # filter on more than 10% difference from the original summit
305 | 
306 | if [ "$PRUNEP" = "1" ]; then
307 | cat $TMP/peaks.b | \
308 | awk -vDST=$DST '{D=DST/2} {for (i=-D;i<=D;i++) {print $1,$2+i,$2+i,$2,$0}}' | \
309 | grep-overlap -c 0 -s " " - <(awk '{$2+=1; print $0}' $TMP/EF.bed) | \
310 | awk '{x[$1" "$2" "$3" "$4" "$7" "$8" "$9" "$10" "$11" "$12" "$13]++} END{for (i in x) {print x[i],i}}' | \
311 | awk 'function abs(x){return (((x < 0) ? -x : x) + 0)} {print abs($5-$3),$0}' | \
312 | sort -k2,2nr -k1,1n -S $memory --parallel 4 | \
313 | awk -vD=$DST -vOFS="\t" '!x[$3" "$6]++{print $3,$6,$7,$8,$9,$10,$11,$12,$13,$1,$1/D*100,$2}' | \
314 | sort -k8,8gr -S $memory --parallel 4 | \
315 | awk '$11<=10' | \
316 | cut -f1-9 > $OF.peaks.txt
317 | else
318 | sort -k8,8gr -S $memory --parallel 4 $TMP/peaks.b > $OF.peaks.txt
319 | fi
320 | 
321 | 
322 | ################################################################################
323 | # Filter peaks and write a BED file
324 | ################################################################################
325 | 
326 | cat $OF.peaks.txt | \
327 | awk -vD=$DST -vCS=$CS -vP=$PVL -vOFS="\t" 'BEGIN{while((getline<CS)>0){S[$1]=$2}}
328 | ($9<=P){s=$2-int(D/2)-1;if(s<0)s=0;e=$2+int(D/2);if(e>S[$1])e=S[$1];print $1,s,e,"peak_"NR,$3<=1000?$3:1000,".",($2-1>=0?$2-1:0),$2,"50,0,0"}' | \
329 | sort -k1,1 -k2,2n -S $memory --parallel 4 > $OF.peaks.bed
330 | 
331 | ################################################################################
332 | # Convert BED to BigBed
333 | ################################################################################
334 | 
335 | bedToBigBed $OF.peaks.bed $CS $OF.peaks.bb 2>/dev/null
336 | 
337 | # Exit
338 | rm -rf ${TMP}
339 | 
340 | exit 0
341 | 
-------------------------------------------------------------------------------- /GenomeWide_UMISTARRseq/slippage_filter_pe.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -o errexit
3 | set -o pipefail
4 | 
5 | # Stark Lab in-house pipeline. 
Adapted by Bernardo Almeida 2020
6 | 
7 | ################################################################################
8 | # Set default values
9 | ################################################################################
10 | 
11 | MM="2"
12 | L="10"
13 | OFFSET="1"
14 | D="2000"
15 | memory=4G #memory restrictions for shell sort
16 | 
17 | ################################################################################
18 | # Help
19 | ################################################################################
20 | 
21 | if [ $# -eq 0 ]; then
22 | echo >&2 "
23 | $(basename $0) - Filter putative slippage fragments in paired-end data
24 | 
25 | USAGE: cat file.bed | $(basename $0) [OPTIONS]
26 | -m Maximum number of mismatches for sequence comparison [default: $MM]
27 | -l Sequence length for comparison [default: $L]
28 | -o Offset for sequence to compare [default: $OFFSET]
29 | -d Maximum distance for filtering within a cluster [default: $D]
30 | -M Memory restrictions for shell sort [default: $memory]
31 | 
32 | "
33 | exit 1
34 | fi
35 | 
36 | ################################################################################
37 | # Parse input
38 | ################################################################################
39 | 
40 | while getopts "m:l:o:d:M:" o
41 | do
42 | case "$o" in
43 | m) MM="$OPTARG";;
44 | l) L="$OPTARG";;
45 | o) OFFSET="$OPTARG";;
46 | d) D="$OPTARG";;
47 | M) memory="$OPTARG";;
48 | \?) exit 1;;
49 | esac
50 | done
51 | 
52 | ################################################################################
53 | # Run program
54 | ###############################################################################
55 | 
56 | # Sort by strand, chr, start (lower on top), end (higher on top), and mismatches
57 | # The longest fragment always appears first
58 | 
59 | sort -k6,6 -k1,1 -k2,2n -k3,3nr -k5,5n -S $memory| \
60 | awk -vD=$D -vL=$L -vMM=$MM -vO=$OFFSET '
61 | function dist(a,b, i,sa,sb,m) # number of mismatches between the first L bases of two sequences (Ns are ignored)
62 | {
63 | m=0
64 | for(i=1;i<=L;i++){
65 | sa=substr(a,i,1);sb=substr(b,i,1)
66 | if(sa!=sb && sa!="N" && sb!="N"){m++}
67 | }
68 | return m
69 | }
70 | {split($4,seq,"_"); ok=1}
71 | (NR>1 && $6==strand && $1==chr) {
72 | if($2==start && $3>=end-D && $3<=end && strand=="+"){
73 | SEQS[seq_r]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[2],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
74 | }
75 | else if($2==start && $3>=end-D && $3<=end && strand=="-"){
76 | SEQS[seq_l]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[1],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
77 | }
78 | else if($2<=start+D && $2>start && $3==end && strand=="+"){
79 | SEQS[seq_l]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[1],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
80 | }
81 | else if($2<=start+D && $2>start && $3==end && strand=="-"){
82 | SEQS[seq_r]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[2],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
83 | }
84 | else{
85 | for (SEQ in SEQS){delete SEQS[SEQ]}
86 | }
87 | }
88 | (NR>1 && ($6!=strand || $1!=chr)){for (SEQ in SEQS){delete SEQS[SEQ]}}
89 | {chr=$1;start=$2;end=$3;strand=$6;seq_l=substr(seq[1],O,L);seq_r=substr(seq[2],O,L)}
90 | {if(ok){print $0}}' | \
91 | sort -k1,1 -k2,2n -k3,3nr -k6,6 -k5,5n -k4,4 -S $memory| \
92 | awk '!x[$1" "$2" "$3" "$6]++'
93 | 
94 | exit 0
95 | 
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2021 Bernardo P. 
de Almeida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Oligo_UMISTARRseq/Drosophila_oligo_library_processing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Drosophila oligo library processing" 3 | author: "Bernardo Almeida" 4 | output: html_document 5 | --- 6 | 7 | ```{r set working directory} 8 | 9 | setwd("Oligo_UMISTARRseq") 10 | 11 | library(BSgenome.Dmelanogaster.UCSC.dm3) 12 | library(GenomicRanges) 13 | library(rtracklayer) 14 | library(dplyr) 15 | library(patchwork) 16 | library(ggplot2) 17 | 18 | library(ggseqlogo) 19 | library(cowplot) 20 | library(TFBSTools) 21 | library(ggpointdensity) 22 | 23 | ``` 24 | 25 | # Load sequencing data and process the data 26 | 27 | ```{r Load oligo info and experiment tables} 28 | 29 | Oligo_library_metadata <- read.delim("Drosophila_oligo_library_oligo_info.txt") 30 | 31 | # experiment tables 32 | experiment_table <- read.delim("Drosophila_oligo_library_experiment.txt") 33 | 34 | ``` 35 | 36 | ```{r load UMI STARR-seq and input reads} 37 | 38 | for(i in experiment_table$simple_name){ 39 | possible_types <- c("UMI", "all") 40 | 41 | for(type in possible_types){ 42 | 43 | path <- experiment_table$Outpath[experiment_table$simple_name == i] 44 | mapped <- import.bed(paste0(path, "/", experiment_table$Outfile[experiment_table$simple_name %in% i], ".", type, ".bed")) 45 | 46 | # choose sequences with correct length 47 | mapped_correct_length <- mapped[width(mapped)==249 & mapped@strand %in% "+"] 48 | 49 | # add counts info to Oligo_library_metadata table 50 | Oligo_library_metadata <- merge(Oligo_library_metadata, as.data.frame(table(mapped_correct_length@seqnames)), by=1, all.x=T) 51 | names(Oligo_library_metadata)[ncol(Oligo_library_metadata)] <- paste0(i, "_", type) 52 | 53 | print(paste0(i, "_", type)) 54 | 55 | } 56 | } 57 | 58 | # correct NAs to 0 counts 59 | Oligo_library_metadata[14:ncol(Oligo_library_metadata)][is.na(Oligo_library_metadata[14:ncol(Oligo_library_metadata)])] <- 0 60 | 61 | write.table(Oligo_library_metadata, paste0("Drosophila_oligo_library_Oligo_counts.txt"), sep="\t", quote=F, row.names = F) 62 | 63 | ``` 64 | 65 | ### Input quality 66 | 67 | ```{r plot distribution of sequences} 68 | 69 | Oligo_library_metadata <- read.delim("Drosophila_oligo_library_Oligo_counts.txt") 70 | 71 | pdf("Drosophila_oligo_library_Distribution.of.fragment.counts.pdf", 
width = 7, height = 4.5)
72 | 
73 | for(i in names(Oligo_library_metadata)[14:ncol(Oligo_library_metadata)]){
74 | 
75 | nr.reads.per.variant <- Oligo_library_metadata[[i]]
76 | 
77 | expected_total_oligos=nrow(Oligo_library_metadata)
78 | 
79 | if(length(grep("STARR", i))==0) position="topleft"
80 | if(length(grep("STARR", i))>0) position="topright"
81 | 
82 | plot(density(log10(nr.reads.per.variant)), xlab = "log10 counts of sequenced oligos",
83 | main = gsub("_", " ", i))
84 | legend(position, legend = c(paste0("Nr of perfect reads mapped = ", round(sum(Oligo_library_metadata[[i]])/1e6,1), " M"),
85 | paste0("Nr oligos detected = ", length(Oligo_library_metadata[[i]][!Oligo_library_metadata[[i]]==0]), " (", formatC(length(Oligo_library_metadata[[i]][!Oligo_library_metadata[[i]]==0])/expected_total_oligos*100, format = "f", digits = 2), "%)"),
86 | paste0("Mean = ", round(mean(nr.reads.per.variant))),
87 | paste0("Median = ", median(nr.reads.per.variant))),
88 | text.col = c("black", "black", "tomato3", "steelblue3"), bty = "n",
89 | cex=0.85)
90 | abline(v = c(log10(median(nr.reads.per.variant)), log10(mean(nr.reads.per.variant))), col = c("steelblue3", "tomato3"), lwd = 1)
91 | 
92 | # make main plot for inputs to compare the different libraries
93 | if(length(grep("input", i))>0){
94 | gg <- ggplot() +
95 | geom_density(data=Oligo_library_metadata,
96 | aes(x=Oligo_library_metadata[[i]]),
97 | fill="grey60") +
98 | geom_density(data=Oligo_library_metadata,
99 | aes(x=Oligo_library_metadata[[i]],
100 | fill = Experiment %in% "wt"), alpha = 0.3) +
101 | scale_fill_brewer("", palette = "Dark2",
102 | labels=c("FALSE"="Twist mut",
103 | "TRUE"="Twist wt")) +
104 | scale_x_log10("log10 counts of sequenced oligos")+
105 | scale_y_continuous(expand = c(0,0)) +
106 | geom_vline(xintercept = median(nr.reads.per.variant), col = "black", linetype="dashed") +
107 | ggtitle(gsub("_", " ", i)) +
108 | theme_light(base_size = 15)
109 | 
110 | print(gg)
111 | }
112 | 
113 | print(i)
114 | }
115 | 
116 | dev.off()
117 | 
118 | ```
119 | 
120 | ```{r How consistent are the missing oligos between inputs, replicates?}
121 | 
122 | Oligo_library_metadata <- read.delim("Drosophila_oligo_library_Oligo_counts.txt")
123 | 
124 | # barplot of oligos with 0 counts in x, y or xy
125 | library(gridExtra)
126 | plot_list_tmp <- list()
127 | for(id in 1:4){
128 | if(id==1){
129 | a="input_dev_rep1_UMI"
130 | b="input_dev_rep2_UMI"
131 | main="dev input replicates"
132 | }
133 | if(id==2){
134 | a="input_hk_rep1_UMI"
135 | b="input_hk_rep2_UMI"
136 | main="hk input replicates"
137 | }
138 | if(id==3){
139 | a="input_dev_rep1_UMI"
140 | b="input_hk_rep1_UMI"
141 | main="dev & hk inputs rep1"
142 | }
143 | if(id==4){
144 | a="input_dev_rep2_UMI"
145 | b="input_hk_rep2_UMI"
146 | main="dev & hk inputs rep2"
147 | }
148 | 
149 | t <- table(Oligo_library_metadata[,a]>0,
150 | Oligo_library_metadata[,b]>0)
151 | 
152 | df <- as.data.frame(t)[-4,]
153 | df$ID <- c("Both",
154 | gsub("_", " ", substr(a, 1, nchar(a)-4)),
155 | gsub("_", " ", substr(b, 1, nchar(b)-4)))
156 | df$ID <- factor(df$ID, levels=c(gsub("_", " ", substr(a, 1, nchar(a)-4)),
157 | gsub("_", " ", substr(b, 1, nchar(b)-4)),
158 | "Both"))
159 | 
160 | p<-ggplot(data=df, aes(x=ID, y=Freq, fill=ID)) +
161 | geom_bar(stat="identity", width=0.7) +
162 | scale_y_continuous("# missing oligos (out of 58k)", breaks = seq(0,1000,100)) +
163 | xlab("Sample") +
164 | guides(fill=F) +
165 | scale_fill_manual(values=c("grey70", "grey70", "grey30")) +
166 | ggtitle(main) + 167 | 
theme_bw(base_size = 11) + 168 | theme(plot.title = element_text(hjust = 0.5), 169 | axis.text.x = element_text(size = 11), 170 | axis.text.y = element_text(size = 10), 171 | axis.title.x= element_text(margin = margin(t = 7, r = 0, b = 0, l = 0)), 172 | axis.title = element_text(size = 15)) 173 | 174 | plot_list_tmp[[id]] = ggplotGrob(p) 175 | 176 | } 177 | 178 | # multiplot 179 | pdf("Drosophila_oligo_library_Consistency_of_missing_oligos_between_inputs_replicates.pdf", 180 | width = 9.5, height = 10) 181 | print(gridExtra::grid.arrange(grobs = plot_list_tmp, ncol = 2)) 182 | dev.off() 183 | 184 | ``` 185 | 186 | ### Check UMI collapsing rate 187 | 188 | ```{r Compare UMI counts with all reads} 189 | 190 | plot_list_a = list() 191 | for(i in experiment_table$simple_name){ 192 | 193 | a <- paste0(i,"_UMI") 194 | b <- paste0(i,"_all") 195 | 196 | p <- ggplot(data = Oligo_library_metadata, aes_string(Oligo_library_metadata[,a], Oligo_library_metadata[,b])) + 197 | geom_abline(intercept = 0) + 198 | geom_point(size = 0.5) + 199 | guides(alpha="none", fill="none")+ 200 | xlab(gsub("_", " ", a)) + 201 | ylab(gsub("_", " ", b)) + 202 | ggtitle(gsub("_", " ", i)) + 203 | theme_bw() 204 | 205 | plot_list_a[[i]] = p 206 | 207 | } 208 | 209 | pdf("Drosophila_oligo_library_Collapsing_UMI_vs_all_reads.pdf", width = 12, height = 12) 210 | cowplot::plot_grid(plotlist = plot_list_a, ncol = 3) 211 | dev.off() 212 | 213 | ``` 214 | 215 | ### Compare replicates 216 | 217 | ```{r Compare replicates} 218 | 219 | library(ggpointdensity) 220 | 221 | Oligo_library_metadata <- read.delim("Drosophila_oligo_library_Oligo_counts.txt") 222 | 223 | pdf(paste0("Drosophila_oligo_library_Replicate_correlations.pdf"), width = 15, height = 4.5) 224 | 225 | for(f in c("UMI")){ 226 | 227 | plot_list_tmp = list() 228 | 229 | if(f=="UMI") Oligo_counts <- Oligo_library_metadata[,grep("rep._UMI", names(Oligo_library_metadata))] 230 | if(f=="all") Oligo_counts <- Oligo_library_metadata[,grep("rep._all", names(Oligo_library_metadata))] 231 | 232 | # normalise to 1 million mapped fragments 233 | Counts_per_million_cpm <- as.data.frame(apply(Oligo_counts, 2, function(x) x/sum(x)*1e6)) 234 | 235 | for(id in c("input_dev_rep", "STARRseq_dev_rep", "input_hk_rep", "STARRseq_hk_rep")){ 236 | if(id=="STARRseq_dev_rep") t="Dev STARR-seq" 237 | if(id=="STARRseq_hk_rep") t="Hk STARR-seq" 238 | if(id=="input_dev_rep") t="Dev input" 239 | if(id=="input_hk_rep") t="Hk input" 240 | 241 | df_tmp <- Counts_per_million_cpm[,grep(id, names(Counts_per_million_cpm))] 242 | 243 | comparison_list <- list(a_b=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]], 244 | names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]]), 245 | a_c=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]], 246 | names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]]), 247 | b_c=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]], 248 | names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]])) 249 | 250 | if(length(grep(id, names(Counts_per_million_cpm)))==4) comparison_list[["a_d"]] <- c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]], 251 | names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[4]]) 252 | 253 | plot_list_tmp <- lapply(comparison_list, function(x){ 254 | 255 | a <- x[1] 256 | b <- x[2] 257 | 258 | # PCC 259 | pc <- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]), 260 | 
log10(df_tmp[apply(df_tmp,1,min)>0,b]), 261 | method = "pearson") 262 | 263 | if(length(grep("dev", id))>0) my_col=c("orangered","orangered4") else{my_col=c("dodgerblue","dodgerblue4")} 264 | 265 | # plot 266 | scater <- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) + 267 | geom_pointdensity(adjust = 0.4, size=0.4) + 268 | scale_color_gradient(low = my_col[1], high = my_col[2]) + 269 | scale_x_log10(gsub("_", " ", a), 270 | limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)), 271 | breaks=c(0,1,10,100,1000, 10000)) + 272 | scale_y_log10(gsub("_", " ", b), 273 | limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)), 274 | breaks=c(0,1,10,100,1000, 10000)) + 275 | guides(color=F) + 276 | theme_bw(base_size = 16) + 277 | theme(panel.grid = element_blank(), 278 | axis.text = element_text(colour="black"), 279 | plot.title = element_text(hjust=0.5), 280 | plot.margin = margin(0.5, 1, 0.5, 0.5, "cm")) + 281 | annotate("text", x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5) 282 | 283 | return(ggplotGrob(scater)) 284 | 285 | }) 286 | 287 | # multiplot 288 | print(gridExtra::grid.arrange(grobs = plot_list_tmp, nrow = 1)) 289 | 290 | } 291 | 292 | } 293 | 294 | dev.off() 295 | 296 | ``` 297 | 298 | ### Calculate activity of each oligo with DESeq2 299 | 300 | ```{r Twist oligo fold-change to input with DESeq2} 301 | 302 | Count_table <- read.delim("Drosophila_oligo_library_Oligo_counts.txt") 303 | Count_table <- Count_table[,c(1:13,grep("UMI", names(Count_table)))] 304 | rownames(Count_table) <- Count_table$Oligo_ID 305 | 306 | 307 | # http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html 308 | library(DESeq2) 309 | Count_table_final <- Count_table 310 | for(e in c("dev", "hk")){ 311 | 312 | # only sequences with at least 10 reads in all inputs 313 | Count_table_2 <- Count_table[rowSums(Count_table[,grep(paste0("input_", e), names(Count_table))]<10)==0,] 314 | 315 | cts <- Count_table_2[,grep(e, names(Count_table_2), ignore.case = T)] 316 | rownames(cts) <- Count_table_2$Oligo_ID 317 | 318 | # added one read pseudocount to oligos with zero RNA counts 319 | cts[cts==0] <- 1 320 | 321 | # design 322 | coldata <- data.frame(type=factor(rep(c("Input", "Experiment"),each=3),levels=c("Input", "Experiment")), 323 | row.names = names(cts)) 324 | 325 | if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){ 326 | print("Input in design matrix does not match input samples") 327 | break 328 | } 329 | if (!all(rownames(coldata) %in% colnames(cts))){ 330 | print("Rownames do not match colnames") 331 | break 332 | } 333 | if (!all(rownames(coldata) == colnames(cts))){ 334 | print("Rownames do not match colnames") 335 | break 336 | } 337 | 338 | dds <- DESeqDataSetFromMatrix(countData = as.matrix(cts), 339 | colData = coldata, 340 | design= ~ type) 341 | 342 | # counts of wildtype negative regions in each library as scaling factors between samples 343 | sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("_wt_NegativeRegions", rownames(cts)),])) 344 | dds <- DESeq(dds) 345 | #resultsNames(dds) # lists the coefficients 346 | 347 | pdf(paste0("Drosophila_oligo_library_Twist_oligo_FC_DESeq2_",e,".pdf")) 348 | 349 | # plots quality control 350 | plotDispEsts(dds) 351 | 352 | # plot normal FC 353 | res <- results(dds, alpha=0.05) 354 | summary(res) 355 | DESeq2::plotMA(res) 356 | mcols(res)$description 357 | 358 | 
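  # Optional sanity check (a sketch, not part of the original analysis): since the
  # wildtype negative regions were used to estimate the size factors above, their
  # fold-changes should be centred around log2FC ~ 0 after normalisation:
  # summary(res$log2FoldChange[grep("_wt_NegativeRegions", rownames(res))])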
boxplot(res$log2FoldChange~Count_table_2$Enhancer_type)
359 | abline(h=0)
360 | 
361 | # plot merged RNA vs merged DNA
362 | # this gives log2(n + 1)
363 | ntd <- as.data.frame(assay(normTransform(dds)))
364 | ntd$input_mean <- rowMeans(ntd[,1:3]) # average of the three input replicates
365 | ntd$experiment_mean <- rowMeans(ntd[,4:6]) # average of the three STARR-seq replicates
366 | plot(ntd$input_mean, ntd$experiment_mean, col=c("black", "red")[factor(res$padj<0.05)])
367 | abline(0,1)
368 | 
369 | dev.off()
370 | 
371 | # merge with main table
372 | tmp <- as.data.frame(res)[,c(1,2,5,6)]
373 | names(tmp) <- paste0(e,"_",names(tmp))
374 | Count_table_final <- merge(Count_table_final, tmp, by.x=1, by.y=0, all.x=T)
375 | 
376 | print(e)
377 | }
378 | 
379 | write.table(Count_table_final, "Drosophila_oligo_library_final_table_all_oligos.txt", sep="\t", quote=F, row.names = F)
380 | 
381 | # remove oligos with no activity estimate (NA) in both the dev and hk screens
382 | Count_table_final <- Count_table_final[!(!complete.cases(Count_table_final$dev_log2FoldChange) & !complete.cases(Count_table_final$hk_log2FoldChange)),]
383 | 
384 | write.table(Count_table_final, "Drosophila_oligo_library_final_table.txt", sep="\t", quote=F, row.names = F)
385 | 
386 | ```
387 | 
-------------------------------------------------------------------------------- /Oligo_UMISTARRseq/Human_oligo_library_processing.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Human oligo library processing"
3 | author: "Bernardo Almeida"
4 | output: html_document
5 | ---
6 | 
7 | ```{r set working directory}
8 | 
9 | setwd("Oligo_UMISTARRseq")
10 | 
11 | library(BSgenome.Hsapiens.UCSC.hg19)
12 | library(GenomicRanges)
13 | library(rtracklayer)
14 | library(dplyr)
15 | library(patchwork)
16 | library(ggplot2)
17 | 
18 | library(ggseqlogo)
19 | library(cowplot)
20 | library(TFBSTools)
21 | library(ggpointdensity)
22 | 
23 | ```
24 | 
25 | # Load sequencing data and process the data
26 | 
27 | ```{r Load oligo info and experiment tables}
28 | 
29 | Human_oligo_library_metadata <- read.delim("Human_oligo_library_oligo_info.txt")
30 | 
31 | # experiment tables
32 | experiment_table <- read.delim("Human_oligo_library_experiment.txt")
33 | 
34 | ```
35 | 
36 | ```{r load UMI STARR-seq and input reads}
37 | 
38 | for(i in experiment_table$simple_name){
39 | possible_types <- c("UMI", "all")
40 | 
41 | for(type in possible_types){
42 | 
43 | path <- paste0(experiment_table$path[experiment_table$simple_name %in% i], "data/")
44 | mapped <- import.bed(paste0(path, experiment_table$Outfile[experiment_table$simple_name %in% i], ".", type, ".bed"))
45 | 
46 | # choose sequences with correct length
47 | mapped_correct_length <- mapped[width(mapped)==249 & mapped@strand %in% "+"]
48 | 
49 | # add counts info to Human_oligo_library_metadata table
50 | Human_oligo_library_metadata <- merge(Human_oligo_library_metadata, as.data.frame(table(mapped_correct_length@seqnames)), by=1, all.x=T)
51 | names(Human_oligo_library_metadata)[ncol(Human_oligo_library_metadata)] <- paste0(i, "_", type)
52 | 
53 | print(paste0(i, "_", type))
54 | 
55 | }
56 | }
57 | 
58 | # correct NAs to 0 counts
59 | Human_oligo_library_metadata[18:ncol(Human_oligo_library_metadata)][is.na(Human_oligo_library_metadata[18:ncol(Human_oligo_library_metadata)])] <- 0
60 | 
61 | write.table(Human_oligo_library_metadata, paste0("Human_oligo_library_counts.txt"), sep="\t", quote=F, row.names = F)
62 | 
63 | ```
64 | 
65 | ## Input quality
66 | 
67 | ```{r plot distribution of sequences}
68 | 
69 | Human_oligo_library_metadata <- 
70 | 
71 | pdf("Human_oligo_library_Distribution.of.fragment.counts.pdf", width = 7, height = 4.5)
72 | 
73 | for(i in names(Human_oligo_library_metadata)[18:ncol(Human_oligo_library_metadata)]){
74 | 
75 | nr.reads.per.variant <- Human_oligo_library_metadata[[i]]
76 | 
77 | expected_total_oligos=nrow(Human_oligo_library_metadata)
78 | 
79 | if(length(grep("STARR", i))==0) position="topleft"
80 | if(length(grep("STARR", i))>0) position="topright"
81 | 
82 | plot(density(log10(nr.reads.per.variant)), xlab = "log10 counts of sequenced oligos",
83 | main = gsub("_", " ", i))
84 | legend(position, legend = c(paste0("Nr of perfect reads mapped = ", round(sum(Human_oligo_library_metadata[[i]])/1e6,1), " M"),
85 | paste0("Nr oligos detected = ", length(Human_oligo_library_metadata[[i]][!Human_oligo_library_metadata[[i]]==0]), " (", formatC(length(Human_oligo_library_metadata[[i]][!Human_oligo_library_metadata[[i]]==0])/expected_total_oligos*100, format = "f", digits = 2), "%)"),
86 | paste0("Mean = ", round(mean(nr.reads.per.variant))),
87 | paste0("Median = ", median(nr.reads.per.variant))),
88 | text.col = c("black", "black", "tomato3", "steelblue3"), bty = "n",
89 | cex=0.85)
90 | abline(v = c(log10(median(nr.reads.per.variant)), log10(mean(nr.reads.per.variant))), col = c("steelblue3", "tomato3"), lwd = 1)
91 | 
92 | # make main plot for inputs to compare the different libraries
93 | if(length(grep("input", i))>0){
94 | gg <- ggplot() +
95 | geom_density(data=Human_oligo_library_metadata,
96 | aes(x=Human_oligo_library_metadata[[i]]),
97 | fill="grey60") +
98 | scale_fill_brewer("", palette = "Dark2",
99 | labels=c("FALSE"="Twist mut",
100 | "TRUE"="Twist wt")) +
101 | scale_x_log10("log10 counts of sequenced oligos")+
102 | scale_y_continuous(expand = c(0,0)) +
103 | geom_vline(xintercept = median(nr.reads.per.variant), col = "black", linetype="dashed") +
104 | ggtitle(gsub("_", " ", i)) +
105 | theme_light(base_size = 15)
106 | 
107 | print(gg)
108 | }
109 | 
110 | print(i)
111 | }
112 | 
113 | dev.off()
114 | 
115 | ```
116 | 
117 | ```{r How consistent are the missing oligos between inputs, replicates?}
118 | 
119 | Human_oligo_library_metadata <- read.delim("Human_oligo_library_counts.txt")
120 | 
121 | # barplot of oligos with 0 counts in one sample, the other, or both
122 | library(gridExtra)
123 | plot_list_tmp <- list()
124 | for(id in 1){
125 | if(id==1){
126 | a="input_rep1_UMI"
127 | b="input_rep2_UMI"
128 | main="Input replicates"
129 | }
130 | 
131 | t <- table(Human_oligo_library_metadata[,a]>0,
132 | Human_oligo_library_metadata[,b]>0)
133 | 
134 | df <- as.data.frame(t)[-4,]
135 | df$ID <- c("Both",
136 | gsub("_", " ", substr(a, 1, nchar(a)-4)),
137 | gsub("_", " ", substr(b, 1, nchar(b)-4)))
138 | df$ID <- factor(df$ID, levels=c(gsub("_", " ", substr(a, 1, nchar(a)-4)),
139 | gsub("_", " ", substr(b, 1, nchar(b)-4)),
140 | "Both"))
141 | 
142 | p<-ggplot(data=df, aes(x=ID, y=Freq, fill=ID)) +
143 | geom_bar(stat="identity", width=0.7) +
144 | scale_y_continuous("# oligos (out of 23k)", breaks = seq(0,1000,100)) +
145 | xlab("Sample") +
146 | guides(fill=F) +
147 | scale_fill_manual(values=c("grey70", "grey70", "grey30")) +
148 | ggtitle(main) +
149 | theme_bw(base_size = 11) +
150 | theme(plot.title = element_text(hjust = 0.5),
151 | axis.text = element_text(size = 11),
152 | axis.title.x= element_text(margin = margin(t = 7, r = 0, b = 0, l = 0)),
153 | axis.title = element_text(size = 15))
154 | 
155 | plot_list_tmp[[id]] = ggplotGrob(p)
156 | 
157 | }
158 | 
159 | #
multiplot 160 | pdf("Human_oligo_library_Consistency_of_missing_oligos_between_inputs_replicates.pdf", 161 | width = 5, height = 5) 162 | print(gridExtra::grid.arrange(grobs = plot_list_tmp)) 163 | dev.off() 164 | 165 | ``` 166 | 167 | ## Check UMI collapsing rate 168 | 169 | ```{r Compare UMI counts with all reads} 170 | 171 | plot_list_a = list() 172 | for(i in experiment_table$simple_name){ 173 | 174 | a <- paste0(i,"_UMI") 175 | b <- paste0(i,"_all") 176 | 177 | p <- ggplot(data = Human_oligo_library_metadata, aes_string(Human_oligo_library_metadata[,a], Human_oligo_library_metadata[,b])) + 178 | geom_abline(intercept = 0) + 179 | geom_point(size = 0.5) + 180 | #stat_bkde2d(aes(fill=..level..), geom="polygon")+ 181 | guides(alpha="none", fill="none")+ 182 | #scale_fill_viridis() + 183 | #scale_x_continuous(limits = c(0,max(Merged_table[,grep("UMI", names(Merged_table))]))) + 184 | #scale_y_continuous(limits = c(0,max(Merged_table[,grep("all", names(Merged_table))]))) + 185 | xlab(gsub("_", " ", a)) + 186 | ylab(gsub("_", " ", b)) + 187 | ggtitle(gsub("_", " ", i)) + 188 | theme_bw() 189 | 190 | plot_list_a[[i]] = p 191 | 192 | } 193 | 194 | pdf("Human_oligo_library_Collapsing_UMI_vs_all_reads.pdf", width = 7, height = 10) 195 | cowplot::plot_grid(plotlist = plot_list_a, ncol = 2) 196 | dev.off() 197 | 198 | ``` 199 | 200 | ## Compare replicates 201 | 202 | ```{r Compare replicates} 203 | 204 | library(ggpointdensity) 205 | 206 | Human_oligo_library_metadata <- read.delim("Human_oligo_library_counts.txt") 207 | 208 | pdf(paste0("Human_oligo_library_Replicate_correlations.pdf"), width = 10, height = 9.5) 209 | 210 | for(f in c("UMI", "all")){ 211 | 212 | plot_list_tmp = list() 213 | 214 | if(f=="UMI") Oligo_counts <- Human_oligo_library_metadata[,grep("rep._UMI", names(Human_oligo_library_metadata))] 215 | if(f=="all") Oligo_counts <- Human_oligo_library_metadata[,grep("rep._all", names(Human_oligo_library_metadata))] 216 | 217 | # normalise to 1 million mapped fragments 218 | Counts_per_million_cpm <- as.data.frame(apply(Oligo_counts, 2, function(x) x/sum(x)*1e6)) 219 | 220 | for(id in c("input_rep", "STARRseq_rep")){ 221 | if(id=="STARRseq_rep") t="STARR-seq" 222 | if(id=="input_rep") t="input" 223 | 224 | df_tmp <- Counts_per_million_cpm[,grep(id, names(Counts_per_million_cpm))] 225 | 226 | comparison_list <- list(a_b=c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]], 227 | names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]])) 228 | names(comparison_list) <- paste0(id, "a_b") 229 | 230 | if(length(grep(id, names(Counts_per_million_cpm)))==3){ 231 | comparison_list[[paste0(id, "a_c")]] <- c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[1]], 232 | names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]]) 233 | comparison_list[[paste0(id, "b_c")]] <- c(names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[2]], 234 | names(Counts_per_million_cpm)[grep(id, names(Counts_per_million_cpm))[3]]) 235 | } 236 | 237 | for(x in names(comparison_list)){ 238 | 239 | a <- comparison_list[[x]][1] 240 | b <- comparison_list[[x]][2] 241 | 242 | # PCC 243 | pc <- cor.test(log10(df_tmp[apply(df_tmp,1,min)>0,a]), 244 | log10(df_tmp[apply(df_tmp,1,min)>0,b]), 245 | method = "pearson") 246 | 247 | my_col=c("#737E92","#063887") 248 | 249 | # plot 250 | scater <- ggplot(df_tmp, aes(df_tmp[,a], df_tmp[,b])) + 251 | geom_pointdensity(adjust = 0.4, size=0.4) + 252 | scale_color_gradient(low = my_col[1], high = 
my_col[2]) +
253 | scale_x_log10(gsub("_", " ", a),
254 | limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
255 | breaks=c(0,1,10,100,1000)) +
256 | scale_y_log10(gsub("_", " ", b),
257 | limits=c(min(df_tmp[df_tmp!=0], na.rm=T), max(df_tmp, na.rm=T)),
258 | breaks=c(0,1,10,100,1000)) +
259 | guides(color=F) +
260 | theme_bw(base_size = 16) +
261 | theme(panel.grid = element_blank(),
262 | axis.text = element_text(colour="black"),
263 | plot.title = element_text(hjust=0.5)) +
264 | ggtitle(t) +
265 | annotate("text", x=min(df_tmp[df_tmp!=0], na.rm=T), y = max(df_tmp, na.rm=T), label = paste0("PCC: ", round(pc$estimate,2)), vjust=1, hjust=0, size=5)
266 | 
267 | plot_list_tmp[[x]] <- ggplotGrob(scater)
268 | 
269 | }
270 | 
271 | }
272 | 
273 | # multiplot
274 | print(gridExtra::grid.arrange(grobs = plot_list_tmp, ncol = 2, nrow=2))
275 | 
276 | }
277 | 
278 | dev.off()
279 | 
280 | 
281 | ```
282 | 
283 | ## Calculate activity of each oligo with DESeq2
284 | 
285 | ```{r Twist oligo fold-change to input with DESeq2}
286 | 
287 | Count_table <- read.delim("Human_oligo_library_counts.txt")
288 | Count_table <- Count_table[,c(1:17,grep("UMI", names(Count_table)))]
289 | rownames(Count_table) <- Count_table$Oligo_ID
290 | 
291 | # http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
292 | library(DESeq2)
293 | Count_table_final <- Count_table
294 | 
295 | # only sequences with at least 10 reads in both inputs
296 | table(rowSums(Count_table[,grep("input_", names(Count_table))]==0))
297 | Count_table_2 <- Count_table[rowSums(Count_table[,grep("input_", names(Count_table))]<10)==0,]
298 | 
299 | cts <- Count_table_2[,grep("input|STARR", names(Count_table_2), ignore.case = T)]
300 | rownames(cts) <- Count_table_2$Oligo_ID
301 | 
302 | # add a one-read pseudocount to oligos with zero RNA counts
303 | cts[cts==0] <- 1
304 | 
305 | # design
306 | coldata <- data.frame(type=factor(c(rep("Input",2), rep("Experiment",2)),levels=c("Input", "Experiment")),
307 | row.names = names(cts))
308 | 
309 | if (!identical(which(coldata$type=="Input"), grep("input", rownames(coldata)))){
310 | print("Input in design matrix does not match input samples")
311 | break
312 | }
313 | if (!all(rownames(coldata) %in% colnames(cts))){
314 | print("Rownames do not match colnames")
315 | break
316 | }
317 | if (!all(rownames(coldata) == colnames(cts))){
318 | print("Rownames do not match colnames")
319 | break
320 | }
321 | 
322 | dds <- DESeqDataSetFromMatrix(countData = as.matrix(cts),
323 | colData = coldata,
324 | design= ~ type)
325 | 
326 | # use counts of wildtype negative regions in each library as scaling factors between samples
327 | sizeFactors(dds)=estimateSizeFactorsForMatrix(as.matrix(cts[grep("Neg_region_", rownames(cts)),]))
328 | dds <- DESeq(dds)
329 | #resultsNames(dds) # lists the coefficients
330 | 
331 | 
332 | pdf(paste0("Human_oligo_library_Twist_oligo_FC_DESeq2.pdf"))
333 | 
334 | # quality-control plots
335 | plotDispEsts(dds)
336 | 
337 | # plot normal FC
338 | res <- results(dds, alpha=0.05)
339 | summary(res)
340 | DESeq2::plotMA(res)
341 | mcols(res)$description
342 | 
343 | # plot merged RNA vs merged DNA
344 | # this gives log2(n + 1)
345 | ntd <- as.data.frame(assay(normTransform(dds)))
346 | ntd$input_mean <- rowMeans(ntd[,grep("input", names(ntd))]) # average all input replicates
347 | ntd$experiment_mean <- rowMeans(ntd[,grep("input", names(ntd), invert=T)]) # average all STARR-seq replicates
348 | plot(ntd$input_mean, ntd$experiment_mean, col=c("black", "red")[factor(res$padj<0.05)])
349 | abline(0,1)
350 | 
351 | dev.off()
352 | 
353 | # merge with main table
354 | tmp <- as.data.frame(res)[,c(1,2,5,6)]
355 | Count_table_final <- merge(Count_table_final, tmp, by.x=1, by.y=0, all.x=T)
356 | 
357 | write.table(Count_table_final, "Human_oligo_library_final_table_all_oligos.txt", sep="\t", quote=F, row.names = F)
358 | 
359 | # remove oligos with no activity estimate (NA log2FoldChange)
360 | Count_table_final <- Count_table_final[complete.cases(Count_table_final$log2FoldChange),]
361 | 
362 | write.table(Count_table_final, "Human_oligo_library_final_table.txt", sep="\t", quote=F, row.names = F)
363 | 
364 | ```
365 | 
--------------------------------------------------------------------------------
/Oligo_UMISTARRseq/README.md:
--------------------------------------------------------------------------------
1 | # Scripts for processing oligo UMI-STARR-seq data
2 | 
3 | Pipeline for mapping reads with bowtie (no mismatches) to the reference index (oligos included in the library) and UMI collapsing: [oligo_UMISTARRseq_pipeline.sh](oligo_UMISTARRseq_pipeline.sh)
4 | 
5 | R markdowns for processing mapped reads, checking the quality of screens and replicates, and calculating the activity of each oligo with DESeq2:
6 | - Drosophila library: [Drosophila_oligo_library_processing.Rmd](Drosophila_oligo_library_processing.Rmd)
7 | - Human library: [Human_oligo_library_processing.Rmd](Human_oligo_library_processing.Rmd)
8 | 
9 | The raw sequencing and processed data are available from GEO under accession number [GSE183939](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE183939).
10 | 
11 | ## Questions
12 | If you have any questions/requests/comments please contact me at [bernardo.almeida94@gmail.com](mailto:bernardo.almeida94@gmail.com).
13 | 
--------------------------------------------------------------------------------
/Oligo_UMISTARRseq/bsub_gridengine:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ###############################################
4 | ## bsub
5 | ## mimics bsub of the LSF system for the SGE
6 | ## WARNING: options are different
7 | ##
8 | ###############################################
9 | 
10 | #TODO - add message and email parameters in case the job dies/exits unexpectedly, i.e. not enough memory problem
11 | 
12 | #update 21/06/2018
13 | #now we have two environments - one using the gridengine submission system and another - slurm
14 | #and they are running different OSes - Debian vs CentOS.
15 | #this is how I will work out where I am...
16 | 
17 | #update 2019/06/17 in the new cluster (clip) this command does not work any more.
18 | #OS_id=`lsb_release -idrc|grep Description|perl -ne 'chomp($_); ($name)=$_=~/\:\s+(\S+)\s+/; print $name."\n";'`
19 | 
20 | #update 2019/06/17
21 | # Instead, Seren Uemit has recommended using the $LMOD_SYSHOST system variable
22 | # So, for the moment I have 'CLIP', 'IMPIMBA-2', or '' (empty for old cluster, including nodes stark-1 stark-2) values
23 | OS_id=$(echo $LMOD_SYSHOST)
24 | 
25 | #Debian and old cluster specific defaults
26 | #if [ "$OS_id" == "Debian" ]; then ## command was for lsb_release
27 | if [ "$OS_id" == "" ]; then
28 | # update 6.07.18 It is not safe to load this module multiple times (when the script is run from loops, for example),
29 | # so it is better to load this module separately, only once, before running any command
30 | # module load gridengine/2011.11
31 | QUEUE=public.q
32 | # centOS and new cluster specific defaults
33 | #else
34 | 
35 | fi
36 | 
37 | ## defaults
38 | SHELL=/bin/bash #/bin/sh does not inherit environment, use /bin/bash!
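## Example invocation (illustrative only; 'my_job' and 'my_command' are placeholders,
## compare the real call in oligo_UMISTARRseq_pipeline.sh): submit a 10-core job with
## a 5 h time limit, writing logs to log_mapping/:
##   bsub_gridengine -o log_mapping -C 10 -T '5:00:00' -n my_job "my_command arg1 arg2"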
39 | QUEUE2=
40 | LOG=/dev/null
41 | ERRORLOG=/dev/null
42 | TMPDIR=/tmp/
43 | SYNC=no
44 | HELP=0
45 | MEMORY=0
46 | SPACE=0
47 | NAME=
48 | DEPENDS=
49 | HOSTS=""
50 | CORES=""
51 | NODES=""
52 | ARRAY=0
53 | RESTRICT=0
54 | EMAIL=""
55 | TIME=
56 | 
57 | ## parse command line parameters (-K and -h are flags and take no argument)
58 | while getopts s:q:p:o:e:m:n:t:d:H:A:R:C:KhM:N:T: o
59 | do case "$o" in
60 | s) SHELL="$OPTARG";;
61 | q) QUEUE="$OPTARG";;
62 | o) LOG="$OPTARG"; TMPDIR="$OPTARG";;
63 | e) ERRORLOG="$OPTARG";;
64 | m) MEMORY="$OPTARG";;
65 | n) NAME="$OPTARG";;
66 | t) SPACE="$OPTARG";;
67 | d) DEPENDS="$OPTARG";;
68 | C) CORES="$OPTARG";;
69 | N) NODES="$OPTARG";;
70 | H) HOSTS="$OPTARG";;
71 | A) ARRAY="$OPTARG";;
72 | R) RESTRICT="$OPTARG";;
73 | K) SYNC=yes;;
74 | h) HELP=1;;
75 | M) EMAIL="$OPTARG";;
76 | T) TIME="$OPTARG";;
77 | [?]) echo >&2 "ERROR: command line parameter not recognized."; HELP=1;;
78 | esac
79 | done
80 | 
81 | shift $(($OPTIND-1))
82 | 
83 | if [ $HELP -eq 1 ]; then
84 | echo >&2 "USAGE: $0 [OPTIONS]
85 | -n name of the job [optional]
86 | -s shell [default: $SHELL]
87 | -q queue (for gridengine only) [default: $QUEUE]
88 | -o stdout directory, also the directory where the script for submission by qsub will be written [default: $LOG]
89 | -e stderr directory [default: $ERRORLOG]
90 | -m memory requirements in gigabytes (0 for off) [default: $MEMORY]
91 | -t tmp-space requirements in gigabytes (0 for off) [default: $SPACE]
92 | -C number of cpus (cores) per task required on host (number (2) or range (2-6)) [default: off]
93 | -N number of nodes per task (number). For software that allows cross-talk between nodes.
94 | Note that the total number of CPUs will be calculated as N*C. For slurm only [default: off]
95 | -H use only indicated hosts (e.g. \"compute-3-*\" or \"compute-3-*|compute-4-*\") [default: off]
96 | -A this is an array job with N tasks numbered 1-N and accessible by \$SGE_TASK_ID (<2 for off) [optional]
97 | -R restrict array job such that no more than R jobs run in parallel (0 for off) [default: $RESTRICT]
98 | -K wait for the job to complete [optional]
99 | -d name of the dependent job [optional]
100 | -M email to get internal log information about job finishing
101 | -T add time parameter; for slurm, in the format hh:mm:ss. Example for 2 days: '2-00:00:00' [default: $TIME]
102 | -h print this help
103 | "
104 | exit 1
105 | fi
106 | 
107 | 
108 | ## set ERRORLOG to LOG if not defined otherwise
109 | if [ $ERRORLOG = "/dev/null" ]; then
110 | ERRORLOG=$LOG
111 | fi
112 | 
113 | ## test output directories
114 | if [ $LOG != "/dev/null" ]; then
115 | if [ ! -e $LOG ]; then
116 | mkdir -p $LOG
117 | else
118 | if [ ! -d $LOG ]; then
119 | echo >&2 "ERROR: $LOG exists but is not a directory"
120 | exit 0
121 | fi
122 | fi
123 | fi
124 | 
125 | if [ $ERRORLOG != "/dev/null" ]; then
126 | if [ ! -e $ERRORLOG ]; then
127 | mkdir -p $ERRORLOG
128 | else
129 | if [ ! -d $ERRORLOG ]; then
130 | echo >&2 "ERROR: $ERRORLOG exists but is not a directory"
131 | exit 0
132 | fi
133 | fi
134 | fi
135 | 
136 | ## get a unique safe temp file
137 | TMPFILE=$(mktemp -p $TMPDIR)
138 | 
139 | ## write header to temp file
140 | if [ "$OS_id" == "Debian" ]; then
141 | echo "#!/bin/sh" >> $TMPFILE
142 | echo "#$ -S $SHELL" >> $TMPFILE
143 | echo "#$ -q $QUEUE" >> $TMPFILE
144 | if [ ! -z "$QUEUE2" ]; then
145 | echo "#$ -q $QUEUE2" >> $TMPFILE
146 | fi
147 | echo "#$ -cwd" >> $TMPFILE
148 | 
149 | ## add job name if set
150 | if [ !
-z "$NAME" ]; then 151 | echo "#$ -N $NAME" >> $TMPFILE 152 | fi 153 | 154 | ## add memory requirements if set 155 | if [ $MEMORY != 0 ]; then 156 | echo "#$ -l vf=${MEMORY}G" >> $TMPFILE 157 | fi 158 | 159 | ## add tmp-space requirements if set 160 | if [ $SPACE != 0 ]; then 161 | echo "#$ -l tf=${SPACE}G" >> $TMPFILE 162 | fi 163 | 164 | ## add numbers of cores if set 165 | if [[ -n $CORES ]]; then 166 | echo "#$ -pe smp ${CORES}" >> $TMPFILE 167 | fi 168 | 169 | ## add host requirements if set 170 | if [[ -n $HOSTS ]]; then 171 | echo "#$ -l hostname=\"${HOSTS}\"" >> $TMPFILE 172 | fi 173 | 174 | ## add info about array job 175 | if [ $ARRAY -gt 1 ]; then 176 | echo "#$ -t 1-${ARRAY}" >> $TMPFILE 177 | fi 178 | 179 | ## add info about array job restriction 180 | if [ $RESTRICT -gt 0 ]; then 181 | echo "#$ -tc ${RESTRICT}" >> $TMPFILE 182 | fi 183 | 184 | if [ ! -z "$DEPENDS" ]; then 185 | echo "#$ -hold_jid ${DEPENDS}" >> $TMPFILE 186 | fi 187 | 188 | if [ ! -z "$TIME" ]; then 189 | echo "#$ -l walltime=${TIME}" >> $TMPFILE 190 | fi 191 | 192 | #send email at the end of the job, this includes abort 193 | if [ ! -z "$EMAIL" ]; then 194 | echo "#$ -M $EMAIL" >> $TMPFILE 195 | echo "#$ -m e" >> $TMPFILE 196 | fi 197 | 198 | ## write command to temp file (stdin or rest of command line) 199 | if [ $# -eq 0 ]; then 200 | cat >> $TMPFILE 201 | else 202 | echo $@ >> $TMPFILE 203 | fi 204 | 205 | ## add submit command as comment 206 | echo -e "\n# submit-command is: qsub -sync $SYNC -e $ERRORLOG -o $LOG $TMPFILE" >> $TMPFILE 207 | 208 | ## submit 209 | qsub -sync $SYNC -e $ERRORLOG -o $LOG $TMPFILE 210 | 211 | else #for slurm submission system syntax is different 212 | currDir=`pwd` 213 | echo "#!/usr/bin/env bash" >> $TMPFILE 214 | echo "#SBATCH --chdir $currDir" >> $TMPFILE 215 | # echo "#SBATCH --nodes=1" >> $TMPFILE 216 | # echo "#SBATCH --ntasks=1" >> $TMPFILE 217 | # echo "#SBATCH --cpus-per-task=1" >> $TMPFILE 218 | # echo "#SBATCH --mem-per-cpu=1G" >> $TMPFILE 219 | 220 | # echo "#SBATCH -p $QUEUE" >> $TMPFILE 221 | ## add job name if set 222 | if [ ! -z "$NAME" ]; then 223 | echo "#SBATCH --job-name $NAME" >> $TMPFILE 224 | fi 225 | 226 | ## add memory requirements if set 227 | if [ $MEMORY != 0 ]; then 228 | echo "#SBATCH --mem=${MEMORY}G" >> $TMPFILE 229 | fi 230 | 231 | ## add tmp-space requirements if set -- dont need it in the slurm system, the resources are unlimited 232 | # if [ $SPACE != 0 ]; then 233 | # echo "#SBATCH --gres tf=${SPACE}G" >> $TMPFILE 234 | # fi 235 | 236 | ## add numbers of cores if set 237 | 238 | if [[ -n $CORES ]]; then 239 | echo "#SBATCH --cpus-per-task=${CORES}" >> $TMPFILE 240 | fi 241 | 242 | if [[ -n $NODES ]]; then 243 | echo "#SBATCH --nodes=${NODES}" >> $TMPFILE 244 | fi 245 | 246 | ## add host requirements if set 247 | if [[ -n $HOSTS ]]; then 248 | echo "#SBATCH --nodelist=\"${HOSTS}\"" >> $TMPFILE 249 | fi 250 | 251 | ## add info about array job 252 | if [ $ARRAY -gt 1 ]; then 253 | addArray="" 254 | if [ $RESTRICT -gt 0 ]; then 255 | addArray="%${RESTRICT}" 256 | fi 257 | echo "#SBATCH --array=1-${ARRAY}${addArray}" >> $TMPFILE 258 | fi 259 | 260 | if [ ! -z "$DEPENDS" ]; then 261 | echo "#SBATCH --dependency=afterok:${DEPENDS}" >> $TMPFILE 262 | fi 263 | 264 | if [ ! 
-z "$TIME" ]; then 265 | echo "#SBATCH --time=${TIME}" >> $TMPFILE 266 | #after 4h we need medium queue, system does not resolve it automatically 267 | queueID=$(echo $TIME |perl -ne 'chomp($_); @d=split(":",$_); if($d[0]=~/(\d+)\-(\d+)/){ 268 | $days=$1; 269 | $hours=$2; 270 | $hours=~s/^0//; 271 | $h=$days*24+$hours; 272 | }else{ 273 | $h=$d[0]; 274 | $h=~s/^0//; 275 | } 276 | $qId="short"; 277 | if($h>48){ 278 | $qId='long'; 279 | }elsif($h>4){ 280 | $qId="medium"; 281 | } 282 | print $qId; 283 | ') 284 | if [ "$queueID" != "short" ]; then 285 | echo "#SBATCH --qos=$queueID" >> $TMPFILE 286 | fi 287 | fi 288 | 289 | #send email at the end of the job, this includes abort 290 | if [ ! -z "$EMAIL" ]; then 291 | echo "#SBATCH --mail-user=$EMAIL" >> $TMPFILE 292 | echo "#SBATCH --mail-type=FAIL" >> $TMPFILE 293 | fi 294 | 295 | # wait for a job to finish 296 | if [ "$SYNC" == "yes" ]; then 297 | echo "#SBATCH --wait" >> $TMPFILE 298 | fi 299 | 300 | ## write command to temp file (stdin or rest of command line) 301 | if [ $# -eq 0 ]; then 302 | cat >> $TMPFILE 303 | else 304 | echo $@ >> $TMPFILE 305 | fi 306 | 307 | ## add submit command as comment 308 | echo -e "\n# submit-command is: sbatch -e ${TMPFILE}.err -o ${TMPFILE}.out $TMPFILE" >> $TMPFILE 309 | 310 | ## submit 311 | echo "sbatch -e ${TMPFILE}.err -o ${TMPFILE}.out $TMPFILE" 312 | sbatch -e ${TMPFILE}.err -o ${TMPFILE}.out $TMPFILE 313 | 314 | fi #slurm 315 | 316 | ## remove tmpfile for submission (if in /tmp/ otherwise keep) 317 | if [ $TMPDIR = "/tmp/" ]; then 318 | rm $TMPFILE 319 | fi 320 | -------------------------------------------------------------------------------- /Oligo_UMISTARRseq/oligo_UMISTARRseq_pipeline.sh: -------------------------------------------------------------------------------- 1 | 2 | ####################### 3 | ## Process oligo genome-wide STARR-seq sequencing data 4 | ####################### 5 | 6 | folder=Oligo_UMISTARRseq 7 | cd $folder 8 | 9 | # folder to write results 10 | dataFolder=$folder/data 11 | mkdir -p $dataFolder 12 | 13 | # wrapper to submit jobs to cluster 14 | bsub=bsub_gridengine 15 | 16 | ## NOTE ## 17 | # The original sequencing data at our institute is provided in a BAM file containing the reads from a whole lane together with their the i5 and i7 indexes. 18 | # The script below is based on this BAM file as input and requires barcodes to demultiplex the reads and map them with bowtie. 19 | # However we can only provide fastq files of the demultiplexed reads for each experiment. 20 | # For samples with unique molecular identifiers (UMIs) at the i7 index, the UMI information is included in the read name and can be used to collapse reads with identical UMIs. 
21 | 
22 | #######################
23 | ## paired-end mapping with bowtie
24 | #######################
25 | 
26 | ## make an experiment file with all sample information and respective barcodes for demultiplexing from the main sequencing BAM file --> $dataFolder/experiment.txt
27 | head $dataFolder/experiment.txt
28 | experimentFile=$dataFolder/experiment.txt
29 | 
30 | # to submit to the cluster
31 | mkdir -p log_mapping
32 | 
33 | # GENOME is a reference index containing the 249 bp long sequences included in the library
34 | ALIGNER=bowtie_pe_NoRevCompMapping.sh # no mapping against the reverse-complement reference strand
35 | 
36 | # map with no mismatches
37 | BOWTIE_MM=0
38 | 
39 | grep -v -E "^#" $experimentFile | \
40 | while read line; do
41 | INFILE=$( echo $line | awk '{print $2}' )
42 | outdir=$dataFolder
43 | BARCODES14=$( echo $line | awk '{print $9}' )
44 | BARCODES12=$( echo $line | awk '{print $11}' )
45 | OUTFILE=$( echo $line | awk '{print $10}')
46 | BARCODE14_LEN=$( echo $BARCODES14 | awk '{print length($1)}' )
47 | BARCODE12_LEN=$( echo $BARCODES12 | awk '{print length($1)}' )
48 | GENOME=$( echo $line | awk '{print $5}' ) # using my own index
49 | 
50 | # with UMIs
51 | if [ "$BARCODES14" == "UMI" ]; then
52 | BARCODE14_LEN=10 # length of UMI
53 | $bsub -o log_mapping -C 10 -T '5:00:00' -n "${OUTFILE}_mapping" "$ALIGNER -i $INFILE -o ${outdir}/${OUTFILE}.bb -B $BARCODES12 -L $BARCODE12_LEN -l $BARCODE14_LEN --umi -f A -g $GENOME -m $BOWTIE_MM" > log_mapping/msg.$OUTFILE.tmp
54 | else
55 | # no UMIs
56 | $bsub -o log_mapping -C 10 -n "${OUTFILE}_mapping" "$ALIGNER -i $INFILE -o ${outdir}/${OUTFILE}.bb -B $BARCODES12 -L $BARCODE12_LEN -b $BARCODES14 -l $BARCODE14_LEN -f A -g $GENOME -m $BOWTIE_MM" > log_mapping/msg.$OUTFILE.tmp
57 | fi
58 | done
59 | 
--------------------------------------------------------------------------------
/Oligo_UMISTARRseq/slippage_filter_pe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -o errexit
3 | set -o pipefail
4 | 
5 | # Stark Lab in-house pipeline. Adapted by Bernardo Almeida 2020
6 | 
7 | ################################################################################
8 | # Set default values
9 | ################################################################################
10 | 
11 | MM="2"
12 | L="10"
13 | OFFSET="1"
14 | D="2000"
15 | memory=4G # memory restrictions for shell sort
16 | 
17 | ################################################################################
18 | # Help
19 | ################################################################################
20 | 
21 | if [ $# -eq 0 ]; then
22 | echo >&2 "
23 | $(basename $0) - Filter putative slippage fragments in paired-end data
24 | 
25 | USAGE: cat BED file | $(basename $0) - [OPTIONS] --
26 | -m Maximum number of mismatches for sequence comparison [default: $MM]
27 | -l Sequence length for comparison [default: $L]
28 | -o Offset for sequence to compare [default: $OFFSET]
29 | -d Maximum distance for filtering within a cluster [default: $D]
30 | -M Memory restrictions for shell sort [default: $memory]
31 | 
32 | "
33 | exit 1
34 | fi
35 | 
36 | ################################################################################
37 | # Parse input
38 | ################################################################################
39 | 
40 | while getopts "m:l:o:d:M:" o
41 | do
42 | case "$o" in
43 | m) MM="$OPTARG";;
44 | l) L="$OPTARG";;
45 | o) OFFSET="$OPTARG";;
46 | d) D="$OPTARG";;
47 | M) memory="$OPTARG";;
48 | \?) exit 1;;
49 | esac
50 | done
51 | 
52 | ################################################################################
53 | # Run program
54 | ################################################################################
55 | 
56 | # Sort by strand, chr, start (lower on top), end (higher on top), and mismatches
57 | # The longest fragment always appears first
58 | 
59 | sort -k6,6 -k1,1 -k2,2n -k3,3nr -k5,5n -S $memory| \
60 | awk -vD=$D -vL=$L -vMM=$MM -vO=$OFFSET '
61 | function dist(a,b, i,sa,sb,m)   # number of mismatches between two length-L strings (Ns are ignored)
62 | {
63 | m=0
64 | for(i=1;i<=L;i++){
65 | sa=substr(a,i,1);sb=substr(b,i,1)
66 | if(sa!=sb && sa!="N" && sb!="N"){m++}
67 | }
68 | return m
69 | }
70 | {split($4,seq,"_"); ok=1}   # the two mate sequences are encoded in the name field, separated by "_"
71 | (NR>1 && $6==strand && $1==chr) {   # same chromosome and strand as the previous fragment
72 | if($2==start && $3>=end-D && $3<=end && strand=="+"){
73 | SEQS[seq_r]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[2],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
74 | }
75 | else if($2==start && $3>=end-D && $3<=end && strand=="-"){
76 | SEQS[seq_l]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[1],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
77 | }
78 | else if($2<=start+D && $2>start && $3==end && strand=="+"){
79 | SEQS[seq_l]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[1],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
80 | }
81 | else if($2<=start+D && $2>start && $3==end && strand=="-"){
82 | SEQS[seq_r]=1; for(SEQ in SEQS){DIFF=dist(substr(seq[2],O,L),SEQ);if(DIFF<=MM){ok=0; break}}
83 | }
84 | else{
85 | for (SEQ in SEQS){delete SEQS[SEQ]}   # fragment starts a new cluster: reset the sequence cache
86 | }
87 | }
88 | (NR>1 && ($6!=strand || $1!=chr)){for (SEQ in SEQS){delete SEQS[SEQ]}}
89 | {chr=$1;start=$2;end=$3;strand=$6;seq_l=substr(seq[1],O,L);seq_r=substr(seq[2],O,L)}
90 | {if(ok){print $0}}' | \
91 | sort -k1,1 -k2,2n -k3,3nr -k6,6 -k5,5n -k4,4 -S $memory| \
92 | awk '!x[$1" "$2" "$3" "$6]++'
93 | 
94 | exit 0
95 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepSTARR
2 | **DeepSTARR** is a deep learning model built to quantitatively predict the activities of developmental and housekeeping enhancers from DNA sequence in *Drosophila melanogaster* S2 cells.
3 | 
4 | For more information, see the DeepSTARR publication:
5 | [*DeepSTARR predicts enhancer activity from DNA sequence and enables the de novo design of synthetic enhancers*](https://www.nature.com/articles/s41588-022-01048-5)
6 | Bernardo P. de Almeida, Franziska Reiter, Michaela Pagani, Alexander Stark. Nature Genetics, 2022.
7 | [Presentation at ISCB Webinar](https://www.youtube.com/watch?v=vg32mqptMdQ)
8 | 
9 | This repository contains the code used to process genome-wide and oligo UMI-STARR-seq data and train DeepSTARR.
10 | 
11 | ## Genome-wide enhancer activity maps of developmental and housekeeping enhancers
12 | We used **UMI-STARR-seq** ([Arnold et al., 2013](http://www.sciencemag.org/lookup/doi/10.1126/science.1232542); [Neumayr et al., 2019](https://doi.org/10.1002/cpmb.105)) to generate genome-wide, high-resolution, quantitative activity maps of developmental and housekeeping enhancers, representing the two main transcriptional programs in *Drosophila* S2 cells ([Arnold et al., 2017](http://dx.doi.org/doi:10.1038/nbt.3739); [Haberle et al., 2019](https://doi.org/10.1038/s41586-019-1210-7); [Zabidi et al., 2015](http://dx.doi.org/10.1038/nature13994)).
13 | 
14 | 
15 | 
16 | The raw sequencing data are available from GEO under accession number [GSE183939](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE183939).
17 | You can find the code to process the data [here](GenomeWide_UMISTARRseq). 18 | 19 | ## DeepSTARR model 20 | 21 | **DeepSTARR** is a multi-task convolutional neural network that maps 249 bp long DNA sequences to both their developmental and their housekeeping enhancer activities. We adapted the Basset convolutional neural network architecture ([Kelley et al., 2016](https://github.com/davek44/Basset)) and designed DeepSTARR with four convolution layers, each followed by a max-pooling layer, and two fully connected layers. The convolution layers identify local sequence features (e.g. TF motifs) and increasingly complex patterns (e.g. TF motif syntax), while the fully connected layers combine these features and patterns to predict enhancer activity separately for each enhancer type. 22 | 23 |
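For orientation, below is a minimal Keras sketch of a DeepSTARR-like multi-task network. It illustrates the architecture described above but is not the trained model: the layer sizes, dropout rate and training objective shown here are assumptions for illustration, so use the trained model from zenodo (linked below) for real predictions.
```
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, BatchNormalization, Activation, Dropout
from keras.models import Model

seq = Input(shape=(249, 4))  # one-hot encoded 249 bp sequence (A/C/G/T channels)
x = seq
for n_filters, kernel_size in [(256, 7), (60, 3), (60, 5), (120, 3)]:  # illustrative sizes
    x = Conv1D(n_filters, kernel_size, padding='same')(x)  # local sequence features (TF motifs)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D(2)(x)  # coarsen the resolution after each convolution block
x = Flatten()(x)
for units in (256, 256):  # two fully connected layers combine features and patterns
    x = Dense(units)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.4)(x)
out_dev = Dense(1, activation='linear', name='Dense_dev')(x)  # developmental activity
out_hk = Dense(1, activation='linear', name='Dense_hk')(x)    # housekeeping activity

model = Model(seq, [out_dev, out_hk])
model.compile(optimizer='adam', loss='mse')  # one regression loss per output head
```
Both output heads share all convolutional and fully connected layers, so the model learns developmental and housekeeping sequence rules jointly while predicting each activity separately.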
24 | 25 | 26 |
27 | 
28 | You can find the code used to train DeepSTARR and compute nucleotide contribution scores [here](DeepSTARR).
29 | Data used to train and evaluate the DeepSTARR model as well as the final trained model are available on zenodo at https://doi.org/10.5281/zenodo.5502060.
30 | DeepSTARR is also deposited in [Kipoi](http://kipoi.org/models/DeepSTARR/).
31 | 
32 | ### Tutorial
33 | An end-to-end example to train DeepSTARR and to compute the nucleotide contribution scores and modisco TF motifs is provided in the following colab notebook: https://colab.research.google.com/drive/1Xgak40TuxWWLh5P5ARf0-4Xo0BcRn0Gd. You can run this notebook yourself to experiment with DeepSTARR.
34 | 
35 | ### Predict developmental and housekeeping enhancer activity of new DNA sequences
36 | To predict the developmental and housekeeping enhancer activity in *Drosophila melanogaster* S2 cells for new DNA sequences, please run:
37 | ```
38 | # Clone this repository
39 | git clone https://github.com/bernardo-de-almeida/DeepSTARR.git
40 | cd DeepSTARR/DeepSTARR
41 | 
42 | # download the trained DeepSTARR model from zenodo (https://doi.org/10.5281/zenodo.5502060)
43 | 
44 | # create 'DeepSTARR' conda environment by running the following:
45 | conda create --name DeepSTARR python=3.7 tensorflow=1.14.0 keras=2.2.4 # or tensorflow-gpu/keras-gpu if you are using a GPU
46 | source activate DeepSTARR
47 | pip install git+https://github.com/AvantiShri/shap.git@master
48 | pip install 'h5py<3.0.0'
49 | pip install deeplift==0.6.13.0
50 | 
51 | # Run prediction script
52 | python DeepSTARR_pred_new_sequence.py -s Sequences_example.fa -m DeepSTARR.model
53 | ```
54 | Where:
55 | * -s FASTA file with input DNA sequences; -m the trained DeepSTARR model downloaded from zenodo
56 | 
57 | ## UMI-STARR-seq with designed oligo libraries to test more than 40,000 wildtype and mutant Drosophila and human enhancers
58 | 
59 | We designed and synthesised (in oligo pools by [Twist Bioscience](https://www.twistbioscience.com/resources/product-sheet/twist-oligo-pools)) wildtype and TF motif-mutant sequences of Drosophila and human enhancers. The activity of each sequence in the oligo libraries was assessed experimentally by **UMI-STARR-seq** in *Drosophila melanogaster* S2 (both developmental and housekeeping UMI-STARR-seq; see figure below) and human HCT116 cells, respectively.
60 | 
61 | 
62 | 
63 | The raw sequencing data are available from GEO under accession number [GSE183939](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE183939).
64 | You can find the code to analyse *Drosophila* and human oligo UMI-STARR-seq screens [here](Oligo_UMISTARRseq).
65 | 
66 | ## Code for Figures
67 | Scripts to reproduce each main figure can be found [here](Figures) and the respective processed data [here](https://data.starklab.org/almeida/DeepSTARR/Figures_data/).
68 | 
69 | ## UCSC Genome Browser tracks
70 | Genome browser tracks showing genome-wide UMI-STARR-seq and DeepSTARR predictions in *Drosophila*, including nucleotide contribution scores for all enhancer sequences, together with the enhancers used for mutagenesis, mutated motif instances and the respective log2 fold-changes in enhancer activity, are available at https://genome.ucsc.edu/s/bernardo.almeida/DeepSTARR_manuscript.
71 | Dynamic sequence tracks and contribution scores are also available as a [Reservoir Genome Browser session](https://resgen.io/paper-data/Almeida...%202021%20-%20DeepSTARR/views/VNZrgd8oSsCpfZfwByDlwA).
72 | 73 | ## Questions 74 | If you have any questions/requests/comments please contact me at [bernardo.almeida94@gmail.com](mailto:bernardo.almeida94@gmail.com). 75 | -------------------------------------------------------------------------------- /img/DeepSTARR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/img/DeepSTARR.png -------------------------------------------------------------------------------- /img/DeepSTARR_predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/img/DeepSTARR_predictions.png -------------------------------------------------------------------------------- /img/gw_UMISTARRseq.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/img/gw_UMISTARRseq.png -------------------------------------------------------------------------------- /img/gw_UMISTARRseq_UMISTARRseq.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/img/gw_UMISTARRseq_UMISTARRseq.pdf -------------------------------------------------------------------------------- /img/oligo_UMISTARRseq_enh_mutants.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bernardo-de-almeida/DeepSTARR/b02e460c7581934bb6c8910e53be04da10688781/img/oligo_UMISTARRseq_enh_mutants.png --------------------------------------------------------------------------------