├── .gitattributes ├── README.md ├── cnnsvr.py ├── data ├── HCT116.csv ├── HEK293T.csv ├── HELA.csv ├── HL60.csv ├── benchmark_dataset.csv ├── testing_example.csv └── training_example.csv └── weights └── weights.h5 /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CNN-SVR 2 | ## Overview 3 | CNN-SVR is a deep learning-based method for CRISPR/Cas9 guide RNA (gRNA) on-target cleavage efficacy prediction. It is composed of two major components: a merged CNN as the front-end for extracting gRNA and epigenetic features as well as an SVR as the back-end for regression and predicting gRNA cleavage efficiency. 4 | 5 | ## Pre-requisite: 6 | * **Ubuntu 16.04** 7 | * **Anaconda 3-5.2.0** 8 | * **Python packages:** 9 | [numpy](https://numpy.org/) 1.16.4 10 | [pandas](https://pandas.pydata.org/) 0.23.0 11 | [scikit-learn](https://scikit-learn.org/stable/) 0.19.1 12 | [scipy](https://www.scipy.org/) 1.1.0 13 | * **[Keras](https://keras.io/) 2.1.0** 14 | * **Tensorflow and dependencies:** 15 | [Tensorflow](https://tensorflow.google.cn/) 1.4.0 16 | CUDA 8.0 (for GPU use) 17 | cuDNN 6.0 (for GPU use) 18 | 19 | ## Installation guide 20 | #### **Operating system** 21 | Ubuntu 16.04 download from https://www.ubuntu.com/download/desktop 22 | #### **Python and packages** 23 | Download Anaconda 3-5.2.0 tarball on https://www.anaconda.com/distribution/#download-section 24 | #### **Tensorflow installation:** 25 | pip install tensorflow-gpu==1.4.0 (for GPU use) 26 | pip install tensorflow==1.4.0 (for CPU use) 27 | #### **CUDA toolkit 8.0 (for GPU use)** 28 | Download CUDA tarball on https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run 29 | #### 
**cuDNN 6.1.10 (for GPU use)** 30 | Download cuDNN tarball on https://developer.nvidia.com/cudnn 31 | 32 | ## Content 33 | * **data:** the training and testing examples with gRNA sequence and corresponding epigenetic features and label indicating the on-target cleavage efficacy 34 | * **weights/weights.h5:** the well-trained weights for our model 35 | * **cnnsvr.py:** the python code, it can be run to reproduce our results 36 | 37 | ## Usage 38 | #### **python cnnsvr.py** 39 | **Note:** 40 | * The input training and testing files should include the gRNA sequence (23 bp), four corresponding epigenetic feature sequences encoded with the symbols "A" and "N" (each of length 23), and a label for each gRNA sequence. 41 | * The training and testing CSV files (data/training_example.csv and data/testing_example.csv) can be replaced or modified to include gRNA sequence and four epigenetic features of interest 42 | 43 | ## Demo instructions 44 | #### **Input (gRNA sequence and four epigenetic features):** 45 | * #### **Data format:** 46 | **gRNA:** TGAGAAGTCTATGAGCTTCAAGG (23bp) 47 | **CTCF:** NNNNNNNNNNNNNNNNNNNNNNN 48 | **Dnase:** AAAAAAAAAAAAAAAAAAAAAAA 49 | **H3K4me3:** NNNNNNNNNNNNNNNNNNNNNNN 50 | **RRBS:** NNNNNNNNNNNNNNNNNNNNNNN 51 | #### **Load weights (Pre-trained weight file):** 52 | weights/weights.h5 53 | #### **Run script:** 54 | python cnnsvr.py 55 | #### **Output (Predicted activity score for gRNA):** 56 | 0.22743436 57 | -------------------------------------------------------------------------------- /cnnsvr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import os 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | 6 | # import tensorflow as tf 7 | # import keras.backend.tensorflow_backend as KTF 8 | # 9 | # config = tf.ConfigProto() 10 | # config.gpu_options.per_process_gpu_memory_fraction = 0.95 11 | # session = tf.Session(config=config) 12 | # 13 | # KTF.set_session(session) 14 | 15 | from keras.models import Model 16 | from keras.layers import Input 17 | from 
import numpy as np
import pandas as pd

# Number of positions encoded per gRNA / epigenetic track.
_SEQ_LENGTH = 23

# One-hot channel of each nucleotide (upper or lower case). Any other
# symbol (e.g. "N") is absent on purpose and leaves an all-zero row.
_BASE_TO_CHANNEL = {
    "A": 0, "a": 0,
    "C": 1, "c": 1,
    "G": 2, "g": 2,
    "T": 3, "t": 3,
}


def grna_preprocess(lines, length=_SEQ_LENGTH):
    """One-hot encode gRNA sequences.

    Args:
        lines: sequence of strings, one gRNA per element.
        length: number of leading positions to encode (default 23).

    Returns:
        Integer array of shape ``(len(lines), length, 4)`` whose last axis
        is ordered A, C, G, T. Unrecognised symbols stay all-zero.

    Raises:
        IndexError: if a sequence is shorter than ``length``.
    """
    seq = np.zeros((len(lines), length, 4), dtype=int)
    for row, sequence in enumerate(lines):
        for pos in range(length):
            channel = _BASE_TO_CHANNEL.get(sequence[pos])
            if channel is not None:
                seq[row, pos, channel] = 1
    return seq


def epi_preprocess(lines, length=_SEQ_LENGTH):
    """Binary-encode epigenetic tracks: ``"A"`` -> 1, anything else -> 0.

    Args:
        lines: sequence of strings, one track per element.
        length: number of leading positions to encode (default 23).

    Returns:
        Integer array of shape ``(len(lines), length)``.

    Raises:
        IndexError: if a track is shorter than ``length``.
    """
    epi = np.zeros((len(lines), length), dtype=int)
    for row, track in enumerate(lines):
        for pos in range(length):
            # The array starts zeroed, so only the "A" case needs a write.
            if track[pos] == "A":
                epi[row, pos] = 1
    return epi


def _stack_epi_tracks(columns):
    """Encode each column of tracks and stack them on a new last axis."""
    return np.stack([epi_preprocess(column) for column in columns], axis=-1)


def preprocess(file_path, usecols):
    """Read epigenetic columns from a CSV file and encode them.

    Args:
        file_path: path of the CSV file.
        usecols: column selection passed to ``pandas.read_csv``; the first
            four selected columns are used, in the order they are returned
            (the caller in this file passes ctcf, dnase, h3k4me3, rrbs).

    Returns:
        Integer array of shape ``(n_rows, 23, 4)``; the last axis indexes
        the four tracks.
    """
    table = np.array(pd.read_csv(file_path, usecols=usecols))
    return _stack_epi_tracks([table[:, k] for k in range(4)])


def _load_split(file_path):
    """Parse one CSV once and return ``(seq, epi, y)`` for it."""
    # Columns 4..9 are, per the example file's header:
    # seq, ctcf, dnase, h3k4me3, rrbs, indel_frequency.
    table = np.array(pd.read_csv(file_path, usecols=[4, 5, 6, 7, 8, 9]))
    seq = grna_preprocess(table[:, 0])
    epi = _stack_epi_tracks([table[:, k] for k in range(1, 5)])
    # The mixed str/float table is object-typed; make the labels numeric.
    y = table[:, 5].astype(float).reshape(-1, 1)
    return seq, epi, y


def load_data(train_file, test_file):
    """Load and encode the training and testing CSV files.

    Each file must provide, at column positions 4-9, the gRNA sequence,
    four epigenetic tracks and the numeric label.

    Args:
        train_file: path of the training CSV.
        test_file: path of the testing CSV.

    Returns:
        Tuple ``(train_seq, test_seq, train_epi, test_epi, train_y,
        test_y)``; the ``*_seq`` and ``*_epi`` arrays have shape
        ``(n, 23, 4)`` and the ``*_y`` arrays are float with shape
        ``(n, 1)``.
    """
    train_seq, train_epi, train_y = _load_split(train_file)
    test_seq, test_epi, test_y = _load_split(test_file)
    return train_seq, test_seq, train_epi, test_epi, train_y, test_y
def _conv_dense_branch(prefix, first_activation_name, dropout):
    """Build one CNN branch (2x conv/pool, then 4 dense layers).

    Layer names are derived from ``prefix`` so that they match the names
    stored in the weight file; the first activation's name is passed
    explicitly because the two branches do not name it consistently.

    Returns:
        ``(input_tensor, output_tensor)``; the output has 40 units.
    """
    # Keras is imported here so the helper only needs it when called.
    from keras.layers import Input
    from keras.layers.core import Dense, Dropout, Activation, Flatten
    from keras.layers.convolutional import Convolution1D, AveragePooling1D

    branch_input = Input(shape=(23, 4))
    x = branch_input

    activation_names = (first_activation_name, prefix + '_activation_2')
    for idx, activation_name in enumerate(activation_names, start=1):
        x = Convolution1D(256, 5, kernel_initializer='glorot_uniform',
                          name='%s_conv_%d' % (prefix, idx))(x)
        x = Activation('relu', name=activation_name)(x)
        x = AveragePooling1D(2, name='%s_pooling_%d' % (prefix, idx))(x)
        x = Dropout(dropout)(x)
    x = Flatten()(x)

    for idx, units in enumerate((256, 128, 64), start=1):
        x = Dense(units, activation='relu',
                  name='%s_dense_%d' % (prefix, idx))(x)
        x = Dropout(dropout)(x)
    branch_output = Dense(40, activation='relu',
                          name='%s_dense_4' % prefix)(x)
    return branch_input, branch_output


# Build model
def build_model(weights_path="weights/weights.h5"):
    """Build the merged two-branch CNN and load its pre-trained weights.

    One branch takes the one-hot gRNA sequence and the other the four
    epigenetic tracks; both inputs have shape ``(23, 4)``. The two 40-unit
    branch outputs are concatenated (sequence features first) into an
    80-wide feature tensor.

    Weights are loaded by layer name into the network that ends at the
    concatenation. The final ``prediction`` layer is created afterwards,
    so it is NOT loaded from the file and keeps its initial weights.

    Args:
        weights_path: HDF5 weight file to load (default
            ``"weights/weights.h5"``).

    Returns:
        ``(merged, model)``: the concatenated feature tensor and the full
        Keras model ending in the 1-unit linear ``prediction`` layer.
    """
    from keras.models import Model
    from keras.layers.core import Dense
    from keras.layers.merge import concatenate

    dropout = 0.3
    # The sequence branch is built first so auto-generated layer names
    # (inputs, dropouts, flattens) are numbered in the original order.
    seq_input, seq_out = _conv_dense_branch('seq', 'seq_activation1', dropout)
    epi_input, epi_out = _conv_dense_branch('epi', 'epi_activation_1', dropout)

    merged = concatenate([seq_out, epi_out], axis=-1)

    pretrain_model = Model(inputs=[seq_input, epi_input], outputs=[merged])

    # Load weights for the model
    pretrain_model.load_weights(weights_path, by_name=True)

    prediction = Dense(1, activation='linear', name='prediction')(merged)
    model = Model([seq_input, epi_input], prediction)
    return merged, model
# Columns of the merged CNN feature vector that are fed to the SVR.
# NOTE(review): every index is < 40; if build_model concatenates the 40
# sequence-branch features first, no epigenetic-branch feature reaches
# the SVR -- confirm this is intended.
SELECTED_CNN_FEATURE_COLS = [17, 26, 9, 19, 30, 6, 12, 39, 36, 21, 22, 3, 25]

# Hyper-parameters of the SVR back-end.
SVR_PARAMS = dict(kernel="rbf", gamma=0.12, C=1.7, epsilon=0.11, verbose=True)


def main(train_path="data/training_example.csv",
         test_path="data/testing_example.csv",
         report_spearman=False):
    """Extract CNN features, fit the SVR and predict on the test set.

    Args:
        train_path: CSV file used to fit the SVR.
        test_path: CSV file to predict on.
        report_spearman: when true, also print the Spearman correlation
            between the test labels and the predictions.

    Returns:
        Array of predicted scores, one per test row (also printed).
    """
    from keras.models import Model
    from sklearn.svm import SVR

    # Load data
    seq_train, seq_test, epi_train, epi_test, y_train, y_test = load_data(
        train_path, test_path)

    merged, model = build_model()

    # Model that stops at the merged features instead of the prediction.
    feature_extractor = Model(model.inputs, outputs=[merged])
    x_train = feature_extractor.predict([seq_train, epi_train])
    x_test = feature_extractor.predict([seq_test, epi_test])

    # Select important features from initial CNN features
    x_train = x_train[:, SELECTED_CNN_FEATURE_COLS]
    x_test = x_test[:, SELECTED_CNN_FEATURE_COLS]

    clf = SVR(**SVR_PARAMS)

    # Fit the SVR model according to the given training data
    clf.fit(x_train, y_train.ravel())

    # Perform regression on samples in x_test
    y_pred = clf.predict(x_test)
    print(y_pred)

    if report_spearman:
        from scipy.stats import spearmanr

        spearman_correlation, _ = spearmanr(y_test.ravel(), y_pred)
        print("Spearman correlation=%.3f" % spearman_correlation)

    return y_pred


if __name__ == '__main__':
    main()