├── .gitattributes ├── 5cv_33class ├── 5cv_1D_CNN_33class.py ├── 5cv_Vanilla_33class.py └── 5cv_hybrid_33class.py ├── 5cv_34class ├── 5cv_1D_CNN_34class.py ├── 5cv_Vanilla_34class.py └── 5cv_hybrid_34class.py ├── Hyperparameter tuning ├── 1d_CNN_33class_hyperparameters.py └── vanilla_CNN_33class_hyperparameters.py ├── README.md └── image └── git_models_fig.png /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /5cv_33class/5cv_1D_CNN_33class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential, Model 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | 24 | 25 | A = open('TCGA_new_pre_second.pckl', 'rb') 26 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 27 | project_ids_new] = pickle.load(A) 28 | A.close() 29 | 30 | f = open('TCGA_new_pre_first.pckl', 'rb') 31 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 32 | f.close() 33 | 34 | 35 | ## embedding labels 36 | # integer encode 37 | label_encoder = LabelEncoder() 38 | integer_encoded = label_encoder.fit_transform(project_ids_new) 39 | # binary encode 40 | onehot_encoder = OneHotEncoder(sparse=False) 41 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 42 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 43 | 44 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 45 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 46 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 47 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 48 | 49 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 50 | ## add nine zeros to the end of each sample 51 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 52 | 53 | ## This line is useful when only one fold training is needed 54 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 55 | stratify= onehot_encoded_cancer_samples, 56 | test_size=0.25, random_state=42) 57 | 58 | 59 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 60 | num_classes = len(y_train[0]) 61 | batch_size = 128 62 | epochs = 20 63 | seed = 7 64 | np.random.seed(seed) 65 | 66 | 
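## `train_labels` (used just below) is not defined anywhere in this script; a plausible
## definition -- an assumption here, mirroring how the 34-class scripts build their labels --
## is the cancer-type project ID of each tumor sample:
train_labels = project_ids_new[remain_cancer_ids_ind]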
67 | input_Xs = X_cancer_samples_mat 68 | y_s = train_labels 69 | 70 | 71 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 72 | cvscores = [] 73 | 74 | for j in range(10): 75 | i = 0 76 | for train, test in kfold.split(input_Xs, y_s): 77 | 78 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 79 | input_shape = (img_rows, img_cols, 1) 80 | input_Xs = input_Xs.astype('float32') 81 | 82 | label_encoder = LabelEncoder() 83 | integer_encoded = label_encoder.fit_transform(y_s) 84 | # binary encode 85 | onehot_encoder = OneHotEncoder(sparse=False) 86 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 87 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 88 | num_classes = len(onehot_encoded[0]) 89 | 90 | model = Sequential() 91 | ## *********** First layer Conv 92 | model.add(Conv2D(32, kernel_size=(1, 71), strides=(1, 1), 93 | input_shape=input_shape)) 94 | model.add(Activation('relu')) 95 | model.add(MaxPooling2D(1, 2)) 96 | ## ********* Classification layer 97 | model.add(Flatten()) 98 | model.add(Dense(128, activation='relu')) 99 | model.add(Dense(num_classes, activation='softmax')) 100 | model.compile(loss='categorical_crossentropy', 101 | optimizer='adam', 102 | metrics=['categorical_accuracy']) 103 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 104 | if i==0: 105 | model.summary() 106 | i = i +1 107 | history = model.fit(input_Xs[train], onehot_encoded[train], 108 | batch_size=batch_size, 109 | epochs=epochs, 110 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 111 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 112 | # print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 113 | cvscores.append(scores[1] * 100) 114 | 115 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 116 | -------------------------------------------------------------------------------- /5cv_33class/5cv_Vanilla_33class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 
4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential, Model 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | 24 | 25 | A = open('TCGA_new_pre_second.pckl', 'rb') 26 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 27 | project_ids_new] = pickle.load(A) 28 | A.close() 29 | 30 | f = open('TCGA_new_pre_first.pckl', 'rb') 31 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 32 | f.close() 33 | 34 | 35 | ## embedding labels 36 | # integer encode 37 | label_encoder = LabelEncoder() 38 | integer_encoded = label_encoder.fit_transform(project_ids_new) 39 | # binary encode 40 | onehot_encoder = OneHotEncoder(sparse=False) 41 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 42 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 43 | 44 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 45 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 46 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 47 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 48 | 49 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 50 | ## add nine zeros to the end of each sample 51 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 52 | 53 | ## This line is useful when only one fold training is needed 54 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 55 | stratify= onehot_encoded_cancer_samples, 56 | test_size=0.25, random_state=42) 57 | 58 | 59 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 60 | num_classes = len(y_train[0]) 61 | batch_size = 128 62 | epochs = 20 63 | # fix random seed for reproducibility 64 | seed = 7 65 | np.random.seed(seed) 66 | 67 | 68 | input_Xs = X_cancer_samples_mat 69 | y_s = train_labels 70 | 71 | 72 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 73 | cvscores = [] 74 | 75 | for j in range(10): 76 | i = 0 77 | for train, test in kfold.split(input_Xs, y_s): 78 | 79 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 80 | input_shape = (img_rows, img_cols, 1) 81 | input_Xs = input_Xs.astype('float32') 82 | 83 | label_encoder = LabelEncoder() 84 | integer_encoded = label_encoder.fit_transform(y_s) 85 | # binary encode 86 | onehot_encoder = OneHotEncoder(sparse=False) 87 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 88 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 89 | num_classes = len(onehot_encoded[0]) 90 | 91 | model = Sequential() 92 | ## *********** First layer Conv 93 | 
model.add(Conv2D(32, kernel_size=(10, 10), strides=(1, 1), 94 | input_shape=input_shape)) 95 | model.add(Activation('relu')) 96 | model.add(MaxPooling2D(2, 2)) 97 | ## ********* Classification layer 98 | model.add(Flatten()) 99 | model.add(Dense(128, activation='relu')) 100 | model.add(Dense(num_classes, activation='softmax')) 101 | model.compile(loss='categorical_crossentropy', 102 | optimizer='adam', 103 | metrics=['categorical_accuracy']) 104 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 105 | if i==0: 106 | model.summary() 107 | i = i +1 108 | history = model.fit(input_Xs[train], onehot_encoded[train], 109 | batch_size=batch_size, 110 | epochs=epochs, 111 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 112 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 113 | # print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 114 | cvscores.append(scores[1] * 100) 115 | 116 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 117 | -------------------------------------------------------------------------------- /5cv_33class/5cv_hybrid_33class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | import keras 17 | from keras.models import Sequential, Model 18 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 19 | from keras.callbacks import EarlyStopping, ModelCheckpoint 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.layers.advanced_activations import LeakyReLU 22 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 23 | from sklearn.model_selection import StratifiedKFold 24 | 25 | 26 | 27 | 28 | 29 | A = open('TCGA_new_pre_second.pckl', 'rb') 30 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 31 | project_ids_new] = pickle.load(A) 32 | A.close() 33 | 34 | f = open('TCGA_new_pre_first.pckl', 'rb') 35 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 36 | f.close() 37 | 38 | 39 | ## embedding labels 40 | # integer encode 41 | label_encoder = LabelEncoder() 42 | integer_encoded = label_encoder.fit_transform(project_ids_new) 43 | # print(integer_encoded) 44 | # binary encode 45 | onehot_encoder = OneHotEncoder(sparse=False) 46 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 47 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 48 | 49 | 50 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 51 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 52 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 53 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 54 | 55 | X_cancer_samples_mat = 
np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 56 | ## add nine zeros to the end of each sample 57 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 58 | 59 | ## This line is useful when only one fold training is needed 60 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 61 | stratify= onehot_encoded_cancer_samples, 62 | test_size=0.25, random_state=42) 63 | 64 | 65 | img_rows, img_cols = len(x_test[0][0]), len(x_test[0]) 66 | num_classes = len(y_train[0]) 67 | batch_size = 128 68 | epochs = 20 69 | seed = 7 70 | np.random.seed(seed) 71 | 72 | 73 | input_Xs = X_cancer_samples_mat 74 | y_s = train_labels 75 | 76 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 77 | cvscores = [] 78 | 79 | for j in range(10): 80 | i = 0 81 | for train, test in kfold.split(input_Xs, y_s): 82 | 83 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 84 | input_shape = (img_rows, img_cols, 1) 85 | input_Xs = input_Xs.astype('float32') 86 | input_img = Input(input_shape) 87 | label_encoder = LabelEncoder() 88 | integer_encoded = label_encoder.fit_transform(y_s) 89 | # binary encode 90 | onehot_encoder = OneHotEncoder(sparse=False) 91 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 92 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 93 | num_classes = len(onehot_encoded[0]) 94 | 95 | tower_1 = Conv2D(32, (1, 71), activation='relu')(input_img) 96 | tower_1 = MaxPooling2D(1, 2)(tower_1) 97 | tower_1 = Flatten()(tower_1) 98 | 99 | tower_2 = Conv2D(32, (100, 1), activation='relu')(input_img) 100 | tower_2 = MaxPooling2D(1, 2)(tower_2) 101 | tower_2 = Flatten()(tower_2) 102 | 103 | output = keras.layers.concatenate([tower_1, tower_2], axis=1) 104 | out1 = Dense(128, activation='relu')(output) 105 | last_layer = Dense(num_classes, activation='softmax')(out1) 106 | model = Model(input=[input_img], output=last_layer) 107 | model.output_shape 108 | 109 | model.compile(loss='categorical_crossentropy', 110 | optimizer='adam', 111 | metrics=['categorical_accuracy']) 112 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 113 | if i==0: 114 | model.summary() 115 | i = i +1 116 | history = model.fit(input_Xs[train], onehot_encoded[train], 117 | batch_size=batch_size, 118 | epochs=epochs, 119 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 120 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 121 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 122 | cvscores.append(scores[1] * 100) 123 | 124 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 125 | -------------------------------------------------------------------------------- /5cv_34class/5cv_1D_CNN_34class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 
4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | from collections import Counter 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | A = open('TCGA_new_pre_second.pckl', 'rb') 32 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 33 | project_ids_new] = pickle.load(A) 34 | A.close() 35 | 36 | f = open('TCGA_new_pre_first.pckl', 'rb') 37 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 38 | f.close() 39 | 40 | batch_size = 128 41 | epochs = 50 42 | seed = 7 43 | np.random.seed(seed) 44 | 45 | 46 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 47 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 48 | 49 | name_cancer_samples = project_ids_new[remain_cancer_ids_ind] 50 | name_normal_samples = ['Normal Samples'] *len(X_normal_samples) 51 | 52 | X_cancer_samples_34 = np.concatenate((X_cancer_samples,X_normal_samples)) 53 | X_names = np.concatenate((name_cancer_samples,name_normal_samples)) 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples_34,np.zeros((len(X_cancer_samples_34),9))),axis=1) 55 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 56 | 57 | 58 | 59 | 60 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 61 | cvscores = [] 62 | cv_yscores = [] 63 | Y_test =[] 64 | 65 | input_Xs = X_cancer_samples_mat 66 | y_s = X_names 67 | 68 | img_rows, img_cols = len(input_Xs[0][0]), len(input_Xs[0]) 69 | num_classes = len(set(y_s)) 70 | 71 | label_encoder = LabelEncoder() 72 | integer_encoded = label_encoder.fit_transform(y_s) 73 | # binary encode 74 | onehot_encoder = OneHotEncoder(sparse=False) 75 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 76 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 77 | 78 | i = 0 79 | for train, test in kfold.split(X_cancer_samples_34, y_s): # input_Xs in normal case and shuffled should be shuffled_Xs 80 | 81 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 82 | input_shape = (img_rows, img_cols, 1) 83 | input_Xs = input_Xs.astype('float32') 84 | 85 | num_classes = len(onehot_encoded[0]) 86 | 87 | model = Sequential() 88 | ## *********** First layer Conv 89 | model.add(Conv2D(32, kernel_size=(1, 71), strides=(1, 1), 90 | input_shape=input_shape)) 91 | model.add(Activation('relu')) 92 | model.add(MaxPooling2D(1, 2)) 93 | ## ********* Classification layer 94 | model.add(Flatten()) 95 | model.add(Dense(128, activation='relu')) 96 | model.add(Dense(num_classes, activation='softmax')) 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | 
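## EarlyStopping below monitors the training-set categorical accuracy (not 'val_categorical_accuracy')
## and stops training once it fails to improve for 3 consecutive epochs.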
callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 101 | if i==0: 102 | model.summary() 103 | i = i +1 104 | history = model.fit(input_Xs[train], onehot_encoded[train], 105 | batch_size=batch_size, 106 | epochs=epochs, 107 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 108 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 109 | y_score = model.predict(input_Xs[test]) 110 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 111 | 112 | cvscores.append(scores[1] * 100) 113 | Y_test.append(onehot_encoded[test]) 114 | cv_yscores.append(y_score) 115 | 116 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 117 | cv_yscores = np.concatenate(cv_yscores) 118 | Y_test = np.concatenate(Y_test) 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /5cv_34class/5cv_Vanilla_34class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | from collections import Counter 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | A = open('TCGA_new_pre_second.pckl', 'rb') 32 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 33 | project_ids_new] = pickle.load(A) 34 | A.close() 35 | 36 | f = open('TCGA_new_pre_first.pckl', 'rb') 37 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 38 | f.close() 39 | 40 | batch_size = 128 41 | epochs = 50 42 | seed = 7 43 | np.random.seed(seed) 44 | 45 | 46 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 47 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 48 | 49 | name_cancer_samples = project_ids_new[remain_cancer_ids_ind] 50 | name_normal_samples = ['Normal Samples'] *len(X_normal_samples) 51 | 52 | X_cancer_samples_34 = np.concatenate((X_cancer_samples,X_normal_samples)) 53 | X_names = np.concatenate((name_cancer_samples,name_normal_samples)) 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples_34,np.zeros((len(X_cancer_samples_34),9))),axis=1) 55 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 56 | 57 | 58 | 59 | 60 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 61 | cvscores = [] 62 | cv_yscores = [] 63 | Y_test =[] 64 | 65 | input_Xs = X_cancer_samples_mat 66 | y_s = X_names 67 | 68 | img_rows, img_cols = len(input_Xs[0][0]), 
len(input_Xs[0]) 69 | num_classes = len(set(y_s)) 70 | 71 | label_encoder = LabelEncoder() 72 | integer_encoded = label_encoder.fit_transform(y_s) 73 | # binary encode 74 | onehot_encoder = OneHotEncoder(sparse=False) 75 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 76 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 77 | 78 | i = 0 79 | for train, test in kfold.split(X_cancer_samples_34, y_s): # input_Xs in normal case and shuffled should be shuffled_Xs 80 | 81 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 82 | input_shape = (img_rows, img_cols, 1) 83 | input_Xs = input_Xs.astype('float32') 84 | 85 | num_classes = len(onehot_encoded[0]) 86 | 87 | model = Sequential() 88 | ## *********** First layer Conv 89 | model.add(Conv2D(32, kernel_size=(10, 10), strides=(2, 2), 90 | input_shape=input_shape)) 91 | model.add(Activation('relu')) 92 | model.add(MaxPooling2D(2, 2)) 93 | ## ********* Classification layer 94 | model.add(Flatten()) 95 | model.add(Dense(128, activation='relu')) 96 | model.add(Dense(num_classes, activation='softmax')) 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 101 | if i==0: 102 | model.summary() 103 | i = i +1 104 | history = model.fit(input_Xs[train], onehot_encoded[train], 105 | batch_size=batch_size, 106 | epochs=epochs, 107 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 108 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 109 | y_score = model.predict(input_Xs[test]) 110 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 111 | 112 | cvscores.append(scores[1] * 100) 113 | Y_test.append(onehot_encoded[test]) 114 | cv_yscores.append(y_score) 115 | 116 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 117 | cv_yscores = np.concatenate(cv_yscores) 118 | Y_test = np.concatenate(Y_test) 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /5cv_34class/5cv_hybrid_34class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 
4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | import keras 17 | from keras.models import Sequential, Model 18 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 19 | from keras.callbacks import EarlyStopping, ModelCheckpoint 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.layers.advanced_activations import LeakyReLU 22 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 23 | from sklearn.model_selection import StratifiedKFold 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | A = open('TCGA_new_pre_second.pckl', 'rb') 32 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 33 | project_ids_new] = pickle.load(A) 34 | A.close() 35 | 36 | f = open('TCGA_new_pre_first.pckl', 'rb') 37 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 38 | f.close() 39 | 40 | batch_size = 128 41 | epochs = 50 42 | seed = 7 43 | np.random.seed(seed) 44 | 45 | 46 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 47 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 48 | 49 | name_cancer_samples = project_ids_new[remain_cancer_ids_ind] 50 | name_normal_samples = ['Normal Samples'] *len(X_normal_samples) 51 | 52 | X_cancer_samples_34 = np.concatenate((X_cancer_samples,X_normal_samples)) 53 | X_names = np.concatenate((name_cancer_samples,name_normal_samples)) 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples_34,np.zeros((len(X_cancer_samples_34),9))),axis=1) 55 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 56 | 57 | 58 | 59 | 60 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 61 | cvscores = [] 62 | cv_yscores = [] 63 | Y_test =[] 64 | 65 | input_Xs = X_cancer_samples_mat 66 | y_s = X_names 67 | 68 | img_rows, img_cols = len(input_Xs[0][0]), len(input_Xs[0]) 69 | num_classes = len(set(y_s)) 70 | 71 | label_encoder = LabelEncoder() 72 | integer_encoded = label_encoder.fit_transform(y_s) 73 | # binary encode 74 | onehot_encoder = OneHotEncoder(sparse=False) 75 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 76 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 77 | 78 | i = 0 79 | 80 | for train, test in kfold.split(input_Xs, y_s): # input_Xs in normal case and shuffled should be shuffled_Xs 81 | 82 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 83 | input_shape = (img_rows, img_cols, 1) 84 | input_Xs = input_Xs.astype('float32') 85 | input_img = Input(input_shape) 86 | label_encoder = LabelEncoder() 87 | integer_encoded = label_encoder.fit_transform(y_s) 88 | # binary encode 89 | onehot_encoder = OneHotEncoder(sparse=False) 90 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 91 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 92 | num_classes = len(onehot_encoded[0]) 93 | 94 | tower_1 = Conv2D(32, (1, 71), activation='relu')(input_img) 95 | # tower_1 = Conv2D(64, (3, 3), padding='same', activation='relu')(tower_1) 96 | tower_1 = MaxPooling2D(1, 2)(tower_1) 97 | tower_1 
= Flatten()(tower_1) 98 | 99 | tower_2 = Conv2D(32, (100, 1), activation='relu')(input_img) 100 | # tower_2 = Conv2D(64, (5, 5), padding='same', activation='relu')(tower_2) 101 | tower_2 = MaxPooling2D(1, 2)(tower_2) 102 | tower_2 = Flatten()(tower_2) 103 | 104 | 105 | output = keras.layers.concatenate([tower_1, tower_2], axis=1) 106 | 107 | out1 = Dense(128, activation='relu')(output) 108 | last_layer = Dense(num_classes, activation='softmax')(out1) 109 | 110 | model = Model(input=[input_img], output=last_layer) 111 | model.output_shape 112 | 113 | model.compile(loss='categorical_crossentropy', 114 | optimizer='adam', 115 | metrics=['categorical_accuracy']) 116 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 117 | if i==0: 118 | model.summary() 119 | i = i +1 120 | history = model.fit(input_Xs[train], onehot_encoded[train], 121 | batch_size=batch_size, 122 | epochs=epochs, 123 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 124 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 125 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 126 | cvscores.append(scores[1] * 100) 127 | 128 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 129 | -------------------------------------------------------------------------------- /Hyperparameter tuning/1d_CNN_33class_hyperparameters.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import GridSearchCV 13 | 14 | from sklearn.model_selection import train_test_split 15 | import collections 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | from keras.models import Sequential 19 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 20 | from keras.callbacks import EarlyStopping, ModelCheckpoint 21 | from keras.layers.normalization import BatchNormalization 22 | from keras.layers.advanced_activations import LeakyReLU 23 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 24 | from collections import Counter 25 | from keras.wrappers.scikit_learn import KerasClassifier 26 | import keras 27 | 28 | 29 | 30 | A = open('TCGA_new_pre_second.pckl', 'rb') 31 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 32 | project_ids_new] = pickle.load(A) 33 | A.close() 34 | 35 | f = open('TCGA_new_pre_first.pckl', 'rb') 36 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 37 | f.close() 38 | 39 | 40 | ## embedding labels 41 | # integer encode 42 | label_encoder = LabelEncoder() 43 | integer_encoded = label_encoder.fit_transform(project_ids_new) 44 | # binary encode 45 | onehot_encoder = OneHotEncoder(sparse=False) 46 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 47 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 48 | 49 | X_cancer_samples = dropped_genes_final.iloc[:, remain_cancer_ids_ind].T.values 50 | X_normal_samples = 
dropped_genes_final.iloc[:, remain_normal_ids_ind].T.values 51 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 52 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 53 | 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 55 | ## add nine zeros to the end of each sample 56 | # This line dimension needs to be changed for different kernel sizes 57 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 58 | 59 | ## This line is useful when only one fold training is needed 60 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 61 | stratify=onehot_encoded_cancer_samples, 62 | test_size=0.25, random_state=42) 63 | # adding one dimention to feed into CNN 64 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 65 | 66 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 67 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 68 | # X_normal_samples = X_normal_samples.reshape(X_normal_samples.shape[0], img_rows, img_cols, 1) 69 | x_train = x_train.astype('float32') 70 | x_test = x_test.astype('float32') 71 | 72 | 73 | ## Model 74 | 75 | def make_model(dense_layer_sizes, filters, kernel_size): 76 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 77 | num_classes = len(y_train[0]) 78 | input_shape = (img_rows, img_cols, 1) 79 | 80 | model = Sequential() 81 | ## *********** First layer Conv 82 | model.add(Conv2D(filters, kernel_size=kernel_size, strides=(1, 1), 83 | input_shape=input_shape)) 84 | # model.add(BatchNormalization()) 85 | model.add(Activation('relu')) 86 | # model.add(LeakyReLU()) 87 | model.add(MaxPooling2D(1, 2)) 88 | model.output_shape 89 | 90 | ## ********* Classification layer 91 | model.add(Flatten()) 92 | model.add(Dense(dense_layer_sizes, activation='relu')) 93 | 94 | model.add(Dense(num_classes, activation='softmax')) 95 | model.output_shape 96 | 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | model.summary() 101 | return model 102 | dense_size_candidates = [64, 128, 512] 103 | my_classifier = KerasClassifier(make_model, batch_size=128) 104 | validator = GridSearchCV(my_classifier, 105 | param_grid={'dense_layer_sizes': dense_size_candidates, 106 | # epochs is avail for tuning even when not 107 | # an argument to model building function 108 | 'epochs': [25], 109 | 'filters': [8,16, 32, 64], 110 | 'kernel_size': [(1, 71)]}, 111 | scoring='neg_log_loss', 112 | n_jobs=1) 113 | validator.fit(x_train, y_train) 114 | import csv 115 | print('The parameters of the best model are: ') 116 | print(validator.best_params_) 117 | # write it in a excel file 118 | with open('results_runs50.csv', 'w') as csv_file: 119 | writer = csv.writer(csv_file) 120 | for key, value in validator.cv_results_.items(): 121 | writer.writerow([key, value]) 122 | # validator.best_estimator_ returns sklearn-wrapped version of best model. 
123 | # validator.best_estimator_.model returns the (unwrapped) keras model 124 | best_model = validator.best_estimator_.model 125 | metric_names = best_model.metrics_names 126 | metric_values = best_model.evaluate(x_test, y_test) 127 | for metric, value in zip(metric_names, metric_values): 128 | print(metric, ': ', value) 129 | -------------------------------------------------------------------------------- /Hyperparameter tuning/vanilla_CNN_33class_hyperparameters.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import GridSearchCV 13 | 14 | from sklearn.model_selection import train_test_split 15 | import collections 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | from keras.models import Sequential 19 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 20 | from keras.callbacks import EarlyStopping, ModelCheckpoint 21 | from keras.layers.normalization import BatchNormalization 22 | from keras.layers.advanced_activations import LeakyReLU 23 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 24 | from collections import Counter 25 | from keras.wrappers.scikit_learn import KerasClassifier 26 | import keras 27 | 28 | 29 | 30 | A = open('TCGA_new_pre_second.pckl', 'rb') 31 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 32 | project_ids_new] = pickle.load(A) 33 | A.close() 34 | 35 | f = open('TCGA_new_pre_first.pckl', 'rb') 36 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 37 | f.close() 38 | 39 | 40 | ## embedding labels 41 | # integer encode 42 | label_encoder = LabelEncoder() 43 | integer_encoded = label_encoder.fit_transform(project_ids_new) 44 | # binary encode 45 | onehot_encoder = OneHotEncoder(sparse=False) 46 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 47 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 48 | 49 | X_cancer_samples = dropped_genes_final.iloc[:, remain_cancer_ids_ind].T.values 50 | X_normal_samples = dropped_genes_final.iloc[:, remain_normal_ids_ind].T.values 51 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 52 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 53 | 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 55 | ## add nine zeros to the end of each sample 56 | # This line dimension needs to be changed for different kernel sizes 57 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 58 | 59 | ## This line is useful when only one fold training is needed 60 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 61 | stratify=onehot_encoded_cancer_samples, 62 | test_size=0.25, random_state=42) 63 | # adding one dimention to feed into CNN 64 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 65 | 66 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 67 | 
x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 68 | # X_normal_samples = X_normal_samples.reshape(X_normal_samples.shape[0], img_rows, img_cols, 1) 69 | x_train = x_train.astype('float32') 70 | x_test = x_test.astype('float32') 71 | 72 | 73 | ## Model 74 | 75 | def make_model(dense_layer_sizes, filters, kernel_size, stride): 76 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 77 | num_classes = len(y_train[0]) 78 | input_shape = (img_rows, img_cols, 1) 79 | 80 | model = Sequential() 81 | ## *********** First layer Conv 82 | model.add(Conv2D(filters, kernel_size=kernel_size, strides=stride, 83 | input_shape=input_shape)) 84 | # model.add(BatchNormalization()) 85 | model.add(Activation('relu')) 86 | # model.add(LeakyReLU()) 87 | model.add(MaxPooling2D(2, 2)) 88 | model.output_shape 89 | 90 | ## ********* Classification layer 91 | model.add(Flatten()) 92 | model.add(Dense(dense_layer_sizes, activation='relu')) 93 | 94 | model.add(Dense(num_classes, activation='softmax')) 95 | model.output_shape 96 | 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | model.summary() 101 | return model 102 | dense_size_candidates = [64, 128, 512] 103 | my_classifier = KerasClassifier(make_model, batch_size=128) 104 | validator = GridSearchCV(my_classifier, 105 | param_grid={'dense_layer_sizes': dense_size_candidates, 106 | # epochs is avail for tuning even when not 107 | # an argument to model building function 108 | 'epochs': [25], 109 | 'filters': [8, 16, 32, 64], 'stride': [(1, 1),(2, 2),(5, 5)], 110 | 'kernel_size': [(7, 7), (10, 10), (15, 15), (20, 20)]}, 111 | scoring='neg_log_loss', 112 | n_jobs=1) 113 | validator.fit(x_train, y_train) 114 | import csv 115 | print('The parameters of the best model are: ') 116 | print(validator.best_params_) 117 | # write the cross-validation results to a CSV file 118 | with open('results_runs50.csv', 'w') as csv_file: 119 | writer = csv.writer(csv_file) 120 | for key, value in validator.cv_results_.items(): 121 | writer.writerow([key, value]) 122 | # validator.best_estimator_ returns sklearn-wrapped version of best model. 123 | # validator.best_estimator_.model returns the (unwrapped) keras model 124 | best_model = validator.best_estimator_.model 125 | metric_names = best_model.metrics_names 126 | metric_values = best_model.evaluate(x_test, y_test) 127 | for metric, value in zip(metric_names, metric_values): 128 | print(metric, ': ', value) 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predicting all 33-cancer types and their normal tissues with CNN 2 | ![](image/git_models_fig.png) 3 | 4 | The folder ending in 33class contains models that classify only the 33 cancer tumor types. To see the impact of normal tissues on the classification, run the code in the 34class folder. The Hyperparameter tuning folder shows grid-search results for some of the hyperparameters of the 1D-CNN and Vanilla models. All code is written in Keras with a simple structure that helps the reader follow the modeling stage more easily. 5 | 6 | ## Background 7 | Precise prediction of cancer types is vital for cancer diagnosis and therapy. Important cancer marker genes can be inferred through predictive models.
Several studies have attempted to build machine learning models for this task; however, none has taken into consideration the tissue-of-origin effects that can potentially bias the identification of cancer markers. 8 | ## Results 9 | In this paper, we introduced several Convolutional Neural Network (CNN) models that take unstructured gene expression inputs to classify tumor and non-tumor samples into their designated cancer types or as normal. Based on different designs of gene embeddings and convolution schemes, we implemented three CNN models: 1D-CNN, 2D-Vanilla-CNN, and 2D-Hybrid-CNN. The models were trained and tested on a combined set of 10,340 samples covering 33 cancer types and 731 matched normal tissues from The Cancer Genome Atlas (TCGA). Our models achieved excellent prediction accuracies (93.9-95.0%) among 34 classes (33 cancers and normal). Furthermore, we interpreted the 1D-CNN model with a guided saliency technique and identified a total of 2,090 cancer markers (108 per class). The concordant differential expression of these markers between the cancer type they represent and the remaining types was confirmed. In breast cancer, for instance, our model identified well-known markers such as GATA3 and ESR1. Finally, we extended the 1D-CNN model to the prediction of breast cancer subtypes and achieved an average accuracy of 88.42% among 5 subtypes. 10 | ## Conclusions 11 | Here we present novel CNN designs for accurate and simultaneous cancer/normal and cancer-type prediction based on gene expression profiles, together with a unique model-interpretation scheme to elucidate the biological relevance of cancer marker genes after eliminating the effects of tissue of origin. The proposed models have few hyperparameters to train and can therefore be easily adapted to facilitate cancer diagnosis in the future. 12 | -------------------------------------------------------------------------------- /image/git_models_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenlabgccri/CancerTypePrediction/ed3991c960e29c071410db549960807dd6b6fe87/image/git_models_fig.png --------------------------------------------------------------------------------
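For readers who only want the gist of the architecture, below is a minimal, self-contained sketch of the 1D-CNN design used in `5cv_1D_CNN_33class.py` and `5cv_1D_CNN_34class.py`: each expression vector is padded with nine zeros, reshaped into a 71×100 matrix, and classified by a single convolutional layer whose (1, 71) kernel spans one row at a time. The random arrays are placeholders for the TCGA expression matrices stored in the repository's pickle files, and the sample and class counts here are stand-ins; the layer sequence and hyperparameters follow the scripts above (Keras 2.x API).

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Activation, Flatten, Dense

# Placeholder data: the real scripts load 10,340 TCGA samples from pickle files.
num_samples, num_classes = 256, 33
X = np.random.rand(num_samples, 7091).astype('float32')                 # expression vector per sample
X = np.concatenate([X, np.zeros((num_samples, 9), 'float32')], axis=1)  # pad to 7,100 = 71 * 100
X = X.reshape(-1, 71, 100, 1)                                           # 71 x 100 single-channel "image"
y = np.eye(num_classes)[np.random.randint(num_classes, size=num_samples)]  # one-hot labels

model = Sequential()
model.add(Conv2D(32, kernel_size=(1, 71), strides=(1, 1), input_shape=(71, 100, 1)))
model.add(Activation('relu'))
model.add(MaxPooling2D(1, 2))                                           # pool_size=1, stride=2, as in the scripts
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.fit(X, y, batch_size=128, epochs=2, verbose=1)
```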