├── .gitattributes ├── 5cv_33class ├── 5cv_1D_CNN_33class.py ├── 5cv_Vanilla_33class.py └── 5cv_hybrid_33class.py ├── 5cv_34class ├── 5cv_1D_CNN_34class.py ├── 5cv_Vanilla_34class.py └── 5cv_hybrid_34class.py ├── Hyperparameter tuning ├── 1d_CNN_33class_hyperparameters.py └── vanilla_CNN_33class_hyperparameters.py ├── README.md └── image └── git_models_fig.png /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /5cv_33class/5cv_1D_CNN_33class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential, Model 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | 24 | 25 | A = open('TCGA_new_pre_second.pckl', 'rb') 26 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 27 | project_ids_new] = pickle.load(A) 28 | A.close() 29 | 30 | f = open('TCGA_new_pre_first.pckl', 'rb') 31 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 32 | f.close() 33 | 34 | 35 | ## embedding labels 36 | # integer encode 37 | label_encoder = LabelEncoder() 38 | integer_encoded = label_encoder.fit_transform(project_ids_new) 39 | # binary encode 40 | onehot_encoder = OneHotEncoder(sparse=False) 41 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 42 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 43 | 44 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 45 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 46 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 47 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 48 | 49 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 50 | ## add nine zeros to the end of each sample 51 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 52 | 53 | ## This line is useful when only one fold training is needed 54 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 55 | stratify= onehot_encoded_cancer_samples, 56 | test_size=0.25, random_state=42) 57 | 58 | 59 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 60 | num_classes = len(y_train[0]) 61 | batch_size = 128 62 | epochs = 20 63 | seed = 7 64 | np.random.seed(seed) 65 | 66 | 
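## `train_labels` (used just below) is not defined anywhere in this script; a plausible
## definition -- an assumption here, mirroring how the 34-class scripts build their labels --
## is the cancer-type project ID of each tumor sample:
train_labels = project_ids_new[remain_cancer_ids_ind]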
67 | input_Xs = X_cancer_samples_mat 68 | y_s = train_labels 69 | 70 | 71 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 72 | cvscores = [] 73 | 74 | for j in range(10): 75 | i = 0 76 | for train, test in kfold.split(input_Xs, y_s): 77 | 78 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 79 | input_shape = (img_rows, img_cols, 1) 80 | input_Xs = input_Xs.astype('float32') 81 | 82 | label_encoder = LabelEncoder() 83 | integer_encoded = label_encoder.fit_transform(y_s) 84 | # binary encode 85 | onehot_encoder = OneHotEncoder(sparse=False) 86 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 87 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 88 | num_classes = len(onehot_encoded[0]) 89 | 90 | model = Sequential() 91 | ## *********** First layer Conv 92 | model.add(Conv2D(32, kernel_size=(1, 71), strides=(1, 1), 93 | input_shape=input_shape)) 94 | model.add(Activation('relu')) 95 | model.add(MaxPooling2D(1, 2)) 96 | ## ********* Classification layer 97 | model.add(Flatten()) 98 | model.add(Dense(128, activation='relu')) 99 | model.add(Dense(num_classes, activation='softmax')) 100 | model.compile(loss='categorical_crossentropy', 101 | optimizer='adam', 102 | metrics=['categorical_accuracy']) 103 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 104 | if i==0: 105 | model.summary() 106 | i = i +1 107 | history = model.fit(input_Xs[train], onehot_encoded[train], 108 | batch_size=batch_size, 109 | epochs=epochs, 110 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 111 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 112 | # print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 113 | cvscores.append(scores[1] * 100) 114 | 115 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 116 | -------------------------------------------------------------------------------- /5cv_33class/5cv_Vanilla_33class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 
4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential, Model 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | 24 | 25 | A = open('TCGA_new_pre_second.pckl', 'rb') 26 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 27 | project_ids_new] = pickle.load(A) 28 | A.close() 29 | 30 | f = open('TCGA_new_pre_first.pckl', 'rb') 31 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 32 | f.close() 33 | 34 | 35 | ## embedding labels 36 | # integer encode 37 | label_encoder = LabelEncoder() 38 | integer_encoded = label_encoder.fit_transform(project_ids_new) 39 | # binary encode 40 | onehot_encoder = OneHotEncoder(sparse=False) 41 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 42 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 43 | 44 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 45 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 46 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 47 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 48 | 49 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 50 | ## add nine zeros to the end of each sample 51 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 52 | 53 | ## This line is useful when only one fold training is needed 54 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 55 | stratify= onehot_encoded_cancer_samples, 56 | test_size=0.25, random_state=42) 57 | 58 | 59 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 60 | num_classes = len(y_train[0]) 61 | batch_size = 128 62 | epochs = 20 63 | # fix random seed for reproducibility 64 | seed = 7 65 | np.random.seed(seed) 66 | 67 | 68 | input_Xs = X_cancer_samples_mat 69 | y_s = train_labels 70 | 71 | 72 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 73 | cvscores = [] 74 | 75 | for j in range(10): 76 | i = 0 77 | for train, test in kfold.split(input_Xs, y_s): 78 | 79 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 80 | input_shape = (img_rows, img_cols, 1) 81 | input_Xs = input_Xs.astype('float32') 82 | 83 | label_encoder = LabelEncoder() 84 | integer_encoded = label_encoder.fit_transform(y_s) 85 | # binary encode 86 | onehot_encoder = OneHotEncoder(sparse=False) 87 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 88 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 89 | num_classes = len(onehot_encoded[0]) 90 | 91 | model = Sequential() 92 | ## *********** First layer Conv 93 | 
model.add(Conv2D(32, kernel_size=(10, 10), strides=(1, 1), 94 | input_shape=input_shape)) 95 | model.add(Activation('relu')) 96 | model.add(MaxPooling2D(2, 2)) 97 | ## ********* Classification layer 98 | model.add(Flatten()) 99 | model.add(Dense(128, activation='relu')) 100 | model.add(Dense(num_classes, activation='softmax')) 101 | model.compile(loss='categorical_crossentropy', 102 | optimizer='adam', 103 | metrics=['categorical_accuracy']) 104 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 105 | if i==0: 106 | model.summary() 107 | i = i +1 108 | history = model.fit(input_Xs[train], onehot_encoded[train], 109 | batch_size=batch_size, 110 | epochs=epochs, 111 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 112 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 113 | # print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 114 | cvscores.append(scores[1] * 100) 115 | 116 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 117 | -------------------------------------------------------------------------------- /5cv_33class/5cv_hybrid_33class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | import keras 17 | from keras.models import Sequential, Model 18 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 19 | from keras.callbacks import EarlyStopping, ModelCheckpoint 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.layers.advanced_activations import LeakyReLU 22 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 23 | from sklearn.model_selection import StratifiedKFold 24 | 25 | 26 | 27 | 28 | 29 | A = open('TCGA_new_pre_second.pckl', 'rb') 30 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 31 | project_ids_new] = pickle.load(A) 32 | A.close() 33 | 34 | f = open('TCGA_new_pre_first.pckl', 'rb') 35 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 36 | f.close() 37 | 38 | 39 | ## embedding labels 40 | # integer encode 41 | label_encoder = LabelEncoder() 42 | integer_encoded = label_encoder.fit_transform(project_ids_new) 43 | # print(integer_encoded) 44 | # binary encode 45 | onehot_encoder = OneHotEncoder(sparse=False) 46 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 47 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 48 | 49 | 50 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 51 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 52 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 53 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 54 | 55 | X_cancer_samples_mat = 
np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 56 | ## add nine zeros to the end of each sample 57 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 58 | 59 | ## This line is useful when only one fold training is needed 60 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 61 | stratify= onehot_encoded_cancer_samples, 62 | test_size=0.25, random_state=42) 63 | 64 | 65 | img_rows, img_cols = len(x_test[0][0]), len(x_test[0]) 66 | num_classes = len(y_train[0]) 67 | batch_size = 128 68 | epochs = 20 69 | seed = 7 70 | np.random.seed(seed) 71 | 72 | 73 | input_Xs = X_cancer_samples_mat 74 | y_s = train_labels 75 | 76 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 77 | cvscores = [] 78 | 79 | for j in range(10): 80 | i = 0 81 | for train, test in kfold.split(input_Xs, y_s): 82 | 83 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 84 | input_shape = (img_rows, img_cols, 1) 85 | input_Xs = input_Xs.astype('float32') 86 | input_img = Input(input_shape) 87 | label_encoder = LabelEncoder() 88 | integer_encoded = label_encoder.fit_transform(y_s) 89 | # binary encode 90 | onehot_encoder = OneHotEncoder(sparse=False) 91 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 92 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 93 | num_classes = len(onehot_encoded[0]) 94 | 95 | tower_1 = Conv2D(32, (1, 71), activation='relu')(input_img) 96 | tower_1 = MaxPooling2D(1, 2)(tower_1) 97 | tower_1 = Flatten()(tower_1) 98 | 99 | tower_2 = Conv2D(32, (100, 1), activation='relu')(input_img) 100 | tower_2 = MaxPooling2D(1, 2)(tower_2) 101 | tower_2 = Flatten()(tower_2) 102 | 103 | output = keras.layers.concatenate([tower_1, tower_2], axis=1) 104 | out1 = Dense(128, activation='relu')(output) 105 | last_layer = Dense(num_classes, activation='softmax')(out1) 106 | model = Model(input=[input_img], output=last_layer) 107 | model.output_shape 108 | 109 | model.compile(loss='categorical_crossentropy', 110 | optimizer='adam', 111 | metrics=['categorical_accuracy']) 112 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 113 | if i==0: 114 | model.summary() 115 | i = i +1 116 | history = model.fit(input_Xs[train], onehot_encoded[train], 117 | batch_size=batch_size, 118 | epochs=epochs, 119 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 120 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 121 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 122 | cvscores.append(scores[1] * 100) 123 | 124 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 125 | -------------------------------------------------------------------------------- /5cv_34class/5cv_1D_CNN_34class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 
4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | from collections import Counter 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | A = open('TCGA_new_pre_second.pckl', 'rb') 32 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 33 | project_ids_new] = pickle.load(A) 34 | A.close() 35 | 36 | f = open('TCGA_new_pre_first.pckl', 'rb') 37 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 38 | f.close() 39 | 40 | batch_size = 128 41 | epochs = 50 42 | seed = 7 43 | np.random.seed(seed) 44 | 45 | 46 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 47 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 48 | 49 | name_cancer_samples = project_ids_new[remain_cancer_ids_ind] 50 | name_normal_samples = ['Normal Samples'] *len(X_normal_samples) 51 | 52 | X_cancer_samples_34 = np.concatenate((X_cancer_samples,X_normal_samples)) 53 | X_names = np.concatenate((name_cancer_samples,name_normal_samples)) 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples_34,np.zeros((len(X_cancer_samples_34),9))),axis=1) 55 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 56 | 57 | 58 | 59 | 60 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 61 | cvscores = [] 62 | cv_yscores = [] 63 | Y_test =[] 64 | 65 | input_Xs = X_cancer_samples_mat 66 | y_s = X_names 67 | 68 | img_rows, img_cols = len(input_Xs[0][0]), len(input_Xs[0]) 69 | num_classes = len(set(y_s)) 70 | 71 | label_encoder = LabelEncoder() 72 | integer_encoded = label_encoder.fit_transform(y_s) 73 | # binary encode 74 | onehot_encoder = OneHotEncoder(sparse=False) 75 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 76 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 77 | 78 | i = 0 79 | for train, test in kfold.split(X_cancer_samples_34, y_s): # input_Xs in normal case and shuffled should be shuffled_Xs 80 | 81 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 82 | input_shape = (img_rows, img_cols, 1) 83 | input_Xs = input_Xs.astype('float32') 84 | 85 | num_classes = len(onehot_encoded[0]) 86 | 87 | model = Sequential() 88 | ## *********** First layer Conv 89 | model.add(Conv2D(32, kernel_size=(1, 71), strides=(1, 1), 90 | input_shape=input_shape)) 91 | model.add(Activation('relu')) 92 | model.add(MaxPooling2D(1, 2)) 93 | ## ********* Classification layer 94 | model.add(Flatten()) 95 | model.add(Dense(128, activation='relu')) 96 | model.add(Dense(num_classes, activation='softmax')) 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | 
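## EarlyStopping below monitors the training-set categorical accuracy (not 'val_categorical_accuracy')
## and stops training once it fails to improve for 3 consecutive epochs.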
callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 101 | if i==0: 102 | model.summary() 103 | i = i +1 104 | history = model.fit(input_Xs[train], onehot_encoded[train], 105 | batch_size=batch_size, 106 | epochs=epochs, 107 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 108 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 109 | y_score = model.predict(input_Xs[test]) 110 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 111 | 112 | cvscores.append(scores[1] * 100) 113 | Y_test.append(onehot_encoded[test]) 114 | cv_yscores.append(y_score) 115 | 116 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 117 | cv_yscores = np.concatenate(cv_yscores) 118 | Y_test = np.concatenate(Y_test) 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /5cv_34class/5cv_Vanilla_34class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | from keras.models import Sequential 17 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.layers.advanced_activations import LeakyReLU 21 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 22 | from sklearn.model_selection import StratifiedKFold 23 | from collections import Counter 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | A = open('TCGA_new_pre_second.pckl', 'rb') 32 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 33 | project_ids_new] = pickle.load(A) 34 | A.close() 35 | 36 | f = open('TCGA_new_pre_first.pckl', 'rb') 37 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 38 | f.close() 39 | 40 | batch_size = 128 41 | epochs = 50 42 | seed = 7 43 | np.random.seed(seed) 44 | 45 | 46 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 47 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 48 | 49 | name_cancer_samples = project_ids_new[remain_cancer_ids_ind] 50 | name_normal_samples = ['Normal Samples'] *len(X_normal_samples) 51 | 52 | X_cancer_samples_34 = np.concatenate((X_cancer_samples,X_normal_samples)) 53 | X_names = np.concatenate((name_cancer_samples,name_normal_samples)) 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples_34,np.zeros((len(X_cancer_samples_34),9))),axis=1) 55 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 56 | 57 | 58 | 59 | 60 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 61 | cvscores = [] 62 | cv_yscores = [] 63 | Y_test =[] 64 | 65 | input_Xs = X_cancer_samples_mat 66 | y_s = X_names 67 | 68 | img_rows, img_cols = len(input_Xs[0][0]), 
len(input_Xs[0]) 69 | num_classes = len(set(y_s)) 70 | 71 | label_encoder = LabelEncoder() 72 | integer_encoded = label_encoder.fit_transform(y_s) 73 | # binary encode 74 | onehot_encoder = OneHotEncoder(sparse=False) 75 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 76 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 77 | 78 | i = 0 79 | for train, test in kfold.split(X_cancer_samples_34, y_s): # input_Xs in normal case and shuffled should be shuffled_Xs 80 | 81 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 82 | input_shape = (img_rows, img_cols, 1) 83 | input_Xs = input_Xs.astype('float32') 84 | 85 | num_classes = len(onehot_encoded[0]) 86 | 87 | model = Sequential() 88 | ## *********** First layer Conv 89 | model.add(Conv2D(32, kernel_size=(10, 10), strides=(2, 2), 90 | input_shape=input_shape)) 91 | model.add(Activation('relu')) 92 | model.add(MaxPooling2D(2, 2)) 93 | ## ********* Classification layer 94 | model.add(Flatten()) 95 | model.add(Dense(128, activation='relu')) 96 | model.add(Dense(num_classes, activation='softmax')) 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 101 | if i==0: 102 | model.summary() 103 | i = i +1 104 | history = model.fit(input_Xs[train], onehot_encoded[train], 105 | batch_size=batch_size, 106 | epochs=epochs, 107 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 108 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 109 | y_score = model.predict(input_Xs[test]) 110 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 111 | 112 | cvscores.append(scores[1] * 100) 113 | Y_test.append(onehot_encoded[test]) 114 | cv_yscores.append(y_score) 115 | 116 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 117 | cv_yscores = np.concatenate(cv_yscores) 118 | Y_test = np.concatenate(Y_test) 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /5cv_34class/5cv_hybrid_34class.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 
4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | import collections 14 | import matplotlib.pyplot as plt 15 | import pandas as pd 16 | import keras 17 | from keras.models import Sequential, Model 18 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, Input 19 | from keras.callbacks import EarlyStopping, ModelCheckpoint 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.layers.advanced_activations import LeakyReLU 22 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 23 | from sklearn.model_selection import StratifiedKFold 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | A = open('TCGA_new_pre_second.pckl', 'rb') 32 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 33 | project_ids_new] = pickle.load(A) 34 | A.close() 35 | 36 | f = open('TCGA_new_pre_first.pckl', 'rb') 37 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 38 | f.close() 39 | 40 | batch_size = 128 41 | epochs = 50 42 | seed = 7 43 | np.random.seed(seed) 44 | 45 | 46 | X_cancer_samples =dropped_genes_final.iloc[:,remain_cancer_ids_ind].T.values 47 | X_normal_samples = dropped_genes_final.iloc[:,remain_normal_ids_ind].T.values 48 | 49 | name_cancer_samples = project_ids_new[remain_cancer_ids_ind] 50 | name_normal_samples = ['Normal Samples'] *len(X_normal_samples) 51 | 52 | X_cancer_samples_34 = np.concatenate((X_cancer_samples,X_normal_samples)) 53 | X_names = np.concatenate((name_cancer_samples,name_normal_samples)) 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples_34,np.zeros((len(X_cancer_samples_34),9))),axis=1) 55 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 56 | 57 | 58 | 59 | 60 | kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed) 61 | cvscores = [] 62 | cv_yscores = [] 63 | Y_test =[] 64 | 65 | input_Xs = X_cancer_samples_mat 66 | y_s = X_names 67 | 68 | img_rows, img_cols = len(input_Xs[0][0]), len(input_Xs[0]) 69 | num_classes = len(set(y_s)) 70 | 71 | label_encoder = LabelEncoder() 72 | integer_encoded = label_encoder.fit_transform(y_s) 73 | # binary encode 74 | onehot_encoder = OneHotEncoder(sparse=False) 75 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 76 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 77 | 78 | i = 0 79 | 80 | for train, test in kfold.split(input_Xs, y_s): # input_Xs in normal case and shuffled should be shuffled_Xs 81 | 82 | input_Xs = input_Xs.reshape(input_Xs.shape[0], img_rows, img_cols, 1) 83 | input_shape = (img_rows, img_cols, 1) 84 | input_Xs = input_Xs.astype('float32') 85 | input_img = Input(input_shape) 86 | label_encoder = LabelEncoder() 87 | integer_encoded = label_encoder.fit_transform(y_s) 88 | # binary encode 89 | onehot_encoder = OneHotEncoder(sparse=False) 90 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 91 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 92 | num_classes = len(onehot_encoded[0]) 93 | 94 | tower_1 = Conv2D(32, (1, 71), activation='relu')(input_img) 95 | # tower_1 = Conv2D(64, (3, 3), padding='same', activation='relu')(tower_1) 96 | tower_1 = MaxPooling2D(1, 2)(tower_1) 97 | tower_1 
= Flatten()(tower_1) 98 | 99 | tower_2 = Conv2D(32, (100, 1), activation='relu')(input_img) 100 | # tower_2 = Conv2D(64, (5, 5), padding='same', activation='relu')(tower_2) 101 | tower_2 = MaxPooling2D(1, 2)(tower_2) 102 | tower_2 = Flatten()(tower_2) 103 | 104 | 105 | output = keras.layers.concatenate([tower_1, tower_2], axis=1) 106 | 107 | out1 = Dense(128, activation='relu')(output) 108 | last_layer = Dense(num_classes, activation='softmax')(out1) 109 | 110 | model = Model(input=[input_img], output=last_layer) 111 | model.output_shape 112 | 113 | model.compile(loss='categorical_crossentropy', 114 | optimizer='adam', 115 | metrics=['categorical_accuracy']) 116 | callbacks = [EarlyStopping(monitor='categorical_accuracy', patience=3, verbose=0)] 117 | if i==0: 118 | model.summary() 119 | i = i +1 120 | history = model.fit(input_Xs[train], onehot_encoded[train], 121 | batch_size=batch_size, 122 | epochs=epochs, 123 | verbose=0, callbacks=callbacks, validation_data=(input_Xs[test], onehot_encoded[test])) 124 | scores = model.evaluate(input_Xs[test], onehot_encoded[test], verbose=0) 125 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) 126 | cvscores.append(scores[1] * 100) 127 | 128 | print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) 129 | -------------------------------------------------------------------------------- /Hyperparameter tuning/1d_CNN_33class_hyperparameters.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import GridSearchCV 13 | 14 | from sklearn.model_selection import train_test_split 15 | import collections 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | from keras.models import Sequential 19 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 20 | from keras.callbacks import EarlyStopping, ModelCheckpoint 21 | from keras.layers.normalization import BatchNormalization 22 | from keras.layers.advanced_activations import LeakyReLU 23 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 24 | from collections import Counter 25 | from keras.wrappers.scikit_learn import KerasClassifier 26 | import keras 27 | 28 | 29 | 30 | A = open('TCGA_new_pre_second.pckl', 'rb') 31 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 32 | project_ids_new] = pickle.load(A) 33 | A.close() 34 | 35 | f = open('TCGA_new_pre_first.pckl', 'rb') 36 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 37 | f.close() 38 | 39 | 40 | ## embedding labels 41 | # integer encode 42 | label_encoder = LabelEncoder() 43 | integer_encoded = label_encoder.fit_transform(project_ids_new) 44 | # binary encode 45 | onehot_encoder = OneHotEncoder(sparse=False) 46 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 47 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 48 | 49 | X_cancer_samples = dropped_genes_final.iloc[:, remain_cancer_ids_ind].T.values 50 | X_normal_samples = 
dropped_genes_final.iloc[:, remain_normal_ids_ind].T.values 51 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 52 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 53 | 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 55 | ## add nine zeros to the end of each sample 56 | # This line dimension needs to be changed for different kernel sizes 57 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 58 | 59 | ## This line is useful when only one fold training is needed 60 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 61 | stratify=onehot_encoded_cancer_samples, 62 | test_size=0.25, random_state=42) 63 | # adding one dimention to feed into CNN 64 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 65 | 66 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 67 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 68 | # X_normal_samples = X_normal_samples.reshape(X_normal_samples.shape[0], img_rows, img_cols, 1) 69 | x_train = x_train.astype('float32') 70 | x_test = x_test.astype('float32') 71 | 72 | 73 | ## Model 74 | 75 | def make_model(dense_layer_sizes, filters, kernel_size): 76 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 77 | num_classes = len(y_train[0]) 78 | input_shape = (img_rows, img_cols, 1) 79 | 80 | model = Sequential() 81 | ## *********** First layer Conv 82 | model.add(Conv2D(filters, kernel_size=kernel_size, strides=(1, 1), 83 | input_shape=input_shape)) 84 | # model.add(BatchNormalization()) 85 | model.add(Activation('relu')) 86 | # model.add(LeakyReLU()) 87 | model.add(MaxPooling2D(1, 2)) 88 | model.output_shape 89 | 90 | ## ********* Classification layer 91 | model.add(Flatten()) 92 | model.add(Dense(dense_layer_sizes, activation='relu')) 93 | 94 | model.add(Dense(num_classes, activation='softmax')) 95 | model.output_shape 96 | 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | model.summary() 101 | return model 102 | dense_size_candidates = [64, 128, 512] 103 | my_classifier = KerasClassifier(make_model, batch_size=128) 104 | validator = GridSearchCV(my_classifier, 105 | param_grid={'dense_layer_sizes': dense_size_candidates, 106 | # epochs is avail for tuning even when not 107 | # an argument to model building function 108 | 'epochs': [25], 109 | 'filters': [8,16, 32, 64], 110 | 'kernel_size': [(1, 71)]}, 111 | scoring='neg_log_loss', 112 | n_jobs=1) 113 | validator.fit(x_train, y_train) 114 | import csv 115 | print('The parameters of the best model are: ') 116 | print(validator.best_params_) 117 | # write it in a excel file 118 | with open('results_runs50.csv', 'w') as csv_file: 119 | writer = csv.writer(csv_file) 120 | for key, value in validator.cv_results_.items(): 121 | writer.writerow([key, value]) 122 | # validator.best_estimator_ returns sklearn-wrapped version of best model. 
123 | # validator.best_estimator_.model returns the (unwrapped) keras model 124 | best_model = validator.best_estimator_.model 125 | metric_names = best_model.metrics_names 126 | metric_values = best_model.evaluate(x_test, y_test) 127 | for metric, value in zip(metric_names, metric_values): 128 | print(metric, ': ', value) 129 | -------------------------------------------------------------------------------- /Hyperparameter tuning/vanilla_CNN_33class_hyperparameters.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is written by Milad Mostavi, one of authors of 3 | "Convolutional neural network models for cancer type prediction based on gene expression" paper. 4 | Please cite this paper in the case it was useful in your research 5 | ''' 6 | import pickle 7 | from numpy import array 8 | from numpy import argmax 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OneHotEncoder 11 | import numpy as np 12 | from sklearn.model_selection import GridSearchCV 13 | 14 | from sklearn.model_selection import train_test_split 15 | import collections 16 | import matplotlib.pyplot as plt 17 | import pandas as pd 18 | from keras.models import Sequential 19 | from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Activation, Flatten 20 | from keras.callbacks import EarlyStopping, ModelCheckpoint 21 | from keras.layers.normalization import BatchNormalization 22 | from keras.layers.advanced_activations import LeakyReLU 23 | from sklearn.metrics import precision_recall_curve, roc_curve, auc, average_precision_score 24 | from collections import Counter 25 | from keras.wrappers.scikit_learn import KerasClassifier 26 | import keras 27 | 28 | 29 | 30 | A = open('TCGA_new_pre_second.pckl', 'rb') 31 | [dropped_genes_final, dropped_gene_name, dropped_Ens_id, samp_id_new, diag_name_new, 32 | project_ids_new] = pickle.load(A) 33 | A.close() 34 | 35 | f = open('TCGA_new_pre_first.pckl', 'rb') 36 | [_, _, _, _, remain_cancer_ids_ind, remain_normal_ids_ind] = pickle.load(f) 37 | f.close() 38 | 39 | 40 | ## embedding labels 41 | # integer encode 42 | label_encoder = LabelEncoder() 43 | integer_encoded = label_encoder.fit_transform(project_ids_new) 44 | # binary encode 45 | onehot_encoder = OneHotEncoder(sparse=False) 46 | integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) 47 | onehot_encoded = onehot_encoder.fit_transform(integer_encoded) 48 | 49 | X_cancer_samples = dropped_genes_final.iloc[:, remain_cancer_ids_ind].T.values 50 | X_normal_samples = dropped_genes_final.iloc[:, remain_normal_ids_ind].T.values 51 | onehot_encoded_cancer_samples = onehot_encoded[remain_cancer_ids_ind] 52 | onehot_encoded_normal_samples = onehot_encoded[remain_normal_ids_ind] 53 | 54 | X_cancer_samples_mat = np.concatenate((X_cancer_samples,np.zeros((len(X_cancer_samples),9))),axis=1) 55 | ## add nine zeros to the end of each sample 56 | # This line dimension needs to be changed for different kernel sizes 57 | X_cancer_samples_mat = np.reshape(X_cancer_samples_mat, (-1, 71, 100)) 58 | 59 | ## This line is useful when only one fold training is needed 60 | x_train, x_test, y_train, y_test = train_test_split(X_cancer_samples_mat, onehot_encoded_cancer_samples, 61 | stratify=onehot_encoded_cancer_samples, 62 | test_size=0.25, random_state=42) 63 | # adding one dimention to feed into CNN 64 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 65 | 66 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 67 | 
x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 68 | # X_normal_samples = X_normal_samples.reshape(X_normal_samples.shape[0], img_rows, img_cols, 1) 69 | x_train = x_train.astype('float32') 70 | x_test = x_test.astype('float32') 71 | 72 | 73 | ## Model 74 | 75 | def make_model(dense_layer_sizes, filters, kernel_size, stride): 76 | img_rows, img_cols = len(x_test[0]), len(x_test[0][0]) 77 | num_classes = len(y_train[0]) 78 | input_shape = (img_rows, img_cols, 1) 79 | 80 | model = Sequential() 81 | ## *********** First layer Conv 82 | model.add(Conv2D(filters, kernel_size=kernel_size, strides=stride, 83 | input_shape=input_shape)) 84 | # model.add(BatchNormalization()) 85 | model.add(Activation('relu')) 86 | # model.add(LeakyReLU()) 87 | model.add(MaxPooling2D(2, 2)) 88 | model.output_shape 89 | 90 | ## ********* Classification layer 91 | model.add(Flatten()) 92 | model.add(Dense(dense_layer_sizes, activation='relu')) 93 | 94 | model.add(Dense(num_classes, activation='softmax')) 95 | model.output_shape 96 | 97 | model.compile(loss='categorical_crossentropy', 98 | optimizer='adam', 99 | metrics=['categorical_accuracy']) 100 | model.summary() 101 | return model 102 | dense_size_candidates = [64, 128, 512] 103 | my_classifier = KerasClassifier(make_model, batch_size=128) 104 | validator = GridSearchCV(my_classifier, 105 | param_grid={'dense_layer_sizes': dense_size_candidates, 106 | # epochs is avail for tuning even when not 107 | # an argument to model building function 108 | 'epochs': [25], 109 | 'filters': [8, 16, 32, 64], 'stride': [(1, 1),(2, 2),(5, 5)], 110 | 'kernel_size': [(7, 7), (10, 10), (15, 15), (20, 20)]}, 111 | scoring='neg_log_loss', 112 | n_jobs=1) 113 | validator.fit(x_train, y_train) 114 | import csv 115 | print('The parameters of the best model are: ') 116 | print(validator.best_params_) 117 | # write the cross-validation results to a CSV file 118 | with open('results_runs50.csv', 'w') as csv_file: 119 | writer = csv.writer(csv_file) 120 | for key, value in validator.cv_results_.items(): 121 | writer.writerow([key, value]) 122 | # validator.best_estimator_ returns sklearn-wrapped version of best model. 123 | # validator.best_estimator_.model returns the (unwrapped) keras model 124 | best_model = validator.best_estimator_.model 125 | metric_names = best_model.metrics_names 126 | metric_values = best_model.evaluate(x_test, y_test) 127 | for metric, value in zip(metric_names, metric_values): 128 | print(metric, ': ', value) 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predicting all 33-cancer types and their normal tissues with CNN 2 | ![](image/git_models_fig.png) 3 | 4 | The folder ending in 33class contains models that classify only the 33 cancer tumor types. To see the impact of normal tissues on the classification, run the code in the 34class folder. The Hyperparameter tuning folder shows grid-search results for some of the hyperparameters of the 1D-CNN and Vanilla models. All code is written in Keras with a simple structure that helps the reader follow the modeling stage more easily. 5 | 6 | ## Background 7 | Precise prediction of cancer types is vital for cancer diagnosis and therapy. Important cancer marker genes can be inferred through predictive models.
Several studies have attempted to build machine learning models for this task; however, none has taken into consideration the tissue-of-origin effects that can potentially bias the identification of cancer markers. 8 | ## Results 9 | In this paper, we introduced several Convolutional Neural Network (CNN) models that take unstructured gene expression inputs to classify tumor and non-tumor samples into their designated cancer types or as normal. Based on different designs of gene embeddings and convolution schemes, we implemented three CNN models: 1D-CNN, 2D-Vanilla-CNN, and 2D-Hybrid-CNN. The models were trained and tested on a combined set of 10,340 samples covering 33 cancer types and 731 matched normal tissues from The Cancer Genome Atlas (TCGA). Our models achieved excellent prediction accuracies (93.9-95.0%) among 34 classes (33 cancers and normal). Furthermore, we interpreted the 1D-CNN model with a guided saliency technique and identified a total of 2,090 cancer markers (108 per class). The concordant differential expression of these markers between the cancer type they represent and the remaining types was confirmed. In breast cancer, for instance, our model identified well-known markers such as GATA3 and ESR1. Finally, we extended the 1D-CNN model to the prediction of breast cancer subtypes and achieved an average accuracy of 88.42% among 5 subtypes. 10 | ## Conclusions 11 | Here we present novel CNN designs for accurate and simultaneous cancer/normal and cancer-type prediction based on gene expression profiles, together with a unique model-interpretation scheme to elucidate the biological relevance of cancer marker genes after eliminating the effects of tissue of origin. The proposed models have few hyperparameters to train and can therefore be easily adapted to facilitate cancer diagnosis in the future. 12 | -------------------------------------------------------------------------------- /image/git_models_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenlabgccri/CancerTypePrediction/ed3991c960e29c071410db549960807dd6b6fe87/image/git_models_fig.png --------------------------------------------------------------------------------
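For readers who only want the gist of the architecture, below is a minimal, self-contained sketch of the 1D-CNN design used in `5cv_1D_CNN_33class.py` and `5cv_1D_CNN_34class.py`: each expression vector is padded with nine zeros, reshaped into a 71×100 matrix, and classified by a single convolutional layer whose (1, 71) kernel spans one row at a time. The random arrays are placeholders for the TCGA expression matrices stored in the repository's pickle files, and the sample and class counts here are stand-ins; the layer sequence and hyperparameters follow the scripts above (Keras 2.x API).

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Activation, Flatten, Dense

# Placeholder data: the real scripts load 10,340 TCGA samples from pickle files.
num_samples, num_classes = 256, 33
X = np.random.rand(num_samples, 7091).astype('float32')                 # expression vector per sample
X = np.concatenate([X, np.zeros((num_samples, 9), 'float32')], axis=1)  # pad to 7,100 = 71 * 100
X = X.reshape(-1, 71, 100, 1)                                           # 71 x 100 single-channel "image"
y = np.eye(num_classes)[np.random.randint(num_classes, size=num_samples)]  # one-hot labels

model = Sequential()
model.add(Conv2D(32, kernel_size=(1, 71), strides=(1, 1), input_shape=(71, 100, 1)))
model.add(Activation('relu'))
model.add(MaxPooling2D(1, 2))                                           # pool_size=1, stride=2, as in the scripts
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.fit(X, y, batch_size=128, epochs=2, verbose=1)
```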